Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 24 Aug 2011 16:11:08 +0000 (09:11 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 24 Aug 2011 16:11:08 +0000 (09:11 -0700)
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (29 commits)
  bridge: fix a possible net_device leak
  net: Documentation: RFC 2553bis is now RFC 3493
  atm: br2684: Fix oops due to skb->dev being NULL
  ipv6: Fix ipv6_getsockopt for IPV6_2292PKTOPTIONS
  net: netdev-features.txt update to Documentation/networking/00-INDEX
  vlan: reset headers on accel emulation path
  forcedeth: call vlan_mode only if hw supports vlans
  via-velocity: remove non-tagged packet filtering
  bonding:reset backup and inactive flag of slave
  net_sched: fix port mirror/redirect stats reporting
  sit tunnels: propagate IPv6 transport class to IPv4 Type of Service
  gianfar: reduce stack usage in gianfar_ethtool.c
  net: minor update to Documentation/networking/scaling.txt
  net: add missing entries to Documentation/networking/00-INDEX
  gianfar: prevent buggy hw rx vlan tagging
  net: sh_eth: Fix build by forgot including linux/interrupt.h
  drivers/net/can/sja1000/plx_pci.c: eliminate double free
  usbnet/cdc_ncm: Don't use stack variables for DMA
  vmxnet3: Don't enable vlan filters in promiscuous mode.
  iwlagn: sysfs couldn't find the priv pointer
  ...

328 files changed:
Documentation/00-INDEX
Documentation/PCI/MSI-HOWTO.txt
Documentation/SubmittingDrivers
Documentation/SubmittingPatches
Documentation/block/cfq-iosched.txt
Documentation/email-clients.txt
Documentation/filesystems/befs.txt
Documentation/kernel-docs.txt
Documentation/kernel-parameters.txt
Documentation/ramoops.txt [new file with mode: 0644]
Documentation/virtual/00-INDEX
Documentation/virtual/lguest/lguest.c
Documentation/virtual/virtio-spec.txt [new file with mode: 0644]
MAINTAINERS
Makefile
arch/ia64/Kconfig
arch/ia64/configs/generic_defconfig
arch/m68k/include/asm/page_mm.h
arch/sparc/Kconfig
arch/sparc/include/asm/spinlock_32.h
arch/sparc/include/asm/spinlock_64.h
arch/sparc/kernel/pcic.c
arch/x86/include/asm/xen/page.h
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kvm/Kconfig
arch/x86/mm/fault.c
arch/x86/pci/acpi.c
arch/x86/platform/olpc/olpc.c
arch/x86/vdso/vdso32/sysenter.S
arch/x86/xen/Makefile
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/smp.c
block/Kconfig
block/Makefile
block/blk-core.c
block/blk-flush.c
block/blk-softirq.c
block/blk-throttle.c
block/blk.h
block/bsg-lib.c [new file with mode: 0644]
block/cfq-iosched.c
block/genhd.c
drivers/ata/Kconfig
drivers/ata/Makefile
drivers/ata/pata_imx.c [new file with mode: 0644]
drivers/ata/pata_via.c
drivers/ata/sata_dwc_460ex.c
drivers/ata/sata_sil.c
drivers/base/power/domain.c
drivers/base/regmap/regmap-i2c.c
drivers/base/regmap/regmap-spi.c
drivers/base/regmap/regmap.c
drivers/block/Kconfig
drivers/block/drbd/drbd_nl.c
drivers/block/loop.c
drivers/block/swim3.c
drivers/block/xen-blkfront.c
drivers/cdrom/cdrom.c
drivers/edac/i7core_edac.c
drivers/firewire/core-cdev.c
drivers/firewire/core-device.c
drivers/firewire/ohci.c
drivers/gpu/drm/i915/i915_debugfs.c
drivers/gpu/drm/i915/i915_drv.h
drivers/gpu/drm/i915/i915_irq.c
drivers/gpu/drm/i915/i915_reg.h
drivers/gpu/drm/i915/i915_suspend.c
drivers/gpu/drm/i915/intel_display.c
drivers/gpu/drm/i915/intel_dp.c
drivers/gpu/drm/i915/intel_drv.h
drivers/gpu/drm/i915/intel_lvds.c
drivers/gpu/drm/i915/intel_opregion.c
drivers/gpu/drm/i915/intel_panel.c
drivers/gpu/drm/i915/intel_ringbuffer.c
drivers/gpu/drm/radeon/atombios_dp.c
drivers/gpu/drm/radeon/evergreen.c
drivers/gpu/drm/radeon/radeon_connectors.c
drivers/gpu/drm/radeon/radeon_device.c
drivers/gpu/drm/radeon/radeon_encoders.c
drivers/gpu/drm/radeon/radeon_mode.h
drivers/gpu/drm/radeon/radeon_test.c
drivers/gpu/drm/radeon/radeon_ttm.c
drivers/gpu/drm/ttm/ttm_bo.c
drivers/gpu/drm/ttm/ttm_bo_util.c
drivers/hwmon/ibmaem.c
drivers/hwmon/pmbus/lm25066.c
drivers/hwmon/pmbus/pmbus.h
drivers/hwmon/pmbus/pmbus_core.c
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/iser/iser_initiator.c
drivers/misc/cb710/core.c
drivers/mmc/card/mmc_test.c
drivers/mmc/core/core.c
drivers/mmc/core/mmc.c
drivers/mmc/core/mmc_ops.c
drivers/mmc/host/dw_mmc.c
drivers/mmc/host/sdhci-esdhc-imx.c
drivers/mmc/host/sdhci-pxav3.c
drivers/mmc/host/sdhci-s3c.c
drivers/mmc/host/sdhci.c
drivers/mmc/host/tmio_mmc.c
drivers/pci/hotplug/pcihp_slot.c
drivers/pci/of.c
drivers/pci/pci.c
drivers/pci/pci.h
drivers/pci/probe.c
drivers/pci/setup-bus.c
drivers/pci/setup-res.c
drivers/power/max8997_charger.c
drivers/power/max8998_charger.c
drivers/power/s3c_adc_battery.c
drivers/rtc/interface.c
drivers/staging/gma500/mdfld_dsi_dbi.c
drivers/staging/gma500/mdfld_dsi_dbi.h
drivers/staging/gma500/mdfld_dsi_dpi.c
drivers/staging/gma500/mdfld_dsi_output.c
drivers/staging/gma500/medfield.h
drivers/staging/gma500/psb_drv.h
drivers/xen/xen-selfballoon.c
fs/befs/linuxvfs.c
fs/btrfs/ctree.h
fs/btrfs/extent-tree.c
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/tree-log.c
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/cifs/cifs_debug.c
fs/cifs/cifsacl.c
fs/cifs/cifsfs.h
fs/cifs/cifsglob.h
fs/cifs/connect.c
fs/cifs/dir.c
fs/cifs/transport.c
fs/ext4/ext4_jbd2.h
fs/ext4/indirect.c
fs/ext4/inode.c
fs/ext4/page-io.c
fs/ext4/super.c
fs/fat/dir.c
fs/fat/inode.c
fs/jfs/jfs_umount.c
fs/nfs/blocklayout/blocklayout.c
fs/nfs/callback.h
fs/nfs/callback_proc.c
fs/nfs/callback_xdr.c
fs/nfs/objlayout/objio_osd.c
fs/nfs/objlayout/pnfs_osd_xdr_cli.c
fs/xfs/Makefile
fs/xfs/kmem.c [new file with mode: 0644]
fs/xfs/kmem.h [new file with mode: 0644]
fs/xfs/linux-2.6/kmem.c [deleted file]
fs/xfs/linux-2.6/kmem.h [deleted file]
fs/xfs/linux-2.6/mrlock.h [deleted file]
fs/xfs/linux-2.6/time.h [deleted file]
fs/xfs/linux-2.6/xfs_acl.c [deleted file]
fs/xfs/linux-2.6/xfs_aops.c [deleted file]
fs/xfs/linux-2.6/xfs_aops.h [deleted file]
fs/xfs/linux-2.6/xfs_buf.c [deleted file]
fs/xfs/linux-2.6/xfs_buf.h [deleted file]
fs/xfs/linux-2.6/xfs_discard.c [deleted file]
fs/xfs/linux-2.6/xfs_discard.h [deleted file]
fs/xfs/linux-2.6/xfs_export.c [deleted file]
fs/xfs/linux-2.6/xfs_export.h [deleted file]
fs/xfs/linux-2.6/xfs_file.c [deleted file]
fs/xfs/linux-2.6/xfs_fs_subr.c [deleted file]
fs/xfs/linux-2.6/xfs_globals.c [deleted file]
fs/xfs/linux-2.6/xfs_ioctl.c [deleted file]
fs/xfs/linux-2.6/xfs_ioctl.h [deleted file]
fs/xfs/linux-2.6/xfs_ioctl32.c [deleted file]
fs/xfs/linux-2.6/xfs_ioctl32.h [deleted file]
fs/xfs/linux-2.6/xfs_iops.c [deleted file]
fs/xfs/linux-2.6/xfs_iops.h [deleted file]
fs/xfs/linux-2.6/xfs_linux.h [deleted file]
fs/xfs/linux-2.6/xfs_message.c [deleted file]
fs/xfs/linux-2.6/xfs_message.h [deleted file]
fs/xfs/linux-2.6/xfs_quotaops.c [deleted file]
fs/xfs/linux-2.6/xfs_stats.c [deleted file]
fs/xfs/linux-2.6/xfs_stats.h [deleted file]
fs/xfs/linux-2.6/xfs_super.c [deleted file]
fs/xfs/linux-2.6/xfs_super.h [deleted file]
fs/xfs/linux-2.6/xfs_sync.c [deleted file]
fs/xfs/linux-2.6/xfs_sync.h [deleted file]
fs/xfs/linux-2.6/xfs_sysctl.c [deleted file]
fs/xfs/linux-2.6/xfs_sysctl.h [deleted file]
fs/xfs/linux-2.6/xfs_trace.c [deleted file]
fs/xfs/linux-2.6/xfs_trace.h [deleted file]
fs/xfs/linux-2.6/xfs_vnode.h [deleted file]
fs/xfs/linux-2.6/xfs_xattr.c [deleted file]
fs/xfs/mrlock.h [new file with mode: 0644]
fs/xfs/quota/xfs_dquot.c [deleted file]
fs/xfs/quota/xfs_dquot.h [deleted file]
fs/xfs/quota/xfs_dquot_item.c [deleted file]
fs/xfs/quota/xfs_dquot_item.h [deleted file]
fs/xfs/quota/xfs_qm.c [deleted file]
fs/xfs/quota/xfs_qm.h [deleted file]
fs/xfs/quota/xfs_qm_bhv.c [deleted file]
fs/xfs/quota/xfs_qm_stats.c [deleted file]
fs/xfs/quota/xfs_qm_stats.h [deleted file]
fs/xfs/quota/xfs_qm_syscalls.c [deleted file]
fs/xfs/quota/xfs_quota_priv.h [deleted file]
fs/xfs/quota/xfs_trans_dquot.c [deleted file]
fs/xfs/support/uuid.c [deleted file]
fs/xfs/support/uuid.h [deleted file]
fs/xfs/time.h [new file with mode: 0644]
fs/xfs/uuid.c [new file with mode: 0644]
fs/xfs/uuid.h [new file with mode: 0644]
fs/xfs/xfs.h
fs/xfs/xfs_acl.c [new file with mode: 0644]
fs/xfs/xfs_aops.c [new file with mode: 0644]
fs/xfs/xfs_aops.h [new file with mode: 0644]
fs/xfs/xfs_buf.c [new file with mode: 0644]
fs/xfs/xfs_buf.h [new file with mode: 0644]
fs/xfs/xfs_discard.c [new file with mode: 0644]
fs/xfs/xfs_discard.h [new file with mode: 0644]
fs/xfs/xfs_dquot.c [new file with mode: 0644]
fs/xfs/xfs_dquot.h [new file with mode: 0644]
fs/xfs/xfs_dquot_item.c [new file with mode: 0644]
fs/xfs/xfs_dquot_item.h [new file with mode: 0644]
fs/xfs/xfs_export.c [new file with mode: 0644]
fs/xfs/xfs_export.h [new file with mode: 0644]
fs/xfs/xfs_file.c [new file with mode: 0644]
fs/xfs/xfs_fs_subr.c [new file with mode: 0644]
fs/xfs/xfs_globals.c [new file with mode: 0644]
fs/xfs/xfs_ioctl.c [new file with mode: 0644]
fs/xfs/xfs_ioctl.h [new file with mode: 0644]
fs/xfs/xfs_ioctl32.c [new file with mode: 0644]
fs/xfs/xfs_ioctl32.h [new file with mode: 0644]
fs/xfs/xfs_iops.c [new file with mode: 0644]
fs/xfs/xfs_iops.h [new file with mode: 0644]
fs/xfs/xfs_linux.h [new file with mode: 0644]
fs/xfs/xfs_message.c [new file with mode: 0644]
fs/xfs/xfs_message.h [new file with mode: 0644]
fs/xfs/xfs_qm.c [new file with mode: 0644]
fs/xfs/xfs_qm.h [new file with mode: 0644]
fs/xfs/xfs_qm_bhv.c [new file with mode: 0644]
fs/xfs/xfs_qm_stats.c [new file with mode: 0644]
fs/xfs/xfs_qm_stats.h [new file with mode: 0644]
fs/xfs/xfs_qm_syscalls.c [new file with mode: 0644]
fs/xfs/xfs_quota_priv.h [new file with mode: 0644]
fs/xfs/xfs_quotaops.c [new file with mode: 0644]
fs/xfs/xfs_stats.c [new file with mode: 0644]
fs/xfs/xfs_stats.h [new file with mode: 0644]
fs/xfs/xfs_super.c [new file with mode: 0644]
fs/xfs/xfs_super.h [new file with mode: 0644]
fs/xfs/xfs_sync.c [new file with mode: 0644]
fs/xfs/xfs_sync.h [new file with mode: 0644]
fs/xfs/xfs_sysctl.c [new file with mode: 0644]
fs/xfs/xfs_sysctl.h [new file with mode: 0644]
fs/xfs/xfs_trace.c [new file with mode: 0644]
fs/xfs/xfs_trace.h [new file with mode: 0644]
fs/xfs/xfs_trans_dquot.c [new file with mode: 0644]
fs/xfs/xfs_vnode.h [new file with mode: 0644]
fs/xfs/xfs_xattr.c [new file with mode: 0644]
include/asm-generic/memory_model.h
include/linux/blk_types.h
include/linux/blkdev.h
include/linux/blktrace_api.h
include/linux/bsg-lib.h [new file with mode: 0644]
include/linux/hash.h
include/linux/irq.h
include/linux/irqdesc.h
include/linux/loop.h
include/linux/miscdevice.h
include/linux/mm.h
include/linux/mmc/host.h
include/linux/pci.h
include/linux/pm_domain.h
include/linux/rtc.h
include/sound/tlv320aic3x.h
include/trace/events/block.h
kernel/irq/generic-chip.c
kernel/irq/irqdesc.c
kernel/irq/manage.c
kernel/lockdep.c
kernel/power/Kconfig
kernel/sysctl_binary.c
kernel/sysctl_check.c
kernel/trace/blktrace.c
mm/highmem.c
mm/vmalloc.c
sound/aoa/fabrics/layout.c
sound/pci/ac97/ac97_patch.c
sound/pci/azt3328.c
sound/pci/hda/alc268_quirks.c
sound/pci/hda/hda_eld.c
sound/pci/hda/patch_cirrus.c
sound/pci/hda/patch_realtek.c
sound/pci/hda/patch_sigmatel.c
sound/soc/codecs/wm8750.c
sound/soc/codecs/wm8903.c
sound/soc/codecs/wm8994.c
sound/soc/omap/n810.c
sound/soc/omap/omap-mcbsp.c
sound/soc/omap/omap-mcbsp.h
sound/soc/omap/omap-pcm.c
sound/soc/omap/omap-pcm.h
sound/soc/omap/rx51.c
sound/soc/samsung/Makefile
sound/soc/samsung/idma.c [new file with mode: 0644]
sound/soc/samsung/idma.h [new file with mode: 0644]
sound/soc/samsung/jive_wm8750.c
sound/soc/samsung/speyside_wm8962.c
sound/soc/tegra/tegra_pcm.c
sound/soc/tegra/tegra_wm8903.c
sound/usb/caiaq/audio.c
sound/usb/caiaq/device.h
sound/usb/mixer.c
sound/usb/quirks-table.h
tools/perf/builtin-probe.c
tools/perf/builtin-record.c
tools/perf/builtin-stat.c
tools/perf/util/dwarf-aux.c
tools/perf/util/dwarf-aux.h
tools/perf/util/evlist.c
tools/perf/util/header.c
tools/perf/util/include/linux/compiler.h
tools/perf/util/parse-events.c
tools/perf/util/probe-finder.c
tools/perf/util/probe-finder.h
tools/perf/util/symbol.c
tools/perf/util/symbol.h
tools/perf/util/ui/browsers/top.c

index 1f89424..65bbd26 100644 (file)
@@ -272,6 +272,8 @@ printk-formats.txt
        - how to get printk format specifiers right
 prio_tree.txt
        - info on radix-priority-search-tree use for indexing vmas.
+ramoops.txt
+       - documentation of the ramoops oops/panic logging module.
 rbtree.txt
        - info on what red-black trees are and what they are for.
 robust-futex-ABI.txt
index 3f5e0b0..53e6fca 100644 (file)
@@ -45,7 +45,7 @@ arrived in memory (this becomes more likely with devices behind PCI-PCI
 bridges).  In order to ensure that all the data has arrived in memory,
 the interrupt handler must read a register on the device which raised
 the interrupt.  PCI transaction ordering rules require that all the data
-arrives in memory before the value can be returned from the register.
+arrive in memory before the value may be returned from the register.
 Using MSIs avoids this problem as the interrupt-generating write cannot
 pass the data writes, so by the time the interrupt is raised, the driver
 knows that all the data has arrived in memory.
@@ -86,13 +86,13 @@ device.
 
 int pci_enable_msi(struct pci_dev *dev)
 
-A successful call will allocate ONE interrupt to the device, regardless
-of how many MSIs the device supports.  The device will be switched from
+A successful call allocates ONE interrupt to the device, regardless
+of how many MSIs the device supports.  The device is switched from
 pin-based interrupt mode to MSI mode.  The dev->irq number is changed
-to a new number which represents the message signaled interrupt.
-This function should be called before the driver calls request_irq()
-since enabling MSIs disables the pin-based IRQ and the driver will not
-receive interrupts on the old interrupt.
+to a new number which represents the message signaled interrupt;
+consequently, this function should be called before the driver calls
+request_irq(), because an MSI is delivered via a vector that is
+different from the vector of a pin-based interrupt.
 
 4.2.2 pci_enable_msi_block
 
@@ -111,20 +111,20 @@ the device are in the range dev->irq to dev->irq + count - 1.
 
 If this function returns a negative number, it indicates an error and
 the driver should not attempt to request any more MSI interrupts for
-this device.  If this function returns a positive number, it will be
-less than 'count' and indicate the number of interrupts that could have
-been allocated.  In neither case will the irq value have been
-updated, nor will the device have been switched into MSI mode.
+this device.  If this function returns a positive number, it is
+less than 'count' and indicates the number of interrupts that could have
+been allocated.  In neither case is the irq value updated or the device
+switched into MSI mode.
 
 The device driver must decide what action to take if
-pci_enable_msi_block() returns a value less than the number asked for.
-Some devices can make use of fewer interrupts than the maximum they
-request; in this case the driver should call pci_enable_msi_block()
+pci_enable_msi_block() returns a value less than the number requested.
+For instance, the driver could still make use of fewer interrupts;
+in this case the driver should call pci_enable_msi_block()
 again.  Note that it is not guaranteed to succeed, even when the
 'count' has been reduced to the value returned from a previous call to
 pci_enable_msi_block().  This is because there are multiple constraints
 on the number of vectors that can be allocated; pci_enable_msi_block()
-will return as soon as it finds any constraint that doesn't allow the
+returns as soon as it finds any constraint that doesn't allow the
 call to succeed.
 
 4.2.3 pci_disable_msi
@@ -137,10 +137,10 @@ interrupt number and frees the previously allocated message signaled
 interrupt(s).  The interrupt may subsequently be assigned to another
 device, so drivers should not cache the value of dev->irq.
 
-A device driver must always call free_irq() on the interrupt(s)
-for which it has called request_irq() before calling this function.
-Failure to do so will result in a BUG_ON(), the device will be left with
-MSI enabled and will leak its vector.
+Before calling this function, a device driver must always call free_irq()
+on any interrupt for which it previously called request_irq().
+Failure to do so results in a BUG_ON(), leaving the device with
+MSI enabled and thus leaking its vector.
 
 4.3 Using MSI-X
 
@@ -155,10 +155,10 @@ struct msix_entry {
 };
 
 This allows for the device to use these interrupts in a sparse fashion;
-for example it could use interrupts 3 and 1027 and allocate only a
+for example, it could use interrupts 3 and 1027 and yet allocate only a
 two-element array.  The driver is expected to fill in the 'entry' value
-in each element of the array to indicate which entries it wants the kernel
-to assign interrupts for.  It is invalid to fill in two entries with the
+in each element of the array to indicate for which entries the kernel
+should assign interrupts; it is invalid to fill in two entries with the
 same number.
 
 4.3.1 pci_enable_msix
@@ -168,10 +168,11 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
 Calling this function asks the PCI subsystem to allocate 'nvec' MSIs.
 The 'entries' argument is a pointer to an array of msix_entry structs
 which should be at least 'nvec' entries in size.  On success, the
-function will return 0 and the device will have been switched into
-MSI-X interrupt mode.  The 'vector' elements in each entry will have
-been filled in with the interrupt number.  The driver should then call
-request_irq() for each 'vector' that it decides to use.
+device is switched into MSI-X mode and the function returns 0.
+The 'vector' member in each entry is populated with the interrupt number;
+the driver should then call request_irq() for each 'vector' that it
+decides to use.  The device driver is responsible for keeping track of the
+interrupts assigned to the MSI-X vectors so it can free them again later.
 
 If this function returns a negative number, it indicates an error and
 the driver should not attempt to allocate any more MSI-X interrupts for
@@ -181,16 +182,14 @@ below.
 
 This function, in contrast with pci_enable_msi(), does not adjust
 dev->irq.  The device will not generate interrupts for this interrupt
-number once MSI-X is enabled.  The device driver is responsible for
-keeping track of the interrupts assigned to the MSI-X vectors so it can
-free them again later.
+number once MSI-X is enabled.
 
 Device drivers should normally call this function once per device
 during the initialization phase.
 
-It is ideal if drivers can cope with a variable number of MSI-X interrupts,
+It is ideal if drivers can cope with a variable number of MSI-X interrupts;
 there are many reasons why the platform may not be able to provide the
-exact number a driver asks for.
+exact number that a driver asks for.
 
 A request loop to achieve that might look like:
 
@@ -212,15 +211,15 @@ static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
 
 void pci_disable_msix(struct pci_dev *dev)
 
-This API should be used to undo the effect of pci_enable_msix().  It frees
+This function should be used to undo the effect of pci_enable_msix().  It frees
 the previously allocated message signaled interrupts.  The interrupts may
 subsequently be assigned to another device, so drivers should not cache
 the value of the 'vector' elements over a call to pci_disable_msix().
 
-A device driver must always call free_irq() on the interrupt(s)
-for which it has called request_irq() before calling this function.
-Failure to do so will result in a BUG_ON(), the device will be left with
-MSI enabled and will leak its vector.
+Before calling this function, a device driver must always call free_irq()
+on any interrupt for which it previously called request_irq().
+Failure to do so results in a BUG_ON(), leaving the device with
+MSI-X enabled and thus leaking its vector.
 
 4.3.3 The MSI-X Table
 
@@ -232,10 +231,10 @@ mask or unmask an interrupt, it should call disable_irq() / enable_irq().
 4.4 Handling devices implementing both MSI and MSI-X capabilities
 
 If a device implements both MSI and MSI-X capabilities, it can
-run in either MSI mode or MSI-X mode but not both simultaneously.
+run in either MSI mode or MSI-X mode, but not both simultaneously.
 This is a requirement of the PCI spec, and it is enforced by the
 PCI layer.  Calling pci_enable_msi() when MSI-X is already enabled or
-pci_enable_msix() when MSI is already enabled will result in an error.
+pci_enable_msix() when MSI is already enabled results in an error.
 If a device driver wishes to switch between MSI and MSI-X at runtime,
 it must first quiesce the device, then switch it back to pin-interrupt
 mode, before calling pci_enable_msi() or pci_enable_msix() and resuming
@@ -251,7 +250,7 @@ the MSI-X facilities in preference to the MSI facilities.  As mentioned
 above, MSI-X supports any number of interrupts between 1 and 2048.
 In constrast, MSI is restricted to a maximum of 32 interrupts (and
 must be a power of two).  In addition, the MSI interrupt vectors must
-be allocated consecutively, so the system may not be able to allocate
+be allocated consecutively, so the system might not be able to allocate
 as many vectors for MSI as it could for MSI-X.  On some platforms, MSI
 interrupts must all be targeted at the same set of CPUs whereas MSI-X
 interrupts can all be targeted at different CPUs.
@@ -281,7 +280,7 @@ disabled to enabled and back again.
 
 Using 'lspci -v' (as root) may show some devices with "MSI", "Message
 Signalled Interrupts" or "MSI-X" capabilities.  Each of these capabilities
-has an 'Enable' flag which will be followed with either "+" (enabled)
+has an 'Enable' flag which is followed with either "+" (enabled)
 or "-" (disabled).
 
 
@@ -298,7 +297,7 @@ The PCI stack provides three ways to disable MSIs:
 
 Some host chipsets simply don't support MSIs properly.  If we're
 lucky, the manufacturer knows this and has indicated it in the ACPI
-FADT table.  In this case, Linux will automatically disable MSIs.
+FADT table.  In this case, Linux automatically disables MSIs.
 Some boards don't include this information in the table and so we have
 to detect them ourselves.  The complete list of these is found near the
 quirk_disable_all_msi() function in drivers/pci/quirks.c.
@@ -317,7 +316,7 @@ Some bridges allow you to enable MSIs by changing some bits in their
 PCI configuration space (especially the Hypertransport chipsets such
 as the nVidia nForce and Serverworks HT2000).  As with host chipsets,
 Linux mostly knows about them and automatically enables MSIs if it can.
-If you have a bridge which Linux doesn't yet know about, you can enable
+If you have a bridge unknown to Linux, you can enable
 MSIs in configuration space using whatever method you know works, then
 enable MSIs on that bridge by doing:
 
@@ -327,7 +326,7 @@ where $bridge is the PCI address of the bridge you've enabled (eg
 0000:00:0e.0).
 
 To disable MSIs, echo 0 instead of 1.  Changing this value should be
-done with caution as it can break interrupt handling for all devices
+done with caution as it could break interrupt handling for all devices
 below this bridge.
 
 Again, please notify linux-pci@vger.kernel.org of any bridges that need
@@ -336,7 +335,7 @@ special handling.
 5.3. Disabling MSIs on a single device
 
 Some devices are known to have faulty MSI implementations.  Usually this
-is handled in the individual device driver but occasionally it's necessary
+is handled in the individual device driver, but occasionally it's necessary
 to handle this with a quirk.  Some drivers have an option to disable use
 of MSI.  While this is a convenient workaround for the driver author,
 it is not good practise, and should not be emulated.
@@ -350,7 +349,7 @@ for your machine.  You should also check your .config to be sure you
 have enabled CONFIG_PCI_MSI.
 
 Then, 'lspci -t' gives the list of bridges above a device.  Reading
-/sys/bus/pci/devices/*/msi_bus will tell you whether MSI are enabled (1)
+/sys/bus/pci/devices/*/msi_bus will tell you whether MSIs are enabled (1)
 or disabled (0).  If 0 is found in any of the msi_bus files belonging
 to bridges between the PCI root and the device, MSIs are disabled.
 
index 319baa8..36d16bb 100644 (file)
@@ -130,7 +130,7 @@ Linux kernel master tree:
        ftp.??.kernel.org:/pub/linux/kernel/...
        ?? == your country code, such as "us", "uk", "fr", etc.
 
-       http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git
+       http://git.kernel.org/?p=linux/kernel/git/torvalds/linux.git
 
 Linux kernel mailing list:
        linux-kernel@vger.kernel.org
index 569f353..4468ce2 100644 (file)
@@ -303,7 +303,7 @@ patches that are being emailed around.
 
 The sign-off is a simple line at the end of the explanation for the
 patch, which certifies that you wrote it or otherwise have the right to
-pass it on as a open-source patch.  The rules are pretty simple: if you
+pass it on as an open-source patch.  The rules are pretty simple: if you
 can certify the below:
 
         Developer's Certificate of Origin 1.1
index e578fee..6d670f5 100644 (file)
@@ -43,3 +43,74 @@ If one sets slice_idle=0 and if storage supports NCQ, CFQ internally switches
 to IOPS mode and starts providing fairness in terms of number of requests
 dispatched. Note that this mode switching takes effect only for group
 scheduling. For non-cgroup users nothing should change.
+
+CFQ IO scheduler Idling Theory
+===============================
+Idling on a queue is primarily about waiting for the next request to come
+on same queue after completion of a request. In this process CFQ will not
+dispatch requests from other cfq queues even if requests are pending there.
+
+The rationale behind idling is that it can cut down on number of seeks
+on rotational media. For example, if a process is doing dependent
+sequential reads (next read will come on only after completion of previous
+one), then not dispatching request from other queue should help as we
+did not move the disk head and kept on dispatching sequential IO from
+one queue.
+
+CFQ has following service trees and various queues are put on these trees.
+
+       sync-idle       sync-noidle     async
+
+All cfq queues doing synchronous sequential IO go on to sync-idle tree.
+On this tree we idle on each queue individually.
+
+All synchronous non-sequential queues go on sync-noidle tree. Also any
+request which are marked with REQ_NOIDLE go on this service tree. On this
+tree we do not idle on individual queues instead idle on the whole group
+of queues or the tree. So if there are 4 queues waiting for IO to dispatch
+we will idle only once last queue has dispatched the IO and there is
+no more IO on this service tree.
+
+All async writes go on async service tree. There is no idling on async
+queues.
+
+CFQ has some optimizations for SSDs and if it detects a non-rotational
+media which can support higher queue depth (multiple requests at in
+flight at a time), then it cuts down on idling of individual queues and
+all the queues move to sync-noidle tree and only tree idle remains. This
+tree idling provides isolation with buffered write queues on async tree.
+
+FAQ
+===
+Q1. Why to idle at all on queues marked with REQ_NOIDLE.
+
+A1. We only do tree idle (all queues on sync-noidle tree) on queues marked
+    with REQ_NOIDLE. This helps in providing isolation with all the sync-idle
+    queues. Otherwise in presence of many sequential readers, other
+    synchronous IO might not get fair share of disk.
+
+    For example, if there are 10 sequential readers doing IO and they get
+    100ms each. If a REQ_NOIDLE request comes in, it will be scheduled
+    roughly after 1 second. If after completion of REQ_NOIDLE request we
+    do not idle, and after a couple of milli seconds a another REQ_NOIDLE
+    request comes in, again it will be scheduled after 1second. Repeat it
+    and notice how a workload can lose its disk share and suffer due to
+    multiple sequential readers.
+
+    fsync can generate dependent IO where bunch of data is written in the
+    context of fsync, and later some journaling data is written. Journaling
+    data comes in only after fsync has finished its IO (atleast for ext4
+    that seemed to be the case). Now if one decides not to idle on fsync
+    thread due to REQ_NOIDLE, then next journaling write will not get
+    scheduled for another second. A process doing small fsync, will suffer
+    badly in presence of multiple sequential readers.
+
+    Hence doing tree idling on threads using REQ_NOIDLE flag on requests
+    provides isolation from multiple sequential readers and at the same
+    time we do not idle on individual threads.
+
+Q2. When to specify REQ_NOIDLE
+A2. I would think whenever one is doing synchronous write and not expecting
+    more writes to be dispatched from same context soon, should be able
+    to specify REQ_NOIDLE on writes and that probably should work well for
+    most of the cases.
index a0b58e2..860c29a 100644 (file)
@@ -199,18 +199,16 @@ to coerce it into behaving.
 
 To beat some sense out of the internal editor, do this:
 
-- Under account settings, composition and addressing, uncheck "Compose
-  messages in HTML format".
-
 - Edit your Thunderbird config settings so that it won't use format=flowed.
   Go to "edit->preferences->advanced->config editor" to bring up the
   thunderbird's registry editor, and set "mailnews.send_plaintext_flowed" to
   "false".
 
-- Enable "preformat" mode: Shft-click on the Write icon to bring up the HTML
-  composer, select "Preformat" from the drop-down box just under the subject
-  line, then close the message without saving.  (This setting also applies to
-  the text composer, but the only control for it is in the HTML composer.)
+- Disable HTML Format: Set "mail.identity.id1.compose_html" to "false".
+
+- Enable "preformat" mode: Set "editor.quotesPreformatted" to "true".
+
+- Enable UTF8: Set "prefs.converted-to-utf8" to "true".
 
 - Install the "toggle wordwrap" extension.  Download the file from:
     https://addons.mozilla.org/thunderbird/addon/2351/
index 6e49c36..da45e6c 100644 (file)
@@ -27,7 +27,7 @@ His original code can still be found at:
 Does anyone know of a more current email address for Makoto? He doesn't
 respond to the address given above...
 
-Current maintainer: Sergey S. Kostyliov <rathamahata@php4.ru>
+This filesystem doesn't have a maintainer.
 
 WHAT IS THIS DRIVER?
 ==================
index 9a86746..0e0734b 100644 (file)
        (including this document itself) have been moved there, and might
        be more up to date than the web version.
 
-     * Name: "Linux Source Driver"
-       URL: http://lsd.linux.cz
-       Keywords: Browsing source code.
-       Description: "Linux Source Driver (LSD) is an application, which
-       can make browsing source codes of Linux kernel easier than you can
-       imagine. You can select between multiple versions of kernel (e.g.
-       0.01, 1.0.0, 2.0.33, 2.0.34pre13, 2.0.0, 2.1.101 etc.). With LSD
-       you can search Linux kernel (fulltext, macros, types, functions
-       and variables) and LSD can generate patches for you on the fly
-       (files, directories or kernel)".
-
      * Name: "Linux Kernel Source Reference"
        Author: Thomas Graichen.
        URL: http://marc.info/?l=linux-kernel&m=96446640102205&w=4
index 78926aa..614d038 100644 (file)
@@ -40,6 +40,7 @@ parameter is applicable:
        ALSA    ALSA sound support is enabled.
        APIC    APIC support is enabled.
        APM     Advanced Power Management support is enabled.
+       ARM     ARM architecture is enabled.
        AVR32   AVR32 architecture is enabled.
        AX25    Appropriate AX.25 support is enabled.
        BLACKFIN Blackfin architecture is enabled.
@@ -49,6 +50,7 @@ parameter is applicable:
        EFI     EFI Partitioning (GPT) is enabled
        EIDE    EIDE/ATAPI support is enabled.
        FB      The frame buffer device is enabled.
+       FTRACE  Function tracing enabled.
        GCOV    GCOV profiling is enabled.
        HW      Appropriate hardware is enabled.
        IA-64   IA-64 architecture is enabled.
@@ -69,6 +71,7 @@ parameter is applicable:
                        Documentation/m68k/kernel-options.txt.
        MCA     MCA bus support is enabled.
        MDA     MDA console support is enabled.
+       MIPS    MIPS architecture is enabled.
        MOUSE   Appropriate mouse support is enabled.
        MSI     Message Signaled Interrupts (PCI).
        MTD     MTD (Memory Technology Device) support is enabled.
@@ -100,7 +103,6 @@ parameter is applicable:
        SPARC   Sparc architecture is enabled.
        SWSUSP  Software suspend (hibernation) is enabled.
        SUSPEND System suspend states are enabled.
-       FTRACE  Function tracing enabled.
        TPM     TPM drivers are enabled.
        TS      Appropriate touchscreen support is enabled.
        UMS     USB Mass Storage support is enabled.
@@ -115,7 +117,7 @@ parameter is applicable:
        X86-64  X86-64 architecture is enabled.
                        More X86-64 boot options can be found in
                        Documentation/x86/x86_64/boot-options.txt .
-       X86     Either 32bit or 64bit x86 (same as X86-32+X86-64)
+       X86     Either 32-bit or 64-bit x86 (same as X86-32+X86-64)
        XEN     Xen support is enabled
 
 In addition, the following text indicates that the option:
@@ -376,7 +378,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        atkbd.softrepeat= [HW]
                        Use software keyboard repeat
 
-       autotest        [IA64]
+       autotest        [IA-64]
 
        baycom_epp=     [HW,AX25]
                        Format: <io>,<mode>
@@ -681,8 +683,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                uart[8250],mmio32,<addr>[,options]
                        Start an early, polled-mode console on the 8250/16550
                        UART at the specified I/O port or MMIO address.
-                       MMIO inter-register address stride is either 8bit (mmio)
-                        or 32bit (mmio32).
+                       MMIO inter-register address stride is either 8-bit
+                       (mmio) or 32-bit (mmio32).
                        The options are the same as for ttyS, above.
 
        earlyprintk=    [X86,SH,BLACKFIN]
@@ -725,7 +727,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        See Documentation/block/as-iosched.txt and
                        Documentation/block/deadline-iosched.txt for details.
 
-       elfcorehdr=     [IA64,PPC,SH,X86]
+       elfcorehdr=     [IA-64,PPC,SH,X86]
                        Specifies physical address of start of kernel core
                        image elf header. Generally kexec loader will
                        pass this option to capture kernel.
@@ -791,7 +793,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        tracer at boot up. function-list is a comma separated
                        list of functions. This list can be changed at run
                        time by the set_ftrace_filter file in the debugfs
-                       tracing directory. 
+                       tracing directory.
 
        ftrace_notrace=[function-list]
                        [FTRACE] Do not trace the functions specified in
@@ -829,7 +831,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        hashdist=       [KNL,NUMA] Large hashes allocated during boot
                        are distributed across NUMA nodes.  Defaults on
-                       for 64bit NUMA, off otherwise.
+                       for 64-bit NUMA, off otherwise.
                        Format: 0 | 1 (for off | on)
 
        hcl=            [IA-64] SGI's Hardware Graph compatibility layer
@@ -998,10 +1000,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        DMA.
                forcedac [x86_64]
                        With this option iommu will not optimize to look
-                       for io virtual address below 32 bit forcing dual
+                       for io virtual address below 32-bit forcing dual
                        address cycle on pci bus for cards supporting greater
-                       than 32 bit addressing. The default is to look
-                       for translation below 32 bit and if not available
+                       than 32-bit addressing. The default is to look
+                       for translation below 32-bit and if not available
                        then look in the higher range.
                strict [Default Off]
                        With this option on every unmap_single operation will
@@ -1017,7 +1019,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        off     disable Interrupt Remapping
                        nosid   disable Source ID checking
 
-       inttest=        [IA64]
+       inttest=        [IA-64]
 
        iomem=          Disable strict checking of access to MMIO memory
                strict  regions from userspace.
@@ -1034,7 +1036,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                nomerge
                forcesac
                soft
-               pt      [x86, IA64]
+               pt      [x86, IA-64]
 
        io7=            [HW] IO7 for Marvel based alpha systems
                        See comment before marvel_specify_io7 in
@@ -1165,7 +1167,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        kvm-amd.npt=    [KVM,AMD] Disable nested paging (virtualized MMU)
                        for all guests.
-                       Default is 1 (enabled) if in 64bit or 32bit-PAE mode
+                       Default is 1 (enabled) if in 64-bit or 32-bit PAE mode.
 
        kvm-intel.ept=  [KVM,Intel] Disable extended page tables
                        (virtualized MMU) support on capable Intel chips.
@@ -1202,10 +1204,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        libata.dma=0      Disable all PATA and SATA DMA
                        libata.dma=1      PATA and SATA Disk DMA only
                        libata.dma=2      ATAPI (CDROM) DMA only
-                       libata.dma=4      Compact Flash DMA only 
+                       libata.dma=4      Compact Flash DMA only
                        Combinations also work, so libata.dma=3 enables DMA
                        for disks and CDROMs, but not CFs.
-       
+
        libata.ignore_hpa=      [LIBATA] Ignore HPA limit
                        libata.ignore_hpa=0       keep BIOS limits (default)
                        libata.ignore_hpa=1       ignore limits, using full disk
@@ -1331,7 +1333,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        ltpc=           [NET]
                        Format: <io>,<irq>,<dma>
 
-       machvec=        [IA64] Force the use of a particular machine-vector
+       machvec=        [IA-64] Force the use of a particular machine-vector
                        (machvec) in a generic kernel.
                        Example: machvec=hpzx1_swiotlb
 
@@ -1348,9 +1350,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        it is equivalent to "nosmp", which also disables
                        the IO APIC.
 
-       max_loop=       [LOOP] Maximum number of loopback devices that can
-                       be mounted
-                       Format: <1-256>
+       max_loop=       [LOOP] The number of loop block devices that get
+       (loop.max_loop) unconditionally pre-created at init time. The default
+                       number is configured by BLK_DEV_LOOP_MIN_COUNT. Instead
+                       of statically allocating a predefined number, loop
+                       devices can be requested on-demand with the
+                       /dev/loop-control interface.
 
        mcatest=        [IA-64]
 
@@ -1734,7 +1739,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        nointroute      [IA-64]
 
-       nojitter        [IA64] Disables jitter checking for ITC timers.
+       nojitter        [IA-64] Disables jitter checking for ITC timers.
 
        no-kvmclock     [X86,KVM] Disable paravirtualized KVM clock driver
 
@@ -1800,7 +1805,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        nox2apic        [X86-64,APIC] Do not enable x2APIC mode.
 
-       nptcg=          [IA64] Override max number of concurrent global TLB
+       nptcg=          [IA-64] Override max number of concurrent global TLB
                        purges which is reported from either PAL_VM_SUMMARY or
                        SAL PALO.
 
@@ -2077,7 +2082,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                        Format: { parport<nr> | timid | 0 }
                        See also Documentation/parport.txt.
 
-       pmtmr=          [X86] Manual setup of pmtmr I/O Port. 
+       pmtmr=          [X86] Manual setup of pmtmr I/O Port.
                        Override pmtimer IOPort with a hex value.
                        e.g. pmtmr=0x508
 
@@ -2635,6 +2640,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
                                        medium is write-protected).
                        Example: quirks=0419:aaf5:rl,0421:0433:rc
 
+       user_debug=     [KNL,ARM]
+                       Format: <int>
+                       See arch/arm/Kconfig.debug help text.
+                                1 - undefined instruction events
+                                2 - system calls
+                                4 - invalid data aborts
+                                8 - SIGSEGV faults
+                               16 - SIGBUS faults
+                       Example: user_debug=31
+
        userpte=
                        [X86] Flags controlling user PTE allocations.
 
diff --git a/Documentation/ramoops.txt b/Documentation/ramoops.txt
new file mode 100644 (file)
index 0000000..8fb1ba7
--- /dev/null
@@ -0,0 +1,76 @@
+Ramoops oops/panic logger
+=========================
+
+Sergiu Iordache <sergiu@chromium.org>
+
+Updated: 8 August 2011
+
+0. Introduction
+
+Ramoops is an oops/panic logger that writes its logs to RAM before the system
+crashes. It works by logging oopses and panics in a circular buffer. Ramoops
+needs a system with persistent RAM so that the content of that area can
+survive after a restart.
+
+1. Ramoops concepts
+
+Ramoops uses a predefined memory area to store the dump. The start and size of
+the memory area are set using two variables:
+  * "mem_address" for the start
+  * "mem_size" for the size. The memory size will be rounded down to a
+  power of two.
+
+The memory area is divided into "record_size" chunks (also rounded down to
+power of two) and each oops/panic writes a "record_size" chunk of
+information.
+
+Dumping both oopses and panics can be done by setting 1 in the "dump_oops"
+variable while setting 0 in that variable dumps only the panics.
+
+The module uses a counter to record multiple dumps but the counter gets reset
+on restart (i.e. new dumps after the restart will overwrite old ones).
+
+2. Setting the parameters
+
+Setting the ramoops parameters can be done in 2 different manners:
+ 1. Use the module parameters (which have the names of the variables described
+ as before).
+ 2. Use a platform device and set the platform data. The parameters can then
+ be set through that platform data. An example of doing that is:
+
+#include <linux/ramoops.h>
+[...]
+
+static struct ramoops_platform_data ramoops_data = {
+        .mem_size               = <...>,
+        .mem_address            = <...>,
+        .record_size            = <...>,
+        .dump_oops              = <...>,
+};
+
+static struct platform_device ramoops_dev = {
+        .name = "ramoops",
+        .dev = {
+                .platform_data = &ramoops_data,
+        },
+};
+
+[... inside a function ...]
+int ret;
+
+ret = platform_device_register(&ramoops_dev);
+if (ret) {
+       printk(KERN_ERR "unable to register platform device\n");
+       return ret;
+}
+
+3. Dump format
+
+The data dump begins with a header, currently defined as "====" followed by a
+timestamp and a new line. The dump then continues with the actual data.
+
+4. Reading the data
+
+The dump data can be read from memory (through /dev/mem or other means).
+Getting the module parameters, which are needed in order to parse the data, can
+be done through /sys/module/ramoops/parameters/* .
index fe0251c..8e60199 100644 (file)
@@ -8,3 +8,6 @@ lguest/
        - Extremely simple hypervisor for experimental/educational use.
 uml/
        - User Mode Linux, builds/runs Linux kernel as a userspace program.
+virtio.txt
+       - Text version of draft virtio spec.
+          See http://ozlabs.org/~rusty/virtio-spec
index 043bd7d..d928c13 100644 (file)
@@ -1996,6 +1996,9 @@ int main(int argc, char *argv[])
        /* We use a simple helper to copy the arguments separated by spaces. */
        concat((char *)(boot + 1), argv+optind+2);
 
+       /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
+       boot->hdr.kernel_alignment = 0x1000000;
+
        /* Boot protocol version: 2.07 supports the fields for lguest. */
        boot->hdr.version = 0x207;
 
diff --git a/Documentation/virtual/virtio-spec.txt b/Documentation/virtual/virtio-spec.txt
new file mode 100644 (file)
index 0000000..a350ae1
--- /dev/null
@@ -0,0 +1,2200 @@
+[Generated file: see http://ozlabs.org/~rusty/virtio-spec/]
+Virtio PCI Card Specification
+v0.9.1 DRAFT
+-
+
+Rusty Russell <rusty@rustcorp.com.au>IBM Corporation (Editor)
+
+2011 August 1.
+
+Purpose and Description
+
+This document describes the specifications of the “virtio” family
+of PCI[LaTeX Command: nomenclature] devices. These are devices
+are found in virtual environments[LaTeX Command: nomenclature],
+yet by design they are not all that different from physical PCI
+devices, and this document treats them as such. This allows the
+guest to use standard PCI drivers and discovery mechanisms.
+
+The purpose of virtio and this specification is that virtual
+environments and guests should have a straightforward, efficient,
+standard and extensible mechanism for virtual devices, rather
+than boutique per-environment or per-OS mechanisms.
+
+  Straightforward: Virtio PCI devices use normal PCI mechanisms
+  of interrupts and DMA which should be familiar to any device
+  driver author. There is no exotic page-flipping or COW
+  mechanism: it's just a PCI device.[footnote:
+This lack of page-sharing implies that the implementation of the
+device (e.g. the hypervisor or host) needs full access to the
+guest memory. Communication with untrusted parties (i.e.
+inter-guest communication) requires copying.
+]
+
+  Efficient: Virtio PCI devices consist of rings of descriptors
+  for input and output, which are neatly separated to avoid cache
+  effects from both guest and device writing to the same cache
+  lines.
+
+  Standard: Virtio PCI makes no assumptions about the environment
+  in which it operates, beyond supporting PCI. In fact the virtio
+  devices specified in the appendices do not require PCI at all:
+  they have been implemented on non-PCI buses.[footnote:
+The Linux implementation further separates the PCI virtio code
+from the specific virtio drivers: these drivers are shared with
+the non-PCI implementations (currently lguest and S/390).
+]
+
+  Extensible: Virtio PCI devices contain feature bits which are
+  acknowledged by the guest operating system during device setup.
+  This allows forwards and backwards compatibility: the device
+  offers all the features it knows about, and the driver
+  acknowledges those it understands and wishes to use.
+
+  Virtqueues
+
+The mechanism for bulk data transport on virtio PCI devices is
+pretentiously called a virtqueue. Each device can have zero or
+more virtqueues: for example, the network device has one for
+transmit and one for receive.
+
+Each virtqueue occupies two or more physically-contiguous pages
+(defined, for the purposes of this specification, as 4096 bytes),
+and consists of three parts:
+
+
++-------------------+-----------------------------------+-----------+
+| Descriptor Table  |   Available Ring     (padding)    | Used Ring |
++-------------------+-----------------------------------+-----------+
+
+
+When the driver wants to send buffers to the device, it puts them
+in one or more slots in the descriptor table, and writes the
+descriptor indices into the available ring. It then notifies the
+device. When the device has finished with the buffers, it writes
+the descriptors into the used ring, and sends an interrupt.
+
+Specification
+
+  PCI Discovery
+
+Any PCI device with Vendor ID 0x1AF4, and Device ID 0x1000
+through 0x103F inclusive is a virtio device[footnote:
+The actual value within this range is ignored
+]. The device must also have a Revision ID of 0 to match this
+specification.
+
+The Subsystem Device ID indicates which virtio device is
+supported by the device. The Subsystem Vendor ID should reflect
+the PCI Vendor ID of the environment (it's currently only used
+for informational purposes by the guest).
+
+
++----------------------+--------------------+---------------+
+| Subsystem Device ID  |   Virtio Device    | Specification |
++----------------------+--------------------+---------------+
++----------------------+--------------------+---------------+
+|          1           |   network card     |  Appendix C   |
++----------------------+--------------------+---------------+
+|          2           |   block device     |  Appendix D   |
++----------------------+--------------------+---------------+
+|          3           |      console       |  Appendix E   |
++----------------------+--------------------+---------------+
+|          4           |  entropy source    |  Appendix F   |
++----------------------+--------------------+---------------+
+|          5           | memory ballooning  |  Appendix G   |
++----------------------+--------------------+---------------+
+|          6           |     ioMemory       |       -       |
++----------------------+--------------------+---------------+
+|          9           |   9P transport     |       -       |
++----------------------+--------------------+---------------+
+
+
+  Device Configuration
+
+To configure the device, we use the first I/O region of the PCI
+device. This contains a virtio header followed by a
+device-specific region.
+
+There may be different widths of accesses to the I/O region; the “
+natural” access method for each field in the virtio header must
+be used (i.e. 32-bit accesses for 32-bit fields, etc), but the
+device-specific region can be accessed using any width accesses,
+and should obtain the same results.
+
+Note that this is possible because while the virtio header is PCI
+(i.e. little) endian, the device-specific region is encoded in
+the native endian of the guest (where such distinction is
+applicable).
+
+  Device Initialization Sequence
+
+We start with an overview of device initialization, then expand
+on the details of the device and how each step is preformed.
+
+  Reset the device. This is not required on initial start up.
+
+  The ACKNOWLEDGE status bit is set: we have noticed the device.
+
+  The DRIVER status bit is set: we know how to drive the device.
+
+  Device-specific setup, including reading the Device Feature
+  Bits, discovery of virtqueues for the device, optional MSI-X
+  setup, and reading and possibly writing the virtio
+  configuration space.
+
+  The subset of Device Feature Bits understood by the driver is
+  written to the device.
+
+  The DRIVER_OK status bit is set.
+
+  The device can now be used (ie. buffers added to the
+  virtqueues)[footnote:
+Historically, drivers have used the device before steps 5 and 6.
+This is only allowed if the driver does not use any features
+which would alter this early use of the device.
+]
+
+If any of these steps go irrecoverably wrong, the guest should
+set the FAILED status bit to indicate that it has given up on the
+device (it can reset the device later to restart if desired).
+
+We now cover the fields required for general setup in detail.
+
+  Virtio Header
+
+The virtio header looks as follows:
+
+
++------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
+| Bits       || 32                  | 32                  | 32       | 16     | 16      | 16      | 8       | 8      |
++------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
+| Read/Write || R                   | R+W                 | R+W      | R      | R+W     | R+W     | R+W     | R      |
++------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
+| Purpose    || Device              | Guest               | Queue    | Queue  | Queue   | Queue   | Device  | ISR    |
+|            || Features bits 0:31  | Features bits 0:31  | Address  | Size   | Select  | Notify  | Status  | Status |
++------------++---------------------+---------------------+----------+--------+---------+---------+---------+--------+
+
+
+If MSI-X is enabled for the device, two additional fields
+immediately follow this header:
+
+
++------------++----------------+--------+
+| Bits       || 16             | 16     |
+              +----------------+--------+
++------------++----------------+--------+
+| Read/Write || R+W            | R+W    |
++------------++----------------+--------+
+| Purpose    || Configuration  | Queue  |
+| (MSI-X)    || Vector         | Vector |
++------------++----------------+--------+
+
+
+Finally, if feature bits (VIRTIO_F_FEATURES_HI) this is
+immediately followed by two additional fields:
+
+
++------------++----------------------+----------------------
+| Bits       || 32                   | 32
++------------++----------------------+----------------------
+| Read/Write || R                    | R+W
++------------++----------------------+----------------------
+| Purpose    || Device               | Guest
+|            || Features bits 32:63  | Features bits 32:63
++------------++----------------------+----------------------
+
+
+Immediately following these general headers, there may be
+device-specific headers:
+
+
++------------++--------------------+
+| Bits       || Device Specific    |
+              +--------------------+
++------------++--------------------+
+| Read/Write || Device Specific    |
++------------++--------------------+
+| Purpose    || Device Specific... |
+|            ||                    |
++------------++--------------------+
+
+
+  Device Status
+
+The Device Status field is updated by the guest to indicate its
+progress. This provides a simple low-level diagnostic: it's most
+useful to imagine them hooked up to traffic lights on the console
+indicating the status of each device.
+
+The device can be reset by writing a 0 to this field, otherwise
+at least one bit should be set:
+
+  ACKNOWLEDGE (1) Indicates that the guest OS has found the
+  device and recognized it as a valid virtio device.
+
+  DRIVER (2) Indicates that the guest OS knows how to drive the
+  device. Under Linux, drivers can be loadable modules so there
+  may be a significant (or infinite) delay before setting this
+  bit.
+
+  DRIVER_OK (3) Indicates that the driver is set up and ready to
+  drive the device.
+
+  FAILED (8) Indicates that something went wrong in the guest,
+  and it has given up on the device. This could be an internal
+  error, or the driver didn't like the device for some reason, or
+  even a fatal error during device operation. The device must be
+  reset before attempting to re-initialize.
+
+  Feature Bits
+
+The least significant 31 bits of the first configuration field
+indicates the features that the device supports (the high bit is
+reserved, and will be used to indicate the presence of future
+feature bits elsewhere). If more than 31 feature bits are
+supported, the device indicates so by setting feature bit 31 (see
+[cha:Reserved-Feature-Bits]). The bits are allocated as follows:
+
+  0 to 23 Feature bits for the specific device type
+
+  24 to 40 Feature bits reserved for extensions to the queue and
+  feature negotiation mechanisms
+
+  41 to 63 Feature bits reserved for future extensions
+
+For example, feature bit 0 for a network device (i.e. Subsystem
+Device ID 1) indicates that the device supports checksumming of
+packets.
+
+The feature bits are negotiated: the device lists all the
+features it understands in the Device Features field, and the
+guest writes the subset that it understands into the Guest
+Features field. The only way to renegotiate is to reset the
+device.
+
+In particular, new fields in the device configuration header are
+indicated by offering a feature bit, so the guest can check
+before accessing that part of the configuration space.
+
+This allows for forwards and backwards compatibility: if the
+device is enhanced with a new feature bit, older guests will not
+write that feature bit back to the Guest Features field and it
+can go into backwards compatibility mode. Similarly, if a guest
+is enhanced with a feature that the device doesn't support, it
+will not see that feature bit in the Device Features field and
+can go into backwards compatibility mode (or, for poor
+implementations, set the FAILED Device Status bit).
+
+Access to feature bits 32 to 63 is enabled by Guest by setting
+feature bit 31. If this bit is unset, Device must assume that all
+feature bits > 31 are unset.
+
+  Configuration/Queue Vectors
+
+When MSI-X capability is present and enabled in the device
+(through standard PCI configuration space) 4 bytes at byte offset
+20 are used to map configuration change and queue interrupts to
+MSI-X vectors. In this case, the ISR Status field is unused, and
+device specific configuration starts at byte offset 24 in virtio
+header structure. When MSI-X capability is not enabled, device
+specific configuration starts at byte offset 20 in virtio header.
+
+Writing a valid MSI-X Table entry number, 0 to 0x7FF, to one of
+Configuration/Queue Vector registers, maps interrupts triggered
+by the configuration change/selected queue events respectively to
+the corresponding MSI-X vector. To disable interrupts for a
+specific event type, unmap it by writing a special NO_VECTOR
+value:
+
+/* Vector value used to disable MSI for queue */
+
+#define VIRTIO_MSI_NO_VECTOR            0xffff
+
+Reading these registers returns vector mapped to a given event,
+or NO_VECTOR if unmapped. All queue and configuration change
+events are unmapped by default.
+
+Note that mapping an event to vector might require allocating
+internal device resources, and might fail. Devices report such
+failures by returning the NO_VECTOR value when the relevant
+Vector field is read. After mapping an event to vector, the
+driver must verify success by reading the Vector field value: on
+success, the previously written value is returned, and on
+failure, NO_VECTOR is returned. If a mapping failure is detected,
+the driver can retry mapping with fewervectors, or disable MSI-X.
+
+  Virtqueue Configuration
+
+As a device can have zero or more virtqueues for bulk data
+transport (for example, the network driver has two), the driver
+needs to configure them as part of the device-specific
+configuration.
+
+This is done as follows, for each virtqueue a device has:
+
+  Write the virtqueue index (first queue is 0) to the Queue
+  Select field.
+
+  Read the virtqueue size from the Queue Size field, which is
+  always a power of 2. This controls how big the virtqueue is
+  (see below). If this field is 0, the virtqueue does not exist.
+
+  Allocate and zero virtqueue in contiguous physical memory, on a
+  4096 byte alignment. Write the physical address, divided by
+  4096 to the Queue Address field.[footnote:
+The 4096 is based on the x86 page size, but it's also large
+enough to ensure that the separate parts of the virtqueue are on
+separate cache lines.
+]
+
+  Optionally, if MSI-X capability is present and enabled on the
+  device, select a vector to use to request interrupts triggered
+  by virtqueue events. Write the MSI-X Table entry number
+  corresponding to this vector in Queue Vector field. Read the
+  Queue Vector field: on success, previously written value is
+  returned; on failure, NO_VECTOR value is returned.
+
+The Queue Size field controls the total number of bytes required
+for the virtqueue according to the following formula:
+
+#define ALIGN(x) (((x) + 4095) & ~4095)
+
+static inline unsigned vring_size(unsigned int qsz)
+
+{
+
+     return ALIGN(sizeof(struct vring_desc)*qsz + sizeof(u16)*(2
++ qsz))
+
+          + ALIGN(sizeof(struct vring_used_elem)*qsz);
+
+}
+
+This currently wastes some space with padding, but also allows
+future extensions. The virtqueue layout structure looks like this
+(qsz is the Queue Size field, which is a variable, so this code
+won't compile):
+
+struct vring {
+
+    /* The actual descriptors (16 bytes each) */
+
+    struct vring_desc desc[qsz];
+
+
+
+    /* A ring of available descriptor heads with free-running
+index. */
+
+    struct vring_avail avail;
+
+
+
+    // Padding to the next 4096 boundary.
+
+    char pad[];
+
+
+
+    // A ring of used descriptor heads with free-running index.
+
+    struct vring_used used;
+
+};
+
+  A Note on Virtqueue Endianness
+
+Note that the endian of these fields and everything else in the
+virtqueue is the native endian of the guest, not little-endian as
+PCI normally is. This makes for simpler guest code, and it is
+assumed that the host already has to be deeply aware of the guest
+endian so such an “endian-aware” device is not a significant
+issue.
+
+  Descriptor Table
+
+The descriptor table refers to the buffers the guest is using for
+the device. The addresses are physical addresses, and the buffers
+can be chained via the next field. Each descriptor describes a
+buffer which is read-only or write-only, but a chain of
+descriptors can contain both read-only and write-only buffers.
+
+No descriptor chain may be more than 2^32 bytes long in total.struct vring_desc {
+
+    /* Address (guest-physical). */
+
+    u64 addr;
+
+    /* Length. */
+
+    u32 len;
+
+/* This marks a buffer as continuing via the next field. */
+
+#define VRING_DESC_F_NEXT   1
+
+/* This marks a buffer as write-only (otherwise read-only). */
+
+#define VRING_DESC_F_WRITE     2
+
+/* This means the buffer contains a list of buffer descriptors.
+*/
+
+#define VRING_DESC_F_INDIRECT   4
+
+    /* The flags as indicated above. */
+
+    u16 flags;
+
+    /* Next field if flags & NEXT */
+
+    u16 next;
+
+};
+
+The number of descriptors in the table is specified by the Queue
+Size field for this virtqueue.
+
+  <sub:Indirect-Descriptors>Indirect Descriptors
+
+Some devices benefit by concurrently dispatching a large number
+of large requests. The VIRTIO_RING_F_INDIRECT_DESC feature can be
+used to allow this (see [cha:Reserved-Feature-Bits]). To increase
+ring capacity it is possible to store a table of indirect
+descriptors anywhere in memory, and insert a descriptor in main
+virtqueue (with flags&INDIRECT on) that refers to memory buffer
+containing this indirect descriptor table; fields addr and len
+refer to the indirect table address and length in bytes,
+respectively. The indirect table layout structure looks like this
+(len is the length of the descriptor that refers to this table,
+which is a variable, so this code won't compile):
+
+struct indirect_descriptor_table {
+
+    /* The actual descriptors (16 bytes each) */
+
+    struct vring_desc desc[len / 16];
+
+};
+
+The first indirect descriptor is located at start of the indirect
+descriptor table (index 0), additional indirect descriptors are
+chained by next field. An indirect descriptor without next field
+(with flags&NEXT off) signals the end of the indirect descriptor
+table, and transfers control back to the main virtqueue. An
+indirect descriptor can not refer to another indirect descriptor
+table (flags&INDIRECT must be off). A single indirect descriptor
+table can include both read-only and write-only descriptors;
+write-only flag (flags&WRITE) in the descriptor that refers to it
+is ignored.
+
+  Available Ring
+
+The available ring refers to what descriptors we are offering the
+device: it refers to the head of a descriptor chain. The “flags”
+field is currently 0 or 1: 1 indicating that we do not need an
+interrupt when the device consumes a descriptor from the
+available ring. Alternatively, the guest can ask the device to
+delay interrupts until an entry with an index specified by the “
+used_event” field is written in the used ring (equivalently,
+until the idx field in the used ring will reach the value
+used_event + 1). The method employed by the device is controlled
+by the VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
+). This interrupt suppression is merely an optimization; it may
+not suppress interrupts entirely.
+
+The “idx” field indicates where we would put the next descriptor
+entry (modulo the ring size). This starts at 0, and increases.
+
+struct vring_avail {
+
+#define VRING_AVAIL_F_NO_INTERRUPT      1
+
+   u16 flags;
+
+   u16 idx;
+
+   u16 ring[qsz]; /* qsz is the Queue Size field read from device
+*/
+
+   u16 used_event;
+
+};
+
+  Used Ring
+
+The used ring is where the device returns buffers once it is done
+with them. The flags field can be used by the device to hint that
+no notification is necessary when the guest adds to the available
+ring. Alternatively, the “avail_event” field can be used by the
+device to hint that no notification is necessary until an entry
+with an index specified by the “avail_event” is written in the
+available ring (equivalently, until the idx field in the
+available ring will reach the value avail_event + 1). The method
+employed by the device is controlled by the guest through the
+VIRTIO_RING_F_EVENT_IDX feature bit (see [cha:Reserved-Feature-Bits]
+). [footnote:
+These fields are kept here because this is the only part of the
+virtqueue written by the device
+].
+
+Each entry in the ring is a pair: the head entry of the
+descriptor chain describing the buffer (this matches an entry
+placed in the available ring by the guest earlier), and the total
+of bytes written into the buffer. The latter is extremely useful
+for guests using untrusted buffers: if you do not know exactly
+how much has been written by the device, you usually have to zero
+the buffer to ensure no data leakage occurs.
+
+/* u32 is used here for ids for padding reasons. */
+
+struct vring_used_elem {
+
+    /* Index of start of used descriptor chain. */
+
+    u32 id;
+
+    /* Total length of the descriptor chain which was used
+(written to) */
+
+    u32 len;
+
+};
+
+
+
+struct vring_used {
+
+#define VRING_USED_F_NO_NOTIFY  1
+
+    u16 flags;
+
+    u16 idx;
+
+    struct vring_used_elem ring[qsz];
+
+    u16 avail_event;
+
+};
+
+  Helpers for Managing Virtqueues
+
+The Linux Kernel Source code contains the definitions above and
+helper routines in a more usable form, in
+include/linux/virtio_ring.h. This was explicitly licensed by IBM
+and Red Hat under the (3-clause) BSD license so that it can be
+freely used by all other projects, and is reproduced (with slight
+variation to remove Linux assumptions) in Appendix A.
+
+  Device Operation
+
+There are two parts to device operation: supplying new buffers to
+the device, and processing used buffers from the device. As an
+example, the virtio network device has two virtqueues: the
+transmit virtqueue and the receive virtqueue. The driver adds
+outgoing (read-only) packets to the transmit virtqueue, and then
+frees them after they are used. Similarly, incoming (write-only)
+buffers are added to the receive virtqueue, and processed after
+they are used.
+
+  Supplying Buffers to The Device
+
+Actual transfer of buffers from the guest OS to the device
+operates as follows:
+
+  Place the buffer(s) into free descriptor(s).
+
+  If there are no free descriptors, the guest may choose to
+    notify the device even if notifications are suppressed (to
+    reduce latency).[footnote:
+The Linux drivers do this only for read-only buffers: for
+write-only buffers, it is assumed that the driver is merely
+trying to keep the receive buffer ring full, and no notification
+of this expected condition is necessary.
+]
+
+  Place the id of the buffer in the next ring entry of the
+  available ring.
+
+  The steps (1) and (2) may be performed repeatedly if batching
+  is possible.
+
+  A memory barrier should be executed to ensure the device sees
+  the updated descriptor table and available ring before the next
+  step.
+
+  The available “idx” field should be increased by the number of
+  entries added to the available ring.
+
+  A memory barrier should be executed to ensure that we update
+  the idx field before checking for notification suppression.
+
+  If notifications are not suppressed, the device should be
+  notified of the new buffers.
+
+Note that the above code does not take precautions against the
+available ring buffer wrapping around: this is not possible since
+the ring buffer is the same size as the descriptor table, so step
+(1) will prevent such a condition.
+
+In addition, the maximum queue size is 32768 (it must be a power
+of 2 which fits in 16 bits), so the 16-bit “idx” value can always
+distinguish between a full and empty buffer.
+
+Here is a description of each stage in more detail.
+
+  Placing Buffers Into The Descriptor Table
+
+A buffer consists of zero or more read-only physically-contiguous
+elements followed by zero or more physically-contiguous
+write-only elements (it must have at least one element). This
+algorithm maps it into the descriptor table:
+
+  for each buffer element, b:
+
+  Get the next free descriptor table entry, d
+
+  Set d.addr to the physical address of the start of b
+
+  Set d.len to the length of b.
+
+  If b is write-only, set d.flags to VRING_DESC_F_WRITE,
+    otherwise 0.
+
+  If there is a buffer element after this:
+
+    Set d.next to the index of the next free descriptor element.
+
+    Set the VRING_DESC_F_NEXT bit in d.flags.
+
+In practice, the d.next fields are usually used to chain free
+descriptors, and a separate count kept to check there are enough
+free descriptors before beginning the mappings.
+
+  Updating The Available Ring
+
+The head of the buffer we mapped is the first d in the algorithm
+above. A naive implementation would do the following:
+
+avail->ring[avail->idx % qsz] = head;
+
+However, in general we can add many descriptors before we update
+the “idx” field (at which point they become visible to the
+device), so we keep a counter of how many we've added:
+
+avail->ring[(avail->idx + added++) % qsz] = head;
+
+  Updating The Index Field
+
+Once the idx field of the virtqueue is updated, the device will
+be able to access the descriptor entries we've created and the
+memory they refer to. This is why a memory barrier is generally
+used before the idx update, to ensure it sees the most up-to-date
+copy.
+
+The idx field always increments, and we let it wrap naturally at
+65536:
+
+avail->idx += added;
+
+  <sub:Notifying-The-Device>Notifying The Device
+
+Device notification occurs by writing the 16-bit virtqueue index
+of this virtqueue to the Queue Notify field of the virtio header
+in the first I/O region of the PCI device. This can be expensive,
+however, so the device can suppress such notifications if it
+doesn't need them. We have to be careful to expose the new idx
+value before checking the suppression flag: it's OK to notify
+gratuitously, but not to omit a required notification. So again,
+we use a memory barrier here before reading the flags or the
+avail_event field.
+
+If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated, and if
+the VRING_USED_F_NOTIFY flag is not set, we go ahead and write to
+the PCI configuration space.
+
+If the VIRTIO_F_RING_EVENT_IDX feature is negotiated, we read the
+avail_event field in the available ring structure. If the
+available index crossed_the avail_event field value since the
+last notification, we go ahead and write to the PCI configuration
+space. The avail_event field wraps naturally at 65536 as well:
+
+(u16)(new_idx - avail_event - 1) < (u16)(new_idx - old_idx)
+
+  <sub:Receiving-Used-Buffers>Receiving Used Buffers From The
+  Device
+
+Once the device has used a buffer (read from or written to it, or
+parts of both, depending on the nature of the virtqueue and the
+device), it sends an interrupt, following an algorithm very
+similar to the algorithm used for the driver to send the device a
+buffer:
+
+  Write the head descriptor number to the next field in the used
+  ring.
+
+  Update the used ring idx.
+
+  Determine whether an interrupt is necessary:
+
+  If the VIRTIO_F_RING_EVENT_IDX feature is not negotiated: check
+    if f the VRING_AVAIL_F_NO_INTERRUPT flag is not set in avail-
+    >flags
+
+  If the VIRTIO_F_RING_EVENT_IDX feature is negotiated: check
+    whether the used index crossed the used_event field value
+    since the last update. The used_event field wraps naturally
+    at 65536 as well:(u16)(new_idx - used_event - 1) < (u16)(new_idx - old_idx)
+
+  If an interrupt is necessary:
+
+  If MSI-X capability is disabled:
+
+    Set the lower bit of the ISR Status field for the device.
+
+    Send the appropriate PCI interrupt for the device.
+
+  If MSI-X capability is enabled:
+
+    Request the appropriate MSI-X interrupt message for the
+      device, Queue Vector field sets the MSI-X Table entry
+      number.
+
+    If Queue Vector field value is NO_VECTOR, no interrupt
+      message is requested for this event.
+
+The guest interrupt handler should:
+
+  If MSI-X capability is disabled: read the ISR Status field,
+  which will reset it to zero. If the lower bit is zero, the
+  interrupt was not for this device. Otherwise, the guest driver
+  should look through the used rings of each virtqueue for the
+  device, to see if any progress has been made by the device
+  which requires servicing.
+
+  If MSI-X capability is enabled: look through the used rings of
+  each virtqueue mapped to the specific MSI-X vector for the
+  device, to see if any progress has been made by the device
+  which requires servicing.
+
+For each ring, guest should then disable interrupts by writing
+VRING_AVAIL_F_NO_INTERRUPT flag in avail structure, if required.
+It can then process used ring entries finally enabling interrupts
+by clearing the VRING_AVAIL_F_NO_INTERRUPT flag or updating the
+EVENT_IDX field in the available structure, Guest should then
+execute a memory barrier, and then recheck the ring empty
+condition. This is necessary to handle the case where, after the
+last check and before enabling interrupts, an interrupt has been
+suppressed by the device:
+
+vring_disable_interrupts(vq);
+
+for (;;) {
+
+    if (vq->last_seen_used != vring->used.idx) {
+
+               vring_enable_interrupts(vq);
+
+               mb();
+
+               if (vq->last_seen_used != vring->used.idx)
+
+                       break;
+
+    }
+
+    struct vring_used_elem *e =
+vring.used->ring[vq->last_seen_used%vsz];
+
+    process_buffer(e);
+
+    vq->last_seen_used++;
+
+}
+
+  Dealing With Configuration Changes
+
+Some virtio PCI devices can change the device configuration
+state, as reflected in the virtio header in the PCI configuration
+space. In this case:
+
+  If MSI-X capability is disabled: an interrupt is delivered and
+  the second highest bit is set in the ISR Status field to
+  indicate that the driver should re-examine the configuration
+  space.Note that a single interrupt can indicate both that one
+  or more virtqueue has been used and that the configuration
+  space has changed: even if the config bit is set, virtqueues
+  must be scanned.
+
+  If MSI-X capability is enabled: an interrupt message is
+  requested. The Configuration Vector field sets the MSI-X Table
+  entry number to use. If Configuration Vector field value is
+  NO_VECTOR, no interrupt message is requested for this event.
+
+Creating New Device Types
+
+Various considerations are necessary when creating a new device
+type:
+
+  How Many Virtqueues?
+
+It is possible that a very simple device will operate entirely
+through its configuration space, but most will need at least one
+virtqueue in which it will place requests. A device with both
+input and output (eg. console and network devices described here)
+need two queues: one which the driver fills with buffers to
+receive input, and one which the driver places buffers to
+transmit output.
+
+  What Configuration Space Layout?
+
+Configuration space is generally used for rarely-changing or
+initialization-time parameters. But it is a limited resource, so
+it might be better to use a virtqueue to update configuration
+information (the network device does this for filtering,
+otherwise the table in the config space could potentially be very
+large).
+
+Note that this space is generally the guest's native endian,
+rather than PCI's little-endian.
+
+  What Device Number?
+
+Currently device numbers are assigned quite freely: a simple
+request mail to the author of this document or the Linux
+virtualization mailing list[footnote:
+
+https://lists.linux-foundation.org/mailman/listinfo/virtualization
+] will be sufficient to secure a unique one.
+
+Meanwhile for experimental drivers, use 65535 and work backwards.
+
+  How many MSI-X vectors?
+
+Using the optional MSI-X capability devices can speed up
+interrupt processing by removing the need to read ISR Status
+register by guest driver (which might be an expensive operation),
+reducing interrupt sharing between devices and queues within the
+device, and handling interrupts from multiple CPUs. However, some
+systems impose a limit (which might be as low as 256) on the
+total number of MSI-X vectors that can be allocated to all
+devices. Devices and/or device drivers should take this into
+account, limiting the number of vectors used unless the device is
+expected to cause a high volume of interrupts. Devices can
+control the number of vectors used by limiting the MSI-X Table
+Size or not presenting MSI-X capability in PCI configuration
+space. Drivers can control this by mapping events to as small
+number of vectors as possible, or disabling MSI-X capability
+altogether.
+
+  Message Framing
+
+The descriptors used for a buffer should not effect the semantics
+of the message, except for the total length of the buffer. For
+example, a network buffer consists of a 10 byte header followed
+by the network packet. Whether this is presented in the ring
+descriptor chain as (say) a 10 byte buffer and a 1514 byte
+buffer, or a single 1524 byte buffer, or even three buffers,
+should have no effect.
+
+In particular, no implementation should use the descriptor
+boundaries to determine the size of any header in a request.[footnote:
+The current qemu device implementations mistakenly insist that
+the first descriptor cover the header in these cases exactly, so
+a cautious driver should arrange it so.
+]
+
+  Device Improvements
+
+Any change to configuration space, or new virtqueues, or
+behavioural changes, should be indicated by negotiation of a new
+feature bit. This establishes clarity[footnote:
+Even if it does mean documenting design or implementation
+mistakes!
+] and avoids future expansion problems.
+
+Clusters of functionality which are always implemented together
+can use a single bit, but if one feature makes sense without the
+others they should not be gratuitously grouped together to
+conserve feature bits. We can always extend the spec when the
+first person needs more than 24 feature bits for their device.
+
+[LaTeX Command: printnomenclature]
+
+Appendix A: virtio_ring.h
+
+#ifndef VIRTIO_RING_H
+
+#define VIRTIO_RING_H
+
+/* An interface for efficient virtio implementation.
+
+ *
+
+ * This header is BSD licensed so anyone can use the definitions
+
+ * to implement compatible drivers/servers.
+
+ *
+
+ * Copyright 2007, 2009, IBM Corporation
+
+ * Copyright 2011, Red Hat, Inc
+
+ * All rights reserved.
+
+ *
+
+ * Redistribution and use in source and binary forms, with or
+without
+
+ * modification, are permitted provided that the following
+conditions
+
+ * are met:
+
+ * 1. Redistributions of source code must retain the above
+copyright
+
+ *    notice, this list of conditions and the following
+disclaimer.
+
+ * 2. Redistributions in binary form must reproduce the above
+copyright
+
+ *    notice, this list of conditions and the following
+disclaimer in the
+
+ *    documentation and/or other materials provided with the
+distribution.
+
+ * 3. Neither the name of IBM nor the names of its contributors
+
+ *    may be used to endorse or promote products derived from
+this software
+
+ *    without specific prior written permission.
+
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+CONTRIBUTORS ``AS IS'' AND
+
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE
+
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE
+
+ * ARE DISCLAIMED.  IN NO EVENT SHALL IBM OR CONTRIBUTORS BE
+LIABLE
+
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL
+
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS
+
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION)
+
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT
+
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+IN ANY WAY
+
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF
+
+ * SUCH DAMAGE.
+
+ */
+
+
+
+/* This marks a buffer as continuing via the next field. */
+
+#define VRING_DESC_F_NEXT       1
+
+/* This marks a buffer as write-only (otherwise read-only). */
+
+#define VRING_DESC_F_WRITE      2
+
+
+
+/* The Host uses this in used->flags to advise the Guest: don't
+kick me
+
+ * when you add a buffer.  It's unreliable, so it's simply an
+
+ * optimization.  Guest will still kick if it's out of buffers.
+*/
+
+#define VRING_USED_F_NO_NOTIFY  1
+
+/* The Guest uses this in avail->flags to advise the Host: don't
+
+ * interrupt me when you consume a buffer.  It's unreliable, so
+it's
+
+ * simply an optimization.  */
+
+#define VRING_AVAIL_F_NO_INTERRUPT      1
+
+
+
+/* Virtio ring descriptors: 16 bytes.
+
+ * These can chain together via "next". */
+
+struct vring_desc {
+
+        /* Address (guest-physical). */
+
+        uint64_t addr;
+
+        /* Length. */
+
+        uint32_t len;
+
+        /* The flags as indicated above. */
+
+        uint16_t flags;
+
+        /* We chain unused descriptors via this, too */
+
+        uint16_t next;
+
+};
+
+
+
+struct vring_avail {
+
+        uint16_t flags;
+
+        uint16_t idx;
+
+        uint16_t ring[];
+
+        uint16_t used_event;
+
+};
+
+
+
+/* u32 is used here for ids for padding reasons. */
+
+struct vring_used_elem {
+
+        /* Index of start of used descriptor chain. */
+
+        uint32_t id;
+
+        /* Total length of the descriptor chain which was written
+to. */
+
+        uint32_t len;
+
+};
+
+
+
+struct vring_used {
+
+        uint16_t flags;
+
+        uint16_t idx;
+
+        struct vring_used_elem ring[];
+
+        uint16_t avail_event;
+
+};
+
+
+
+struct vring {
+
+        unsigned int num;
+
+
+
+        struct vring_desc *desc;
+
+        struct vring_avail *avail;
+
+        struct vring_used *used;
+
+};
+
+
+
+/* The standard layout for the ring is a continuous chunk of
+memory which
+
+ * looks like this.  We assume num is a power of 2.
+
+ *
+
+ * struct vring {
+
+ *      // The actual descriptors (16 bytes each)
+
+ *      struct vring_desc desc[num];
+
+ *
+
+ *      // A ring of available descriptor heads with free-running
+index.
+
+ *      __u16 avail_flags;
+
+ *      __u16 avail_idx;
+
+ *      __u16 available[num];
+
+ *
+
+ *      // Padding to the next align boundary.
+
+ *      char pad[];
+
+ *
+
+ *      // A ring of used descriptor heads with free-running
+index.
+
+ *      __u16 used_flags;
+
+ *      __u16 EVENT_IDX;
+
+ *      struct vring_used_elem used[num];
+
+ * };
+
+ * Note: for virtio PCI, align is 4096.
+
+ */
+
+static inline void vring_init(struct vring *vr, unsigned int num,
+void *p,
+
+                              unsigned long align)
+
+{
+
+        vr->num = num;
+
+        vr->desc = p;
+
+        vr->avail = p + num*sizeof(struct vring_desc);
+
+        vr->used = (void *)(((unsigned long)&vr->avail->ring[num]
+
+                              + align-1)
+
+                            & ~(align - 1));
+
+}
+
+
+
+static inline unsigned vring_size(unsigned int num, unsigned long
+align)
+
+{
+
+        return ((sizeof(struct vring_desc)*num +
+sizeof(uint16_t)*(2+num)
+
+                 + align - 1) & ~(align - 1))
+
+                + sizeof(uint16_t)*3 + sizeof(struct
+vring_used_elem)*num;
+
+}
+
+
+
+static inline int vring_need_event(uint16_t event_idx, uint16_t
+new_idx, uint16_t old_idx)
+
+{
+
+         return (uint16_t)(new_idx - event_idx - 1) <
+(uint16_t)(new_idx - old_idx);
+
+}
+
+#endif /* VIRTIO_RING_H */
+
+<cha:Reserved-Feature-Bits>Appendix B: Reserved Feature Bits
+
+Currently there are five device-independent feature bits defined:
+
+  VIRTIO_F_NOTIFY_ON_EMPTY (24) Negotiating this feature
+  indicates that the driver wants an interrupt if the device runs
+  out of available descriptors on a virtqueue, even though
+  interrupts are suppressed using the VRING_AVAIL_F_NO_INTERRUPT
+  flag or the used_event field. An example of this is the
+  networking driver: it doesn't need to know every time a packet
+  is transmitted, but it does need to free the transmitted
+  packets a finite time after they are transmitted. It can avoid
+  using a timer if the device interrupts it when all the packets
+  are transmitted.
+
+  VIRTIO_F_RING_INDIRECT_DESC (28) Negotiating this feature
+  indicates that the driver can use descriptors with the
+  VRING_DESC_F_INDIRECT flag set, as described in [sub:Indirect-Descriptors]
+  .
+
+  VIRTIO_F_RING_EVENT_IDX(29) This feature enables the used_event
+  and the avail_event fields. If set, it indicates that the
+  device should ignore the flags field in the available ring
+  structure. Instead, the used_event field in this structure is
+  used by guest to suppress device interrupts. Further, the
+  driver should ignore the flags field in the used ring
+  structure. Instead, the avail_event field in this structure is
+  used by the device to suppress notifications. If unset, the
+  driver should ignore the used_event field; the device should
+  ignore the avail_event field; the flags field is used
+
+  VIRTIO_F_BAD_FEATURE(30) This feature should never be
+  negotiated by the guest; doing so is an indication that the
+  guest is faulty[footnote:
+An experimental virtio PCI driver contained in Linux version
+2.6.25 had this problem, and this feature bit can be used to
+detect it.
+]
+
+  VIRTIO_F_FEATURES_HIGH(31) This feature indicates that the
+  device supports feature bits 32:63. If unset, feature bits
+  32:63 are unset.
+
+Appendix C: Network Device
+
+The virtio network device is a virtual ethernet card, and is the
+most complex of the devices supported so far by virtio. It has
+enhanced rapidly and demonstrates clearly how support for new
+features should be added to an existing device. Empty buffers are
+placed in one virtqueue for receiving packets, and outgoing
+packets are enqueued into another for transmission in that order.
+A third command queue is used to control advanced filtering
+features.
+
+  Configuration
+
+  Subsystem Device ID 1
+
+  Virtqueues 0:receiveq. 1:transmitq. 2:controlq[footnote:
+Only if VIRTIO_NET_F_CTRL_VQ set
+]
+
+  Feature bits
+
+  VIRTIO_NET_F_CSUM (0) Device handles packets with partial
+    checksum
+
+  VIRTIO_NET_F_GUEST_CSUM (1) Guest handles packets with partial
+    checksum
+
+  VIRTIO_NET_F_MAC (5) Device has given MAC address.
+
+  VIRTIO_NET_F_GSO (6) (Deprecated) device handles packets with
+    any GSO type.[footnote:
+It was supposed to indicate segmentation offload support, but
+upon further investigation it became clear that multiple bits
+were required.
+]
+
+  VIRTIO_NET_F_GUEST_TSO4 (7) Guest can receive TSOv4.
+
+  VIRTIO_NET_F_GUEST_TSO6 (8) Guest can receive TSOv6.
+
+  VIRTIO_NET_F_GUEST_ECN (9) Guest can receive TSO with ECN.
+
+  VIRTIO_NET_F_GUEST_UFO (10) Guest can receive UFO.
+
+  VIRTIO_NET_F_HOST_TSO4 (11) Device can receive TSOv4.
+
+  VIRTIO_NET_F_HOST_TSO6 (12) Device can receive TSOv6.
+
+  VIRTIO_NET_F_HOST_ECN (13) Device can receive TSO with ECN.
+
+  VIRTIO_NET_F_HOST_UFO (14) Device can receive UFO.
+
+  VIRTIO_NET_F_MRG_RXBUF (15) Guest can merge receive buffers.
+
+  VIRTIO_NET_F_STATUS (16) Configuration status field is
+    available.
+
+  VIRTIO_NET_F_CTRL_VQ (17) Control channel is available.
+
+  VIRTIO_NET_F_CTRL_RX (18) Control channel RX mode support.
+
+  VIRTIO_NET_F_CTRL_VLAN (19) Control channel VLAN filtering.
+
+  Device configuration layout Two configuration fields are
+  currently defined. The mac address field always exists (though
+  is only valid if VIRTIO_NET_F_MAC is set), and the status field
+  only exists if VIRTIO_NET_F_STATUS is set. Only one bit is
+  currently defined for the status field: VIRTIO_NET_S_LINK_UP. #define VIRTIO_NET_S_LINK_UP   1
+
+
+
+struct virtio_net_config {
+
+    u8 mac[6];
+
+    u16 status;
+
+};
+
+  Device Initialization
+
+  The initialization routine should identify the receive and
+  transmission virtqueues.
+
+  If the VIRTIO_NET_F_MAC feature bit is set, the configuration
+  space “mac” entry indicates the “physical” address of the the
+  network card, otherwise a private MAC address should be
+  assigned. All guests are expected to negotiate this feature if
+  it is set.
+
+  If the VIRTIO_NET_F_CTRL_VQ feature bit is negotiated, identify
+  the control virtqueue.
+
+  If the VIRTIO_NET_F_STATUS feature bit is negotiated, the link
+  status can be read from the bottom bit of the “status” config
+  field. Otherwise, the link should be assumed active.
+
+  The receive virtqueue should be filled with receive buffers.
+  This is described in detail below in “Setting Up Receive
+  Buffers”.
+
+  A driver can indicate that it will generate checksumless
+  packets by negotating the VIRTIO_NET_F_CSUM feature. This “
+  checksum offload” is a common feature on modern network cards.
+
+  If that feature is negotiated, a driver can use TCP or UDP
+  segmentation offload by negotiating the VIRTIO_NET_F_HOST_TSO4
+  (IPv4 TCP), VIRTIO_NET_F_HOST_TSO6 (IPv6 TCP) and
+  VIRTIO_NET_F_HOST_UFO (UDP fragmentation) features. It should
+  not send TCP packets requiring segmentation offload which have
+  the Explicit Congestion Notification bit set, unless the
+  VIRTIO_NET_F_HOST_ECN feature is negotiated.[footnote:
+This is a common restriction in real, older network cards.
+]
+
+  The converse features are also available: a driver can save the
+  virtual device some work by negotiating these features.[footnote:
+For example, a network packet transported between two guests on
+the same system may not require checksumming at all, nor
+segmentation, if both guests are amenable.
+] The VIRTIO_NET_F_GUEST_CSUM feature indicates that partially
+  checksummed packets can be received, and if it can do that then
+  the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6,
+  VIRTIO_NET_F_GUEST_UFO and VIRTIO_NET_F_GUEST_ECN are the input
+  equivalents of the features described above. See “Receiving
+  Packets” below.
+
+  Device Operation
+
+Packets are transmitted by placing them in the transmitq, and
+buffers for incoming packets are placed in the receiveq. In each
+case, the packet itself is preceeded by a header:
+
+struct virtio_net_hdr {
+
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM    1
+
+       u8 flags;
+
+#define VIRTIO_NET_HDR_GSO_NONE        0
+
+#define VIRTIO_NET_HDR_GSO_TCPV4       1
+
+#define VIRTIO_NET_HDR_GSO_UDP          3
+
+#define VIRTIO_NET_HDR_GSO_TCPV6       4
+
+#define VIRTIO_NET_HDR_GSO_ECN      0x80
+
+       u8 gso_type;
+
+       u16 hdr_len;
+
+       u16 gso_size;
+
+       u16 csum_start;
+
+       u16 csum_offset;
+
+/* Only if VIRTIO_NET_F_MRG_RXBUF: */
+
+       u16 num_buffers
+
+};
+
+The controlq is used to control device features such as
+filtering.
+
+  Packet Transmission
+
+Transmitting a single packet is simple, but varies depending on
+the different features the driver negotiated.
+
+  If the driver negotiated VIRTIO_NET_F_CSUM, and the packet has
+  not been fully checksummed, then the virtio_net_hdr's fields
+  are set as follows. Otherwise, the packet must be fully
+  checksummed, and flags is zero.
+
+  flags has the VIRTIO_NET_HDR_F_NEEDS_CSUM set,
+
+  <ite:csum_start-is-set>csum_start is set to the offset within
+    the packet to begin checksumming, and
+
+  csum_offset indicates how many bytes after the csum_start the
+    new (16 bit ones' complement) checksum should be placed.[footnote:
+For example, consider a partially checksummed TCP (IPv4) packet.
+It will have a 14 byte ethernet header and 20 byte IP header
+followed by the TCP header (with the TCP checksum field 16 bytes
+into that header). csum_start will be 14+20 = 34 (the TCP
+checksum includes the header), and csum_offset will be 16. The
+value in the TCP checksum field will be the sum of the TCP pseudo
+header, so that replacing it by the ones' complement checksum of
+the TCP header and body will give the correct result.
+]
+
+  <enu:If-the-driver>If the driver negotiated
+  VIRTIO_NET_F_HOST_TSO4, TSO6 or UFO, and the packet requires
+  TCP segmentation or UDP fragmentation, then the “gso_type”
+  field is set to VIRTIO_NET_HDR_GSO_TCPV4, TCPV6 or UDP.
+  (Otherwise, it is set to VIRTIO_NET_HDR_GSO_NONE). In this
+  case, packets larger than 1514 bytes can be transmitted: the
+  metadata indicates how to replicate the packet header to cut it
+  into smaller packets. The other gso fields are set:
+
+  hdr_len is a hint to the device as to how much of the header
+    needs to be kept to copy into each packet, usually set to the
+    length of the headers, including the transport header.[footnote:
+Due to various bugs in implementations, this field is not useful
+as a guarantee of the transport header size.
+]
+
+  gso_size is the size of the packet beyond that header (ie.
+    MSS).
+
+  If the driver negotiated the VIRTIO_NET_F_HOST_ECN feature, the
+    VIRTIO_NET_HDR_GSO_ECN bit may be set in “gso_type” as well,
+    indicating that the TCP packet has the ECN bit set.[footnote:
+This case is not handled by some older hardware, so is called out
+specifically in the protocol.
+]
+
+  If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
+  the num_buffers field is set to zero.
+
+  The header and packet are added as one output buffer to the
+  transmitq, and the device is notified of the new entry (see [sub:Notifying-The-Device]
+  ).[footnote:
+Note that the header will be two bytes longer for the
+VIRTIO_NET_F_MRG_RXBUF case.
+]
+
+  Packet Transmission Interrupt
+
+Often a driver will suppress transmission interrupts using the
+VRING_AVAIL_F_NO_INTERRUPT flag (see [sub:Receiving-Used-Buffers]
+) and check for used packets in the transmit path of following
+packets. However, it will still receive interrupts if the
+VIRTIO_F_NOTIFY_ON_EMPTY feature is negotiated, indicating that
+the transmission queue is completely emptied.
+
+The normal behavior in this interrupt handler is to retrieve and
+new descriptors from the used ring and free the corresponding
+headers and packets.
+
+  Setting Up Receive Buffers
+
+It is generally a good idea to keep the receive virtqueue as
+fully populated as possible: if it runs out, network performance
+will suffer.
+
+If the VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6 or
+VIRTIO_NET_F_GUEST_UFO features are used, the Guest will need to
+accept packets of up to 65550 bytes long (the maximum size of a
+TCP or UDP packet, plus the 14 byte ethernet header), otherwise
+1514 bytes. So unless VIRTIO_NET_F_MRG_RXBUF is negotiated, every
+buffer in the receive queue needs to be at least this length [footnote:
+Obviously each one can be split across multiple descriptor
+elements.
+].
+
+If VIRTIO_NET_F_MRG_RXBUF is negotiated, each buffer must be at
+least the size of the struct virtio_net_hdr.
+
+  Packet Receive Interrupt
+
+When a packet is copied into a buffer in the receiveq, the
+optimal path is to disable further interrupts for the receiveq
+(see [sub:Receiving-Used-Buffers]) and process packets until no
+more are found, then re-enable them.
+
+Processing packet involves:
+
+  If the driver negotiated the VIRTIO_NET_F_MRG_RXBUF feature,
+  then the “num_buffers” field indicates how many descriptors
+  this packet is spread over (including this one). This allows
+  receipt of large packets without having to allocate large
+  buffers. In this case, there will be at least “num_buffers” in
+  the used ring, and they should be chained together to form a
+  single packet. The other buffers will not begin with a struct
+  virtio_net_hdr.
+
+  If the VIRTIO_NET_F_MRG_RXBUF feature was not negotiated, or
+  the “num_buffers” field is one, then the entire packet will be
+  contained within this buffer, immediately following the struct
+  virtio_net_hdr.
+
+  If the VIRTIO_NET_F_GUEST_CSUM feature was negotiated, the
+  VIRTIO_NET_HDR_F_NEEDS_CSUM bit in the “flags” field may be
+  set: if so, the checksum on the packet is incomplete and the “
+  csum_start” and “csum_offset” fields indicate how to calculate
+  it (see [ite:csum_start-is-set]).
+
+  If the VIRTIO_NET_F_GUEST_TSO4, TSO6 or UFO options were
+  negotiated, then the “gso_type” may be something other than
+  VIRTIO_NET_HDR_GSO_NONE, and the “gso_size” field indicates the
+  desired MSS (see [enu:If-the-driver]).Control Virtqueue
+
+The driver uses the control virtqueue (if VIRTIO_NET_F_VTRL_VQ is
+negotiated) to send commands to manipulate various features of
+the device which would not easily map into the configuration
+space.
+
+All commands are of the following form:
+
+struct virtio_net_ctrl {
+
+       u8 class;
+
+       u8 command;
+
+       u8 command-specific-data[];
+
+       u8 ack;
+
+};
+
+
+
+/* ack values */
+
+#define VIRTIO_NET_OK     0
+
+#define VIRTIO_NET_ERR    1
+
+The class, command and command-specific-data are set by the
+driver, and the device sets the ack byte. There is little it can
+do except issue a diagnostic if the ack byte is not
+VIRTIO_NET_OK.
+
+  Packet Receive Filtering
+
+If the VIRTIO_NET_F_CTRL_RX feature is negotiated, the driver can
+send control commands for promiscuous mode, multicast receiving,
+and filtering of MAC addresses.
+
+Note that in general, these commands are best-effort: unwanted
+packets may still arrive.
+
+  Setting Promiscuous Mode
+
+#define VIRTIO_NET_CTRL_RX    0
+
+ #define VIRTIO_NET_CTRL_RX_PROMISC      0
+
+ #define VIRTIO_NET_CTRL_RX_ALLMULTI     1
+
+The class VIRTIO_NET_CTRL_RX has two commands:
+VIRTIO_NET_CTRL_RX_PROMISC turns promiscuous mode on and off, and
+VIRTIO_NET_CTRL_RX_ALLMULTI turns all-multicast receive on and
+off. The command-specific-data is one byte containing 0 (off) or
+1 (on).
+
+  Setting MAC Address Filtering
+
+struct virtio_net_ctrl_mac {
+
+       u32 entries;
+
+       u8 macs[entries][ETH_ALEN];
+
+};
+
+
+
+#define VIRTIO_NET_CTRL_MAC    1
+
+ #define VIRTIO_NET_CTRL_MAC_TABLE_SET        0
+
+The device can filter incoming packets by any number of
+destination MAC addresses.[footnote:
+Since there are no guarentees, it can use a hash filter
+orsilently switch to allmulti or promiscuous mode if it is given
+too many addresses.
+] This table is set using the class VIRTIO_NET_CTRL_MAC and the
+command VIRTIO_NET_CTRL_MAC_TABLE_SET. The command-specific-data
+is two variable length tables of 6-byte MAC addresses. The first
+table contains unicast addresses, and the second contains
+multicast addresses.
+
+  VLAN Filtering
+
+If the driver negotiates the VIRTION_NET_F_CTRL_VLAN feature, it
+can control a VLAN filter table in the device.
+
+#define VIRTIO_NET_CTRL_VLAN       2
+
+ #define VIRTIO_NET_CTRL_VLAN_ADD             0
+
+ #define VIRTIO_NET_CTRL_VLAN_DEL             1
+
+Both the VIRTIO_NET_CTRL_VLAN_ADD and VIRTIO_NET_CTRL_VLAN_DEL
+command take a 16-bit VLAN id as the command-specific-data.
+
+Appendix D: Block Device
+
+The virtio block device is a simple virtual block device (ie.
+disk). Read and write requests (and other exotic requests) are
+placed in the queue, and serviced (probably out of order) by the
+device except where noted.
+
+  Configuration
+
+  Subsystem Device ID 2
+
+  Virtqueues 0:requestq.
+
+  Feature bits
+
+  VIRTIO_BLK_F_BARRIER (0) Host supports request barriers.
+
+  VIRTIO_BLK_F_SIZE_MAX (1) Maximum size of any single segment is
+    in “size_max”.
+
+  VIRTIO_BLK_F_SEG_MAX (2) Maximum number of segments in a
+    request is in “seg_max”.
+
+  VIRTIO_BLK_F_GEOMETRY (4) Disk-style geometry specified in “
+    geometry”.
+
+  VIRTIO_BLK_F_RO (5) Device is read-only.
+
+  VIRTIO_BLK_F_BLK_SIZE (6) Block size of disk is in “blk_size”.
+
+  VIRTIO_BLK_F_SCSI (7) Device supports scsi packet commands.
+
+  VIRTIO_BLK_F_FLUSH (9) Cache flush command support.
+
+
+
+  Device configuration layout The capacity of the device
+  (expressed in 512-byte sectors) is always present. The
+  availability of the others all depend on various feature bits
+  as indicated above. struct virtio_blk_config {
+
+       u64 capacity;
+
+       u32 size_max;
+
+       u32 seg_max;
+
+       struct virtio_blk_geometry {
+
+               u16 cylinders;
+
+               u8 heads;
+
+               u8 sectors;
+
+       } geometry;
+
+       u32 blk_size;
+
+
+
+};
+
+  Device Initialization
+
+  The device size should be read from the “capacity”
+  configuration field. No requests should be submitted which goes
+  beyond this limit.
+
+  If the VIRTIO_BLK_F_BLK_SIZE feature is negotiated, the
+  blk_size field can be read to determine the optimal sector size
+  for the driver to use. This does not effect the units used in
+  the protocol (always 512 bytes), but awareness of the correct
+  value can effect performance.
+
+  If the VIRTIO_BLK_F_RO feature is set by the device, any write
+  requests will fail.
+
+
+
+  Device Operation
+
+The driver queues requests to the virtqueue, and they are used by
+the device (not necessarily in order). Each request is of form:
+
+struct virtio_blk_req {
+
+
+
+       u32 type;
+
+       u32 ioprio;
+
+       u64 sector;
+
+       char data[][512];
+
+       u8 status;
+
+};
+
+If the device has VIRTIO_BLK_F_SCSI feature, it can also support
+scsi packet command requests, each of these requests is of form:struct virtio_scsi_pc_req {
+
+       u32 type;
+
+       u32 ioprio;
+
+       u64 sector;
+
+    char cmd[];
+
+       char data[][512];
+
+#define SCSI_SENSE_BUFFERSIZE   96
+
+    u8 sense[SCSI_SENSE_BUFFERSIZE];
+
+    u32 errors;
+
+    u32 data_len;
+
+    u32 sense_len;
+
+    u32 residual;
+
+       u8 status;
+
+};
+
+The type of the request is either a read (VIRTIO_BLK_T_IN), a
+write (VIRTIO_BLK_T_OUT), a scsi packet command
+(VIRTIO_BLK_T_SCSI_CMD or VIRTIO_BLK_T_SCSI_CMD_OUT[footnote:
+the SCSI_CMD and SCSI_CMD_OUT types are equivalent, the device
+does not distinguish between them
+]) or a flush (VIRTIO_BLK_T_FLUSH or VIRTIO_BLK_T_FLUSH_OUT[footnote:
+the FLUSH and FLUSH_OUT types are equivalent, the device does not
+distinguish between them
+]). If the device has VIRTIO_BLK_F_BARRIER feature the high bit
+(VIRTIO_BLK_T_BARRIER) indicates that this request acts as a
+barrier and that all preceeding requests must be complete before
+this one, and all following requests must not be started until
+this is complete. Note that a barrier does not flush caches in
+the underlying backend device in host, and thus does not serve as
+data consistency guarantee. Driver must use FLUSH request to
+flush the host cache.
+
+#define VIRTIO_BLK_T_IN           0
+
+#define VIRTIO_BLK_T_OUT          1
+
+#define VIRTIO_BLK_T_SCSI_CMD     2
+
+#define VIRTIO_BLK_T_SCSI_CMD_OUT 3
+
+#define VIRTIO_BLK_T_FLUSH        4
+
+#define VIRTIO_BLK_T_FLUSH_OUT    5
+
+#define VIRTIO_BLK_T_BARRIER    0x80000000
+
+The ioprio field is a hint about the relative priorities of
+requests to the device: higher numbers indicate more important
+requests.
+
+The sector number indicates the offset (multiplied by 512) where
+the read or write is to occur. This field is unused and set to 0
+for scsi packet commands and for flush commands.
+
+The cmd field is only present for scsi packet command requests,
+and indicates the command to perform. This field must reside in a
+single, separate read-only buffer; command length can be derived
+from the length of this buffer.
+
+Note that these first three (four for scsi packet commands)
+fields are always read-only: the data field is either read-only
+or write-only, depending on the request. The size of the read or
+write can be derived from the total size of the request buffers.
+
+The sense field is only present for scsi packet command requests,
+and indicates the buffer for scsi sense data.
+
+The data_len field is only present for scsi packet command
+requests, this field is deprecated, and should be ignored by the
+driver. Historically, devices copied data length there.
+
+The sense_len field is only present for scsi packet command
+requests and indicates the number of bytes actually written to
+the sense buffer.
+
+The residual field is only present for scsi packet command
+requests and indicates the residual size, calculated as data
+length - number of bytes actually transferred.
+
+The final status byte is written by the device: either
+VIRTIO_BLK_S_OK for success, VIRTIO_BLK_S_IOERR for host or guest
+error or VIRTIO_BLK_S_UNSUPP for a request unsupported by host:#define VIRTIO_BLK_S_OK        0
+
+#define VIRTIO_BLK_S_IOERR     1
+
+#define VIRTIO_BLK_S_UNSUPP    2
+
+Historically, devices assumed that the fields type, ioprio and
+sector reside in a single, separate read-only buffer; the fields
+errors, data_len, sense_len and residual reside in a single,
+separate write-only buffer; the sense field in a separate
+write-only buffer of size 96 bytes, by itself; the fields errors,
+data_len, sense_len and residual in a single write-only buffer;
+and the status field is a separate read-only buffer of size 1
+byte, by itself.
+
+Appendix E: Console Device
+
+The virtio console device is a simple device for data input and
+output. A device may have one or more ports. Each port has a pair
+of input and output virtqueues. Moreover, a device has a pair of
+control IO virtqueues. The control virtqueues are used to
+communicate information between the device and the driver about
+ports being opened and closed on either side of the connection,
+indication from the host about whether a particular port is a
+console port, adding new ports, port hot-plug/unplug, etc., and
+indication from the guest about whether a port or a device was
+successfully added, port open/close, etc.. For data IO, one or
+more empty buffers are placed in the receive queue for incoming
+data and outgoing characters are placed in the transmit queue.
+
+  Configuration
+
+  Subsystem Device ID 3
+
+  Virtqueues 0:receiveq(port0). 1:transmitq(port0), 2:control
+  receiveq[footnote:
+Ports 2 onwards only if VIRTIO_CONSOLE_F_MULTIPORT is set
+], 3:control transmitq, 4:receiveq(port1), 5:transmitq(port1),
+  ...
+
+  Feature bits
+
+  VIRTIO_CONSOLE_F_SIZE (0) Configuration cols and rows fields
+    are valid.
+
+  VIRTIO_CONSOLE_F_MULTIPORT(1) Device has support for multiple
+    ports; configuration fields nr_ports and max_nr_ports are
+    valid and control virtqueues will be used.
+
+  Device configuration layout The size of the console is supplied
+  in the configuration space if the VIRTIO_CONSOLE_F_SIZE feature
+  is set. Furthermore, if the VIRTIO_CONSOLE_F_MULTIPORT feature
+  is set, the maximum number of ports supported by the device can
+  be fetched.struct virtio_console_config {
+
+       u16 cols;
+
+       u16 rows;
+
+
+
+       u32 max_nr_ports;
+
+};
+
+  Device Initialization
+
+  If the VIRTIO_CONSOLE_F_SIZE feature is negotiated, the driver
+  can read the console dimensions from the configuration fields.
+
+  If the VIRTIO_CONSOLE_F_MULTIPORT feature is negotiated, the
+  driver can spawn multiple ports, not all of which may be
+  attached to a console. Some could be generic ports. In this
+  case, the control virtqueues are enabled and according to the
+  max_nr_ports configuration-space value, the appropriate number
+  of virtqueues are created. A control message indicating the
+  driver is ready is sent to the host. The host can then send
+  control messages for adding new ports to the device. After
+  creating and initializing each port, a
+  VIRTIO_CONSOLE_PORT_READY control message is sent to the host
+  for that port so the host can let us know of any additional
+  configuration options set for that port.
+
+  The receiveq for each port is populated with one or more
+  receive buffers.
+
+  Device Operation
+
+  For output, a buffer containing the characters is placed in the
+  port's transmitq.[footnote:
+Because this is high importance and low bandwidth, the current
+Linux implementation polls for the buffer to be used, rather than
+waiting for an interrupt, simplifying the implementation
+significantly. However, for generic serial ports with the
+O_NONBLOCK flag set, the polling limitation is relaxed and the
+consumed buffers are freed upon the next write or poll call or
+when a port is closed or hot-unplugged.
+]
+
+  When a buffer is used in the receiveq (signalled by an
+  interrupt), the contents is the input to the port associated
+  with the virtqueue for which the notification was received.
+
+  If the driver negotiated the VIRTIO_CONSOLE_F_SIZE feature, a
+  configuration change interrupt may occur. The updated size can
+  be read from the configuration fields.
+
+  If the driver negotiated the VIRTIO_CONSOLE_F_MULTIPORT
+  feature, active ports are announced by the host using the
+  VIRTIO_CONSOLE_PORT_ADD control message. The same message is
+  used for port hot-plug as well.
+
+  If the host specified a port `name', a sysfs attribute is
+  created with the name filled in, so that udev rules can be
+  written that can create a symlink from the port's name to the
+  char device for port discovery by applications in the guest.
+
+  Changes to ports' state are effected by control messages.
+  Appropriate action is taken on the port indicated in the
+  control message. The layout of the structure of the control
+  buffer and the events associated are:struct virtio_console_control {
+
+       uint32_t id;    /* Port number */
+
+       uint16_t event; /* The kind of control event */
+
+       uint16_t value; /* Extra information for the event */
+
+};
+
+
+
+/* Some events for the internal messages (control packets) */
+
+
+
+#define VIRTIO_CONSOLE_DEVICE_READY     0
+
+#define VIRTIO_CONSOLE_PORT_ADD         1
+
+#define VIRTIO_CONSOLE_PORT_REMOVE      2
+
+#define VIRTIO_CONSOLE_PORT_READY       3
+
+#define VIRTIO_CONSOLE_CONSOLE_PORT     4
+
+#define VIRTIO_CONSOLE_RESIZE           5
+
+#define VIRTIO_CONSOLE_PORT_OPEN        6
+
+#define VIRTIO_CONSOLE_PORT_NAME        7
+
+Appendix F: Entropy Device
+
+The virtio entropy device supplies high-quality randomness for
+guest use.
+
+  Configuration
+
+  Subsystem Device ID 4
+
+  Virtqueues 0:requestq.
+
+  Feature bits None currently defined
+
+  Device configuration layout None currently defined.
+
+  Device Initialization
+
+  The virtqueue is initialized
+
+  Device Operation
+
+When the driver requires random bytes, it places the descriptor
+of one or more buffers in the queue. It will be completely filled
+by random data by the device.
+
+Appendix G: Memory Balloon Device
+
+The virtio memory balloon device is a primitive device for
+managing guest memory: the device asks for a certain amount of
+memory, and the guest supplies it (or withdraws it, if the device
+has more than it asks for). This allows the guest to adapt to
+changes in allowance of underlying physical memory. If the
+feature is negotiated, the device can also be used to communicate
+guest memory statistics to the host.
+
+  Configuration
+
+  Subsystem Device ID 5
+
+  Virtqueues 0:inflateq. 1:deflateq. 2:statsq.[footnote:
+Only if VIRTIO_BALLON_F_STATS_VQ set
+]
+
+  Feature bits
+
+  VIRTIO_BALLOON_F_MUST_TELL_HOST (0) Host must be told before
+    pages from the balloon are used.
+
+  VIRTIO_BALLOON_F_STATS_VQ (1) A virtqueue for reporting guest
+    memory statistics is present.
+
+  Device configuration layout Both fields of this configuration
+  are always available. Note that they are little endian, despite
+  convention that device fields are guest endian:struct virtio_balloon_config {
+
+       u32 num_pages;
+
+       u32 actual;
+
+};
+
+  Device Initialization
+
+  The inflate and deflate virtqueues are identified.
+
+  If the VIRTIO_BALLOON_F_STATS_VQ feature bit is negotiated:
+
+  Identify the stats virtqueue.
+
+  Add one empty buffer to the stats virtqueue and notify the
+    host.
+
+Device operation begins immediately.
+
+  Device Operation
+
+  Memory Ballooning The device is driven by the receipt of a
+  configuration change interrupt.
+
+  The “num_pages” configuration field is examined. If this is
+  greater than the “actual” number of pages, memory must be given
+  to the balloon. If it is less than the “actual” number of
+  pages, memory may be taken back from the balloon for general
+  use.
+
+  To supply memory to the balloon (aka. inflate):
+
+  The driver constructs an array of addresses of unused memory
+    pages. These addresses are divided by 4096[footnote:
+This is historical, and independent of the guest page size
+] and the descriptor describing the resulting 32-bit array is
+    added to the inflateq.
+
+  To remove memory from the balloon (aka. deflate):
+
+  The driver constructs an array of addresses of memory pages it
+    has previously given to the balloon, as described above. This
+    descriptor is added to the deflateq.
+
+  If the VIRTIO_BALLOON_F_MUST_TELL_HOST feature is set, the
+    guest may not use these requested pages until that descriptor
+    in the deflateq has been used by the device.
+
+  Otherwise, the guest may begin to re-use pages previously given
+    to the balloon before the device has acknowledged their
+    withdrawl. [footnote:
+In this case, deflation advice is merely a courtesy
+]
+
+  In either case, once the device has completed the inflation or
+  deflation, the “actual” field of the configuration should be
+  updated to reflect the new number of pages in the balloon.[footnote:
+As updates to configuration space are not atomic, this field
+isn't particularly reliable, but can be used to diagnose buggy
+guests.
+]
+
+  Memory Statistics
+
+The stats virtqueue is atypical because communication is driven
+by the device (not the driver). The channel becomes active at
+driver initialization time when the driver adds an empty buffer
+and notifies the device. A request for memory statistics proceeds
+as follows:
+
+  The device pushes the buffer onto the used ring and sends an
+  interrupt.
+
+  The driver pops the used buffer and discards it.
+
+  The driver collects memory statistics and writes them into a
+  new buffer.
+
+  The driver adds the buffer to the virtqueue and notifies the
+  device.
+
+  The device pops the buffer (retaining it to initiate a
+  subsequent request) and consumes the statistics.
+
+  Memory Statistics Format Each statistic consists of a 16 bit
+  tag and a 64 bit value. Both quantities are represented in the
+  native endian of the guest. All statistics are optional and the
+  driver may choose which ones to supply. To guarantee backwards
+  compatibility, unsupported statistics should be omitted.
+
+  struct virtio_balloon_stat {
+
+#define VIRTIO_BALLOON_S_SWAP_IN  0
+
+#define VIRTIO_BALLOON_S_SWAP_OUT 1
+
+#define VIRTIO_BALLOON_S_MAJFLT   2
+
+#define VIRTIO_BALLOON_S_MINFLT   3
+
+#define VIRTIO_BALLOON_S_MEMFREE  4
+
+#define VIRTIO_BALLOON_S_MEMTOT   5
+
+       u16 tag;
+
+       u64 val;
+
+} __attribute__((packed));
+
+  Tags
+
+  VIRTIO_BALLOON_S_SWAP_IN The amount of memory that has been
+  swapped in (in bytes).
+
+  VIRTIO_BALLOON_S_SWAP_OUT The amount of memory that has been
+  swapped out to disk (in bytes).
+
+  VIRTIO_BALLOON_S_MAJFLT The number of major page faults that
+  have occurred.
+
+  VIRTIO_BALLOON_S_MINFLT The number of minor page faults that
+  have occurred.
+
+  VIRTIO_BALLOON_S_MEMFREE The amount of memory not being used
+  for any purpose (in bytes).
+
+  VIRTIO_BALLOON_S_MEMTOT The total amount of memory available
+  (in bytes).
+
index 1e55e1e..069ee3b 100644 (file)
@@ -4604,7 +4604,7 @@ F:        arch/arm/mach-omap2/clockdomain2xxx_3xxx.c
 F:     arch/arm/mach-omap2/clockdomain44xx.c
 
 OMAP AUDIO SUPPORT
-M:     Jarkko Nikula <jhnikula@gmail.com>
+M:     Jarkko Nikula <jarkko.nikula@bitmer.com>
 L:     alsa-devel@alsa-project.org (subscribers-only)
 L:     linux-omap@vger.kernel.org
 S:     Maintained
@@ -4971,7 +4971,7 @@ M:        Paul Mackerras <paulus@samba.org>
 M:     Ingo Molnar <mingo@elte.hu>
 M:     Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
 S:     Supported
-F:     kernel/perf_event*.c
+F:     kernel/events/*
 F:     include/linux/perf_event.h
 F:     arch/*/kernel/perf_event*.c
 F:     arch/*/kernel/*/perf_event*.c
index b4ca4e1..788511f 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,8 @@
 VERSION = 3
 PATCHLEVEL = 1
 SUBLEVEL = 0
-EXTRAVERSION = -rc1
-NAME = Sneaky Weasel
+EXTRAVERSION = -rc3
+NAME = "Divemaster Edition"
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
index 1248547..3ff7785 100644 (file)
@@ -162,7 +162,6 @@ config IA64_GENERIC
        select ACPI_NUMA
        select SWIOTLB
        select PCI_MSI
-       select DMAR
        help
          This selects the system type of your hardware.  A "generic" kernel
          will run on any supported IA-64 system.  However, if you configure
index 1d7bca0..0e5cd14 100644 (file)
@@ -234,3 +234,4 @@ CONFIG_CRYPTO_MD5=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRC_T10DIF=y
 CONFIG_MISC_DEVICES=y
+CONFIG_DMAR=y
index 31d5570..89f2014 100644 (file)
@@ -162,7 +162,7 @@ static inline __attribute_const__ int __virt_to_node_shift(void)
        pgdat->node_mem_map + (__pfn - pgdat->node_start_pfn);          \
 })
 #define page_to_pfn(_page) ({                                          \
-       struct page *__p = (_page);                                     \
+       const struct page *__p = (_page);                               \
        struct pglist_data *pgdat;                                      \
        pgdat = &pg_data_map[page_to_nid(__p)];                         \
        ((__p) - pgdat->node_mem_map) + pgdat->node_start_pfn;          \
index 42c67be..1a6f20d 100644 (file)
@@ -55,6 +55,7 @@ config SPARC64
        select PERF_USE_VMALLOC
        select IRQ_PREFLOW_FASTEOI
        select ARCH_HAVE_NMI_SAFE_CMPXCHG
+       select HAVE_C_RECORDMCOUNT
 
 config ARCH_DEFCONFIG
        string
index 5f5b8bf..bcc98fc 100644 (file)
@@ -131,6 +131,15 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
        *(volatile __u32 *)&lp->lock = ~0U;
 }
 
+static void inline arch_write_unlock(arch_rwlock_t *lock)
+{
+       __asm__ __volatile__(
+"      st              %%g0, [%0]"
+       : /* no outputs */
+       : "r" (lock)
+       : "memory");
+}
+
 static inline int arch_write_trylock(arch_rwlock_t *rw)
 {
        unsigned int val;
@@ -175,8 +184,6 @@ static inline int __arch_read_trylock(arch_rwlock_t *rw)
        res; \
 })
 
-#define arch_write_unlock(rw)  do { (rw)->lock = 0; } while(0)
-
 #define arch_spin_lock_flags(lock, flags) arch_spin_lock(lock)
 #define arch_read_lock_flags(rw, flags)   arch_read_lock(rw)
 #define arch_write_lock_flags(rw, flags)  arch_write_lock(rw)
index 073936a..9689176 100644 (file)
@@ -210,14 +210,8 @@ static int inline arch_write_trylock(arch_rwlock_t *lock)
        return result;
 }
 
-#define arch_read_lock(p)      arch_read_lock(p)
 #define arch_read_lock_flags(p, f) arch_read_lock(p)
-#define arch_read_trylock(p)   arch_read_trylock(p)
-#define arch_read_unlock(p)    arch_read_unlock(p)
-#define arch_write_lock(p)     arch_write_lock(p)
 #define arch_write_lock_flags(p, f) arch_write_lock(p)
-#define arch_write_unlock(p)   arch_write_unlock(p)
-#define arch_write_trylock(p)  arch_write_trylock(p)
 
 #define arch_read_can_lock(rw)         (!((rw)->lock & 0x80000000UL))
 #define arch_write_can_lock(rw)        (!(rw)->lock)
index a19f041..1aaf8c1 100644 (file)
@@ -352,8 +352,8 @@ int __init pcic_probe(void)
        strcpy(pbm->prom_name, namebuf);
 
        {
-               extern volatile int t_nmi[1];
-               extern int pcic_nmi_trap_patch[1];
+               extern volatile int t_nmi[4];
+               extern int pcic_nmi_trap_patch[4];
 
                t_nmi[0] = pcic_nmi_trap_patch[0];
                t_nmi[1] = pcic_nmi_trap_patch[1];
index 64a619d..7ff4669 100644 (file)
@@ -39,7 +39,7 @@ typedef struct xpaddr {
     ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
 
 extern unsigned long *machine_to_phys_mapping;
-extern unsigned int   machine_to_phys_order;
+extern unsigned long  machine_to_phys_nr;
 
 extern unsigned long get_phys_to_machine(unsigned long pfn);
 extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
@@ -87,7 +87,7 @@ static inline unsigned long mfn_to_pfn(unsigned long mfn)
        if (xen_feature(XENFEAT_auto_translated_physmap))
                return mfn;
 
-       if (unlikely((mfn >> machine_to_phys_order) != 0)) {
+       if (unlikely(mfn >= machine_to_phys_nr)) {
                pfn = ~0;
                goto try_override;
        }
index adc66c3..34b1859 100644 (file)
@@ -207,7 +207,6 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
            ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
            APIC_DM_INIT;
        uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
-       mdelay(10);
 
        val = (1UL << UVH_IPI_INT_SEND_SHFT) |
            (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
index 988724b..ff5790d 100644 (file)
@@ -22,6 +22,8 @@ config KVM
        depends on HAVE_KVM
        # for device assignment:
        depends on PCI
+       # for TASKSTATS/TASK_DELAY_ACCT:
+       depends on NET
        select PREEMPT_NOTIFIERS
        select MMU_NOTIFIER
        select ANON_INODES
@@ -31,6 +33,7 @@ config KVM
        select KVM_ASYNC_PF
        select USER_RETURN_NOTIFIER
        select KVM_MMIO
+       select TASKSTATS
        select TASK_DELAY_ACCT
        ---help---
          Support hosting fully virtualized guest machines using hardware
index 247aae3..0d17c8c 100644 (file)
@@ -17,6 +17,7 @@
 #include <asm/traps.h>                 /* dotraplinkage, ...           */
 #include <asm/pgalloc.h>               /* pgd_*(), ...                 */
 #include <asm/kmemcheck.h>             /* kmemcheck_*(), ...           */
+#include <asm/vsyscall.h>
 
 /*
  * Page fault error code bits:
index ae3cb23..c953302 100644 (file)
@@ -360,6 +360,15 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root)
                }
        }
 
+       /* After the PCI-E bus has been walked and all devices discovered,
+        * configure any settings of the fabric that might be necessary.
+        */
+       if (bus) {
+               struct pci_bus *child;
+               list_for_each_entry(child, &bus->children, node)
+                       pcie_bus_configure_settings(child, child->self->pcie_mpss);
+       }
+
        if (!bus)
                kfree(sd);
 
index 8b9940e..7cce722 100644 (file)
@@ -161,13 +161,13 @@ restart:
        if (inbuf && inlen) {
                /* write data to EC */
                for (i = 0; i < inlen; i++) {
+                       pr_devel("olpc-ec:  sending cmd arg 0x%x\n", inbuf[i]);
+                       outb(inbuf[i], 0x68);
                        if (wait_on_ibf(0x6c, 0)) {
                                printk(KERN_ERR "olpc-ec:  timeout waiting for"
                                                " EC accept data!\n");
                                goto err;
                        }
-                       pr_devel("olpc-ec:  sending cmd arg 0x%x\n", inbuf[i]);
-                       outb(inbuf[i], 0x68);
                }
        }
        if (outbuf && outlen) {
index e2800af..e354bce 100644 (file)
@@ -43,7 +43,7 @@ __kernel_vsyscall:
        .space 7,0x90
 
        /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
-       jmp .Lenter_kernel
+       int $0x80
        /* 16: System call normal return point is here! */
 VDSO32_SYSENTER_RETURN:        /* Symbol used by sysenter.c via vdso32-syms.h */
        pop %ebp
index 3326204..add2c2d 100644 (file)
@@ -15,7 +15,7 @@ obj-y         := enlighten.o setup.o multicalls.o mmu.o irq.o \
                        grant-table.o suspend.o platform-pci-unplug.o \
                        p2m.o
 
-obj-$(CONFIG_FTRACE) += trace.o
+obj-$(CONFIG_EVENT_TRACING) += trace.o
 
 obj-$(CONFIG_SMP)              += smp.o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
index e2345af..2d69617 100644 (file)
@@ -77,8 +77,8 @@ EXPORT_SYMBOL_GPL(xen_domain_type);
 
 unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
 EXPORT_SYMBOL(machine_to_phys_mapping);
-unsigned int   machine_to_phys_order;
-EXPORT_SYMBOL(machine_to_phys_order);
+unsigned long  machine_to_phys_nr;
+EXPORT_SYMBOL(machine_to_phys_nr);
 
 struct start_info *xen_start_info;
 EXPORT_SYMBOL_GPL(xen_start_info);
index 8cce339..20a6142 100644 (file)
@@ -1713,15 +1713,19 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
 void __init xen_setup_machphys_mapping(void)
 {
        struct xen_machphys_mapping mapping;
-       unsigned long machine_to_phys_nr_ents;
 
        if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
                machine_to_phys_mapping = (unsigned long *)mapping.v_start;
-               machine_to_phys_nr_ents = mapping.max_mfn + 1;
+               machine_to_phys_nr = mapping.max_mfn + 1;
        } else {
-               machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
+               machine_to_phys_nr = MACH2PHYS_NR_ENTRIES;
        }
-       machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
+#ifdef CONFIG_X86_32
+       if ((machine_to_phys_mapping + machine_to_phys_nr)
+           < machine_to_phys_mapping)
+               machine_to_phys_nr = (unsigned long *)NULL
+                                    - machine_to_phys_mapping;
+#endif
 }
 
 #ifdef CONFIG_X86_64
index b4533a8..e79dbb9 100644 (file)
@@ -521,8 +521,6 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
        native_smp_prepare_cpus(max_cpus);
        WARN_ON(xen_smp_intr_init(0));
 
-       if (!xen_have_vector_callback)
-               return;
        xen_init_lock_cpu(0);
        xen_init_spinlocks();
 }
@@ -546,6 +544,8 @@ static void xen_hvm_cpu_die(unsigned int cpu)
 
 void __init xen_hvm_smp_init(void)
 {
+       if (!xen_have_vector_callback)
+               return;
        smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
        smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
        smp_ops.cpu_up = xen_hvm_cpu_up;
index 60be1e0..e97934e 100644 (file)
@@ -65,6 +65,16 @@ config BLK_DEV_BSG
 
          If unsure, say Y.
 
+config BLK_DEV_BSGLIB
+       bool "Block layer SG support v4 helper lib"
+       default n
+       select BLK_DEV_BSG
+       help
+         Subsystems will normally enable this if needed. Users will not
+         normally need to manually enable this.
+
+         If unsure, say N.
+
 config BLK_DEV_INTEGRITY
        bool "Block layer data integrity support"
        ---help---
index 0fec4b3..514c6e4 100644 (file)
@@ -8,6 +8,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o
 
 obj-$(CONFIG_BLK_DEV_BSG)      += bsg.o
+obj-$(CONFIG_BLK_DEV_BSGLIB)   += bsg-lib.o
 obj-$(CONFIG_BLK_CGROUP)       += blk-cgroup.o
 obj-$(CONFIG_BLK_DEV_THROTTLING)       += blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)     += noop-iosched.o
index b627558..90e1ffd 100644 (file)
@@ -1702,6 +1702,7 @@ EXPORT_SYMBOL_GPL(blk_rq_check_limits);
 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 {
        unsigned long flags;
+       int where = ELEVATOR_INSERT_BACK;
 
        if (blk_rq_check_limits(q, rq))
                return -EIO;
@@ -1718,7 +1719,10 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
         */
        BUG_ON(blk_queued_rq(rq));
 
-       add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
+       if (rq->cmd_flags & (REQ_FLUSH|REQ_FUA))
+               where = ELEVATOR_INSERT_FLUSH;
+
+       add_acct_request(q, rq, where);
        spin_unlock_irqrestore(q->queue_lock, flags);
 
        return 0;
@@ -2275,7 +2279,7 @@ static bool blk_end_bidi_request(struct request *rq, int error,
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-static bool __blk_end_bidi_request(struct request *rq, int error,
+bool __blk_end_bidi_request(struct request *rq, int error,
                                   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
        if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
index bb21e4c..491eb30 100644 (file)
@@ -95,11 +95,12 @@ static unsigned int blk_flush_policy(unsigned int fflags, struct request *rq)
 {
        unsigned int policy = 0;
 
+       if (blk_rq_sectors(rq))
+               policy |= REQ_FSEQ_DATA;
+
        if (fflags & REQ_FLUSH) {
                if (rq->cmd_flags & REQ_FLUSH)
                        policy |= REQ_FSEQ_PREFLUSH;
-               if (blk_rq_sectors(rq))
-                       policy |= REQ_FSEQ_DATA;
                if (!(fflags & REQ_FUA) && (rq->cmd_flags & REQ_FUA))
                        policy |= REQ_FSEQ_POSTFLUSH;
        }
@@ -122,7 +123,7 @@ static void blk_flush_restore_request(struct request *rq)
 
        /* make @rq a normal request */
        rq->cmd_flags &= ~REQ_FLUSH_SEQ;
-       rq->end_io = NULL;
+       rq->end_io = rq->flush.saved_end_io;
 }
 
 /**
@@ -300,9 +301,6 @@ void blk_insert_flush(struct request *rq)
        unsigned int fflags = q->flush_flags;   /* may change, cache */
        unsigned int policy = blk_flush_policy(fflags, rq);
 
-       BUG_ON(rq->end_io);
-       BUG_ON(!rq->bio || rq->bio != rq->biotail);
-
        /*
         * @policy now records what operations need to be done.  Adjust
         * REQ_FLUSH and FUA for the driver.
@@ -311,6 +309,19 @@ void blk_insert_flush(struct request *rq)
        if (!(fflags & REQ_FUA))
                rq->cmd_flags &= ~REQ_FUA;
 
+       /*
+        * An empty flush handed down from a stacking driver may
+        * translate into nothing if the underlying device does not
+        * advertise a write-back cache.  In this case, simply
+        * complete the request.
+        */
+       if (!policy) {
+               __blk_end_bidi_request(rq, 0, 0, 0);
+               return;
+       }
+
+       BUG_ON(!rq->bio || rq->bio != rq->biotail);
+
        /*
         * If there's data but flush is not necessary, the request can be
         * processed directly without going through flush machinery.  Queue
@@ -319,6 +330,7 @@ void blk_insert_flush(struct request *rq)
        if ((policy & REQ_FSEQ_DATA) &&
            !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
                list_add_tail(&rq->queuelist, &q->queue_head);
+               blk_run_queue_async(q);
                return;
        }
 
@@ -329,6 +341,7 @@ void blk_insert_flush(struct request *rq)
        memset(&rq->flush, 0, sizeof(rq->flush));
        INIT_LIST_HEAD(&rq->flush.list);
        rq->cmd_flags |= REQ_FLUSH_SEQ;
+       rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
        rq->end_io = flush_data_end_io;
 
        blk_flush_complete_seq(rq, REQ_FSEQ_ACTIONS & ~policy, 0);
index 475fab8..58340d0 100644 (file)
@@ -124,6 +124,14 @@ void __blk_complete_request(struct request *req)
        } else
                ccpu = cpu;
 
+       /*
+        * If current CPU and requested CPU are in the same group, running
+        * softirq in current CPU. One might concern this is just like
+        * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
+        * running in interrupt handler, and currently I/O controller doesn't
+        * support multiple interrupts, so current CPU is unique actually. This
+        * avoids IPI sending from current CPU to the first CPU of a group.
+        */
        if (ccpu == cpu || ccpu == group_cpu) {
                struct list_head *list;
 do_local:
index f6a7941..a19f58c 100644 (file)
@@ -746,7 +746,7 @@ static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
        bool rw = bio_data_dir(bio);
-       bool sync = bio->bi_rw & REQ_SYNC;
+       bool sync = rw_is_sync(bio->bi_rw);
 
        /* Charge the bio to the group */
        tg->bytes_disp[rw] += bio->bi_size;
@@ -1150,7 +1150,7 @@ int blk_throtl_bio(struct request_queue *q, struct bio **biop)
 
                if (tg_no_rule_group(tg, rw)) {
                        blkiocg_update_dispatch_stats(&tg->blkg, bio->bi_size,
-                                       rw, bio->bi_rw & REQ_SYNC);
+                                       rw, rw_is_sync(bio->bi_rw));
                        rcu_read_unlock();
                        return 0;
                }
index d658628..20b900a 100644 (file)
@@ -17,6 +17,8 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
                      struct bio *bio);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
+bool __blk_end_bidi_request(struct request *rq, int error,
+                           unsigned int nr_bytes, unsigned int bidi_bytes);
 
 void blk_rq_timed_out_timer(unsigned long data);
 void blk_delete_timer(struct request *);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
new file mode 100644 (file)
index 0000000..6690e6e
--- /dev/null
@@ -0,0 +1,298 @@
+/*
+ *  BSG helper library
+ *
+ *  Copyright (C) 2008   James Smart, Emulex Corporation
+ *  Copyright (C) 2011   Red Hat, Inc.  All rights reserved.
+ *  Copyright (C) 2011   Mike Christie
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/scatterlist.h>
+#include <linux/bsg-lib.h>
+#include <linux/module.h>
+#include <scsi/scsi_cmnd.h>
+
+/**
+ * bsg_destroy_job - routine to teardown/delete a bsg job
+ * @job: bsg_job that is to be torn down
+ */
+static void bsg_destroy_job(struct bsg_job *job)
+{
+       put_device(job->dev);   /* release reference for the request */
+
+       kfree(job->request_payload.sg_list);
+       kfree(job->reply_payload.sg_list);
+       kfree(job);
+}
+
+/**
+ * bsg_job_done - completion routine for bsg requests
+ * @job: bsg_job that is complete
+ * @result: job reply result
+ * @reply_payload_rcv_len: length of payload recvd
+ *
+ * The LLD should call this when the bsg job has completed.
+ */
+void bsg_job_done(struct bsg_job *job, int result,
+                 unsigned int reply_payload_rcv_len)
+{
+       struct request *req = job->req;
+       struct request *rsp = req->next_rq;
+       int err;
+
+       err = job->req->errors = result;
+       if (err < 0)
+               /* we're only returning the result field in the reply */
+               job->req->sense_len = sizeof(u32);
+       else
+               job->req->sense_len = job->reply_len;
+       /* we assume all request payload was transferred, residual == 0 */
+       req->resid_len = 0;
+
+       if (rsp) {
+               WARN_ON(reply_payload_rcv_len > rsp->resid_len);
+
+               /* set reply (bidi) residual */
+               rsp->resid_len -= min(reply_payload_rcv_len, rsp->resid_len);
+       }
+       blk_complete_request(req);
+}
+EXPORT_SYMBOL_GPL(bsg_job_done);
+
+/**
+ * bsg_softirq_done - softirq done routine for destroying the bsg requests
+ * @rq: BSG request that holds the job to be destroyed
+ */
+static void bsg_softirq_done(struct request *rq)
+{
+       struct bsg_job *job = rq->special;
+
+       blk_end_request_all(rq, rq->errors);
+       bsg_destroy_job(job);
+}
+
+static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
+{
+       size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments);
+
+       BUG_ON(!req->nr_phys_segments);
+
+       buf->sg_list = kzalloc(sz, GFP_KERNEL);
+       if (!buf->sg_list)
+               return -ENOMEM;
+       sg_init_table(buf->sg_list, req->nr_phys_segments);
+       buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list);
+       buf->payload_len = blk_rq_bytes(req);
+       return 0;
+}
+
+/**
+ * bsg_create_job - create the bsg_job structure for the bsg request
+ * @dev: device that is being sent the bsg request
+ * @req: BSG request that needs a job structure
+ */
+static int bsg_create_job(struct device *dev, struct request *req)
+{
+       struct request *rsp = req->next_rq;
+       struct request_queue *q = req->q;
+       struct bsg_job *job;
+       int ret;
+
+       BUG_ON(req->special);
+
+       job = kzalloc(sizeof(struct bsg_job) + q->bsg_job_size, GFP_KERNEL);
+       if (!job)
+               return -ENOMEM;
+
+       req->special = job;
+       job->req = req;
+       if (q->bsg_job_size)
+               job->dd_data = (void *)&job[1];
+       job->request = req->cmd;
+       job->request_len = req->cmd_len;
+       job->reply = req->sense;
+       job->reply_len = SCSI_SENSE_BUFFERSIZE; /* Size of sense buffer
+                                                * allocated */
+       if (req->bio) {
+               ret = bsg_map_buffer(&job->request_payload, req);
+               if (ret)
+                       goto failjob_rls_job;
+       }
+       if (rsp && rsp->bio) {
+               ret = bsg_map_buffer(&job->reply_payload, rsp);
+               if (ret)
+                       goto failjob_rls_rqst_payload;
+       }
+       job->dev = dev;
+       /* take a reference for the request */
+       get_device(job->dev);
+       return 0;
+
+failjob_rls_rqst_payload:
+       kfree(job->request_payload.sg_list);
+failjob_rls_job:
+       kfree(job);
+       return -ENOMEM;
+}
+
+/*
+ * bsg_goose_queue - restart queue in case it was stopped
+ * @q: request q to be restarted
+ */
+void bsg_goose_queue(struct request_queue *q)
+{
+       if (!q)
+               return;
+
+       blk_run_queue_async(q);
+}
+EXPORT_SYMBOL_GPL(bsg_goose_queue);
+
+/**
+ * bsg_request_fn - generic handler for bsg requests
+ * @q: request queue to manage
+ *
+ * On error the create_bsg_job function should return a -Exyz error value
+ * that will be set to the req->errors.
+ *
+ * Drivers/subsys should pass this to the queue init function.
+ */
+void bsg_request_fn(struct request_queue *q)
+{
+       struct device *dev = q->queuedata;
+       struct request *req;
+       struct bsg_job *job;
+       int ret;
+
+       if (!get_device(dev))
+               return;
+
+       while (1) {
+               req = blk_fetch_request(q);
+               if (!req)
+                       break;
+               spin_unlock_irq(q->queue_lock);
+
+               ret = bsg_create_job(dev, req);
+               if (ret) {
+                       req->errors = ret;
+                       blk_end_request_all(req, ret);
+                       spin_lock_irq(q->queue_lock);
+                       continue;
+               }
+
+               job = req->special;
+               ret = q->bsg_job_fn(job);
+               spin_lock_irq(q->queue_lock);
+               if (ret)
+                       break;
+       }
+
+       spin_unlock_irq(q->queue_lock);
+       put_device(dev);
+       spin_lock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL_GPL(bsg_request_fn);
+
+/**
+ * bsg_setup_queue - Create and add the bsg hooks so we can receive requests
+ * @dev: device to attach bsg device to
+ * @q: request queue setup by caller
+ * @name: device to give bsg device
+ * @job_fn: bsg job handler
+ * @dd_job_size: size of LLD data needed for each job
+ *
+ * The caller should have setup the reuqest queue with bsg_request_fn
+ * as the request_fn.
+ */
+int bsg_setup_queue(struct device *dev, struct request_queue *q,
+                   char *name, bsg_job_fn *job_fn, int dd_job_size)
+{
+       int ret;
+
+       q->queuedata = dev;
+       q->bsg_job_size = dd_job_size;
+       q->bsg_job_fn = job_fn;
+       queue_flag_set_unlocked(QUEUE_FLAG_BIDI, q);
+       blk_queue_softirq_done(q, bsg_softirq_done);
+       blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
+
+       ret = bsg_register_queue(q, dev, name, NULL);
+       if (ret) {
+               printk(KERN_ERR "%s: bsg interface failed to "
+                      "initialize - register queue\n", dev->kobj.name);
+               return ret;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(bsg_setup_queue);
+
+/**
+ * bsg_remove_queue - Deletes the bsg dev from the q
+ * @q: the request_queue that is to be torn down.
+ *
+ * Notes:
+ *   Before unregistering the queue empty any requests that are blocked
+ */
+void bsg_remove_queue(struct request_queue *q)
+{
+       struct request *req; /* block request */
+       int counts; /* totals for request_list count and starved */
+
+       if (!q)
+               return;
+
+       /* Stop taking in new requests */
+       spin_lock_irq(q->queue_lock);
+       blk_stop_queue(q);
+
+       /* drain all requests in the queue */
+       while (1) {
+               /* need the lock to fetch a request
+                * this may fetch the same reqeust as the previous pass
+                */
+               req = blk_fetch_request(q);
+               /* save requests in use and starved */
+               counts = q->rq.count[0] + q->rq.count[1] +
+                        q->rq.starved[0] + q->rq.starved[1];
+               spin_unlock_irq(q->queue_lock);
+               /* any requests still outstanding? */
+               if (counts == 0)
+                       break;
+
+               /* This may be the same req as the previous iteration,
+                * always send the blk_end_request_all after a prefetch.
+                * It is not okay to not end the request because the
+                * prefetch started the request.
+                */
+               if (req) {
+                       /* return -ENXIO to indicate that this queue is
+                        * going away
+                        */
+                       req->errors = -ENXIO;
+                       blk_end_request_all(req, -ENXIO);
+               }
+
+               msleep(200); /* allow bsg to possibly finish */
+               spin_lock_irq(q->queue_lock);
+       }
+       bsg_unregister_queue(q);
+}
+EXPORT_SYMBOL_GPL(bsg_remove_queue);
index 1f96ad6..a33bd43 100644 (file)
@@ -130,6 +130,8 @@ struct cfq_queue {
        unsigned long slice_end;
        long slice_resid;
 
+       /* pending metadata requests */
+       int meta_pending;
        /* number of requests that are on the dispatch list or inside driver */
        int dispatched;
 
@@ -682,6 +684,9 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2,
        if (rq_is_sync(rq1) != rq_is_sync(rq2))
                return rq_is_sync(rq1) ? rq1 : rq2;
 
+       if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_META)
+               return rq1->cmd_flags & REQ_META ? rq1 : rq2;
+
        s1 = blk_rq_pos(rq1);
        s2 = blk_rq_pos(rq2);
 
@@ -1209,6 +1214,9 @@ static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
 
        hlist_del_init(&cfqg->cfqd_node);
 
+       BUG_ON(cfqd->nr_blkcg_linked_grps <= 0);
+       cfqd->nr_blkcg_linked_grps--;
+
        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
@@ -1604,6 +1612,10 @@ static void cfq_remove_request(struct request *rq)
        cfqq->cfqd->rq_queued--;
        cfq_blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg,
                                        rq_data_dir(rq), rq_is_sync(rq));
+       if (rq->cmd_flags & REQ_META) {
+               WARN_ON(!cfqq->meta_pending);
+               cfqq->meta_pending--;
+       }
 }
 
 static int cfq_merge(struct request_queue *q, struct request **req,
@@ -3356,6 +3368,13 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
            RB_EMPTY_ROOT(&cfqq->sort_list))
                return true;
 
+       /*
+        * So both queues are sync. Let the new request get disk time if
+        * it's a metadata request and the current queue is doing regular IO.
+        */
+       if ((rq->cmd_flags & REQ_META) && !cfqq->meta_pending)
+               return true;
+
        /*
         * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
         */
@@ -3420,6 +3439,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
        struct cfq_io_context *cic = RQ_CIC(rq);
 
        cfqd->rq_queued++;
+       if (rq->cmd_flags & REQ_META)
+               cfqq->meta_pending++;
 
        cfq_update_io_thinktime(cfqd, cfqq, cic);
        cfq_update_io_seektime(cfqd, cfqq, rq);
index 5cb51c5..e2f6790 100644 (file)
@@ -1146,17 +1146,17 @@ static int diskstats_show(struct seq_file *seqf, void *v)
                cpu = part_stat_lock();
                part_round_stats(cpu, hd);
                part_stat_unlock();
-               seq_printf(seqf, "%4d %7d %s %lu %lu %llu "
-                          "%u %lu %lu %llu %u %u %u %u\n",
+               seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
+                          "%u %lu %lu %lu %u %u %u %u\n",
                           MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
                           disk_name(gp, hd->partno, buf),
                           part_stat_read(hd, ios[READ]),
                           part_stat_read(hd, merges[READ]),
-                          (unsigned long long)part_stat_read(hd, sectors[READ]),
+                          part_stat_read(hd, sectors[READ]),
                           jiffies_to_msecs(part_stat_read(hd, ticks[READ])),
                           part_stat_read(hd, ios[WRITE]),
                           part_stat_read(hd, merges[WRITE]),
-                          (unsigned long long)part_stat_read(hd, sectors[WRITE]),
+                          part_stat_read(hd, sectors[WRITE]),
                           jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
                           part_in_flight(hd),
                           jiffies_to_msecs(part_stat_read(hd, io_ticks)),
index ca3e6be..5987e0b 100644 (file)
@@ -468,6 +468,15 @@ config PATA_ICSIDE
          interface card.  This is not required for ICS partition support.
          If you are unsure, say N to this.
 
+config PATA_IMX
+       tristate "PATA support for Freescale iMX"
+       depends on ARCH_MXC
+       help
+         This option enables support for the PATA host available on Freescale
+          iMX SoCs.
+
+         If unsure, say N.
+
 config PATA_IT8213
        tristate "IT8213 PATA support (Experimental)"
        depends on PCI && EXPERIMENTAL
index 8ac64e1..9550d69 100644 (file)
@@ -48,6 +48,7 @@ obj-$(CONFIG_PATA_HPT37X)     += pata_hpt37x.o
 obj-$(CONFIG_PATA_HPT3X2N)     += pata_hpt3x2n.o
 obj-$(CONFIG_PATA_HPT3X3)      += pata_hpt3x3.o
 obj-$(CONFIG_PATA_ICSIDE)      += pata_icside.o
+obj-$(CONFIG_PATA_IMX)         += pata_imx.o
 obj-$(CONFIG_PATA_IT8213)      += pata_it8213.o
 obj-$(CONFIG_PATA_IT821X)      += pata_it821x.o
 obj-$(CONFIG_PATA_JMICRON)     += pata_jmicron.o
diff --git a/drivers/ata/pata_imx.c b/drivers/ata/pata_imx.c
new file mode 100644 (file)
index 0000000..ca9d9ca
--- /dev/null
@@ -0,0 +1,253 @@
+/*
+ * Freescale iMX PATA driver
+ *
+ * Copyright (C) 2011 Arnaud Patard <arnaud.patard@rtp-net.org>
+ *
+ * Based on pata_platform - Copyright (C) 2006 - 2007  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * TODO:
+ * - dmaengine support
+ * - check if timing stuff needed
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <scsi/scsi_host.h>
+#include <linux/ata.h>
+#include <linux/libata.h>
+#include <linux/platform_device.h>
+#include <linux/clk.h>
+
+#define DRV_NAME "pata_imx"
+
+#define PATA_IMX_ATA_CONTROL           0x24
+#define PATA_IMX_ATA_CTRL_FIFO_RST_B   (1<<7)
+#define PATA_IMX_ATA_CTRL_ATA_RST_B    (1<<6)
+#define PATA_IMX_ATA_CTRL_IORDY_EN     (1<<0)
+#define PATA_IMX_ATA_INT_EN            0x2C
+#define PATA_IMX_ATA_INTR_ATA_INTRQ2   (1<<3)
+#define PATA_IMX_DRIVE_DATA            0xA0
+#define PATA_IMX_DRIVE_CONTROL         0xD8
+
+struct pata_imx_priv {
+       struct clk *clk;
+       /* timings/interrupt/control regs */
+       u8 *host_regs;
+       u32 ata_ctl;
+};
+
+static int pata_imx_set_mode(struct ata_link *link, struct ata_device **unused)
+{
+       struct ata_device *dev;
+       struct ata_port *ap = link->ap;
+       struct pata_imx_priv *priv = ap->host->private_data;
+       u32 val;
+
+       ata_for_each_dev(dev, link, ENABLED) {
+               dev->pio_mode = dev->xfer_mode = XFER_PIO_0;
+               dev->xfer_shift = ATA_SHIFT_PIO;
+               dev->flags |= ATA_DFLAG_PIO;
+
+               val = __raw_readl(priv->host_regs + PATA_IMX_ATA_CONTROL);
+               if (ata_pio_need_iordy(dev))
+                       val |= PATA_IMX_ATA_CTRL_IORDY_EN;
+               else
+                       val &= ~PATA_IMX_ATA_CTRL_IORDY_EN;
+               __raw_writel(val, priv->host_regs + PATA_IMX_ATA_CONTROL);
+
+               ata_dev_printk(dev, KERN_INFO, "configured for PIO\n");
+       }
+       return 0;
+}
+
+static struct scsi_host_template pata_imx_sht = {
+       ATA_PIO_SHT(DRV_NAME),
+};
+
+static struct ata_port_operations pata_imx_port_ops = {
+       .inherits               = &ata_sff_port_ops,
+       .sff_data_xfer          = ata_sff_data_xfer_noirq,
+       .cable_detect           = ata_cable_unknown,
+       .set_mode               = pata_imx_set_mode,
+};
+
+static void pata_imx_setup_port(struct ata_ioports *ioaddr)
+{
+       /* Fixup the port shift for platforms that need it */
+       ioaddr->data_addr       = ioaddr->cmd_addr + (ATA_REG_DATA    << 2);
+       ioaddr->error_addr      = ioaddr->cmd_addr + (ATA_REG_ERR     << 2);
+       ioaddr->feature_addr    = ioaddr->cmd_addr + (ATA_REG_FEATURE << 2);
+       ioaddr->nsect_addr      = ioaddr->cmd_addr + (ATA_REG_NSECT   << 2);
+       ioaddr->lbal_addr       = ioaddr->cmd_addr + (ATA_REG_LBAL    << 2);
+       ioaddr->lbam_addr       = ioaddr->cmd_addr + (ATA_REG_LBAM    << 2);
+       ioaddr->lbah_addr       = ioaddr->cmd_addr + (ATA_REG_LBAH    << 2);
+       ioaddr->device_addr     = ioaddr->cmd_addr + (ATA_REG_DEVICE  << 2);
+       ioaddr->status_addr     = ioaddr->cmd_addr + (ATA_REG_STATUS  << 2);
+       ioaddr->command_addr    = ioaddr->cmd_addr + (ATA_REG_CMD     << 2);
+}
+
+static int __devinit pata_imx_probe(struct platform_device *pdev)
+{
+       struct ata_host *host;
+       struct ata_port *ap;
+       struct pata_imx_priv *priv;
+       int irq = 0;
+       struct resource *io_res;
+
+       io_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (io_res == NULL)
+               return -EINVAL;
+
+       irq = platform_get_irq(pdev, 0);
+       if (irq <= 0)
+               return -EINVAL;
+
+       priv = devm_kzalloc(&pdev->dev,
+                               sizeof(struct pata_imx_priv), GFP_KERNEL);
+       if (!priv)
+               return -ENOMEM;
+
+       priv->clk = clk_get(&pdev->dev, NULL);
+       if (IS_ERR(priv->clk)) {
+               dev_err(&pdev->dev, "Failed to get clock\n");
+               return PTR_ERR(priv->clk);
+       }
+
+       clk_enable(priv->clk);
+
+       host = ata_host_alloc(&pdev->dev, 1);
+       if (!host)
+               goto free_priv;
+
+       host->private_data = priv;
+       ap = host->ports[0];
+
+       ap->ops = &pata_imx_port_ops;
+       ap->pio_mask = ATA_PIO0;
+       ap->flags |= ATA_FLAG_SLAVE_POSS;
+
+       priv->host_regs = devm_ioremap(&pdev->dev, io_res->start,
+               resource_size(io_res));
+       if (!priv->host_regs) {
+               dev_err(&pdev->dev, "failed to map IO/CTL base\n");
+               goto free_priv;
+       }
+
+       ap->ioaddr.cmd_addr = priv->host_regs + PATA_IMX_DRIVE_DATA;
+       ap->ioaddr.ctl_addr = priv->host_regs + PATA_IMX_DRIVE_CONTROL;
+
+       ap->ioaddr.altstatus_addr = ap->ioaddr.ctl_addr;
+
+       pata_imx_setup_port(&ap->ioaddr);
+
+       ata_port_desc(ap, "cmd 0x%llx ctl 0x%llx",
+               (unsigned long long)io_res->start + PATA_IMX_DRIVE_DATA,
+               (unsigned long long)io_res->start + PATA_IMX_DRIVE_CONTROL);
+
+       /* deassert resets */
+       __raw_writel(PATA_IMX_ATA_CTRL_FIFO_RST_B |
+                       PATA_IMX_ATA_CTRL_ATA_RST_B,
+                       priv->host_regs + PATA_IMX_ATA_CONTROL);
+       /* enable interrupts */
+       __raw_writel(PATA_IMX_ATA_INTR_ATA_INTRQ2,
+                       priv->host_regs + PATA_IMX_ATA_INT_EN);
+
+       /* activate */
+       return ata_host_activate(host, irq, ata_sff_interrupt, 0,
+                               &pata_imx_sht);
+
+free_priv:
+       clk_disable(priv->clk);
+       clk_put(priv->clk);
+       return -ENOMEM;
+}
+
+static int __devexit pata_imx_remove(struct platform_device *pdev)
+{
+       struct ata_host *host = dev_get_drvdata(&pdev->dev);
+       struct pata_imx_priv *priv = host->private_data;
+
+       ata_host_detach(host);
+
+       __raw_writel(0, priv->host_regs + PATA_IMX_ATA_INT_EN);
+
+       clk_disable(priv->clk);
+       clk_put(priv->clk);
+
+       return 0;
+}
+
+#ifdef CONFIG_PM
+static int pata_imx_suspend(struct device *dev)
+{
+       struct ata_host *host = dev_get_drvdata(dev);
+       struct pata_imx_priv *priv = host->private_data;
+       int ret;
+
+       ret = ata_host_suspend(host, PMSG_SUSPEND);
+       if (!ret) {
+               __raw_writel(0, priv->host_regs + PATA_IMX_ATA_INT_EN);
+               priv->ata_ctl =
+                       __raw_readl(priv->host_regs + PATA_IMX_ATA_CONTROL);
+               clk_disable(priv->clk);
+       }
+
+       return ret;
+}
+
+static int pata_imx_resume(struct device *dev)
+{
+       struct ata_host *host = dev_get_drvdata(dev);
+       struct pata_imx_priv *priv = host->private_data;
+
+       clk_enable(priv->clk);
+
+       __raw_writel(priv->ata_ctl, priv->host_regs + PATA_IMX_ATA_CONTROL);
+
+       __raw_writel(PATA_IMX_ATA_INTR_ATA_INTRQ2,
+                       priv->host_regs + PATA_IMX_ATA_INT_EN);
+
+       ata_host_resume(host);
+
+       return 0;
+}
+
+static const struct dev_pm_ops pata_imx_pm_ops = {
+       .suspend        = pata_imx_suspend,
+       .resume         = pata_imx_resume,
+};
+#endif
+
+static struct platform_driver pata_imx_driver = {
+       .probe          = pata_imx_probe,
+       .remove         = __devexit_p(pata_imx_remove),
+       .driver = {
+               .name           = DRV_NAME,
+               .owner          = THIS_MODULE,
+#ifdef CONFIG_PM
+               .pm             = &pata_imx_pm_ops,
+#endif
+       },
+};
+
+static int __init pata_imx_init(void)
+{
+       return platform_driver_register(&pata_imx_driver);
+}
+
+static void __exit pata_imx_exit(void)
+{
+       platform_driver_unregister(&pata_imx_driver);
+}
+module_init(pata_imx_init);
+module_exit(pata_imx_exit);
+
+MODULE_AUTHOR("Arnaud Patard <arnaud.patard@rtp-net.org>");
+MODULE_DESCRIPTION("low-level driver for iMX PATA");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("platform:" DRV_NAME);
index 65e4be6..8e9f504 100644 (file)
@@ -124,6 +124,17 @@ static const struct via_isa_bridge {
        { NULL }
 };
 
+static const struct dmi_system_id no_atapi_dma_dmi_table[] = {
+       {
+               .ident = "AVERATEC 3200",
+               .matches = {
+                       DMI_MATCH(DMI_BOARD_VENDOR, "AVERATEC"),
+                       DMI_MATCH(DMI_BOARD_NAME, "3200"),
+               },
+       },
+       { }
+};
+
 struct via_port {
        u8 cached_device;
 };
@@ -355,6 +366,13 @@ static unsigned long via_mode_filter(struct ata_device *dev, unsigned long mask)
                        mask &= ~ ATA_MASK_UDMA;
                }
        }
+
+       if (dev->class == ATA_DEV_ATAPI &&
+           dmi_check_system(no_atapi_dma_dmi_table)) {
+               ata_dev_warn(dev, "controller locks up on ATAPI DMA, forcing PIO\n");
+               mask &= ATA_MASK_PIO;
+       }
+
        return mask;
 }
 
index 0a9a774..5c42374 100644 (file)
@@ -1329,7 +1329,7 @@ static int sata_dwc_port_start(struct ata_port *ap)
                        dev_err(ap->dev, "%s: dma_alloc_coherent failed\n",
                                 __func__);
                        err = -ENOMEM;
-                       goto CLEANUP;
+                       goto CLEANUP_ALLOC;
                }
        }
 
@@ -1349,15 +1349,13 @@ static int sata_dwc_port_start(struct ata_port *ap)
        /* Clear any error bits before libata starts issuing commands */
        clear_serror();
        ap->private_data = hsdevp;
+       dev_dbg(ap->dev, "%s: done\n", __func__);
+       return 0;
 
+CLEANUP_ALLOC:
+       kfree(hsdevp);
 CLEANUP:
-       if (err) {
-               sata_dwc_port_stop(ap);
-               dev_dbg(ap->dev, "%s: fail\n", __func__);
-       } else {
-               dev_dbg(ap->dev, "%s: done\n", __func__);
-       }
-
+       dev_dbg(ap->dev, "%s: fail. ap->id = %d\n", __func__, ap->print_id);
        return err;
 }
 
index 98c1d78..9dfb40b 100644 (file)
@@ -438,7 +438,7 @@ static void sil_host_intr(struct ata_port *ap, u32 bmdma2)
        u8 status;
 
        if (unlikely(bmdma2 & SIL_DMA_SATA_IRQ)) {
-               u32 serror;
+               u32 serror = 0xffffffff;
 
                /* SIEN doesn't mask SATA IRQs on some 3112s.  Those
                 * controllers continue to assert IRQ as long as
index e18566a..1c37457 100644 (file)
@@ -460,6 +460,21 @@ static int pm_genpd_runtime_resume(struct device *dev)
        return 0;
 }
 
+/**
+ * pm_genpd_poweroff_unused - Power off all PM domains with no devices in use.
+ */
+void pm_genpd_poweroff_unused(void)
+{
+       struct generic_pm_domain *genpd;
+
+       mutex_lock(&gpd_list_lock);
+
+       list_for_each_entry(genpd, &gpd_list, gpd_list_node)
+               genpd_queue_power_off_work(genpd);
+
+       mutex_unlock(&gpd_list_lock);
+}
+
 #else
 
 static inline void genpd_power_off_work_fn(struct work_struct *work) {}
@@ -1255,18 +1270,3 @@ void pm_genpd_init(struct generic_pm_domain *genpd,
        list_add(&genpd->gpd_list_node, &gpd_list);
        mutex_unlock(&gpd_list_lock);
 }
-
-/**
- * pm_genpd_poweroff_unused - Power off all PM domains with no devices in use.
- */
-void pm_genpd_poweroff_unused(void)
-{
-       struct generic_pm_domain *genpd;
-
-       mutex_lock(&gpd_list_lock);
-
-       list_for_each_entry(genpd, &gpd_list, gpd_list_node)
-               genpd_queue_power_off_work(genpd);
-
-       mutex_unlock(&gpd_list_lock);
-}
index c2231ff..c4f7a45 100644 (file)
@@ -113,3 +113,4 @@ struct regmap *regmap_init_i2c(struct i2c_client *i2c,
 }
 EXPORT_SYMBOL_GPL(regmap_init_i2c);
 
+MODULE_LICENSE("GPL");
index 4deba06..f839694 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/regmap.h>
 #include <linux/spi/spi.h>
 #include <linux/init.h>
+#include <linux/module.h>
 
 static int regmap_spi_write(struct device *dev, const void *data, size_t count)
 {
@@ -70,3 +71,5 @@ struct regmap *regmap_init_spi(struct spi_device *spi,
        return regmap_init(&spi->dev, &regmap_spi, config);
 }
 EXPORT_SYMBOL_GPL(regmap_init_spi);
+
+MODULE_LICENSE("GPL");
index cf3565c..0eef4da 100644 (file)
@@ -317,7 +317,7 @@ static int _regmap_raw_read(struct regmap *map, unsigned int reg, void *val,
                u8[0] |= map->bus->read_flag_mask;
 
        ret = map->bus->read(map->dev, map->work_buf, map->format.reg_bytes,
-                            val, map->format.val_bytes);
+                            val, val_len);
        if (ret != 0)
                return ret;
 
index 717d6e4..6f07ec1 100644 (file)
@@ -256,6 +256,21 @@ config BLK_DEV_LOOP
 
          Most users will answer N here.
 
+config BLK_DEV_LOOP_MIN_COUNT
+       int "Number of loop devices to pre-create at init time"
+       depends on BLK_DEV_LOOP
+       default 8
+       help
+         Static number of loop devices to be unconditionally pre-created
+         at init time.
+
+         This default value can be overwritten on the kernel command
+         line or with module-parameter loop.max_loop.
+
+         The historic default is 8. If a late 2011 version of losetup(8)
+         is used, it can be set to 0, since needed loop devices can be
+         dynamically allocated with the /dev/loop-control interface.
+
 config BLK_DEV_CRYPTOLOOP
        tristate "Cryptoloop Support"
        select CRYPTO
@@ -471,7 +486,7 @@ config XEN_BLKDEV_FRONTEND
          in another domain which drives the actual block device.
 
 config XEN_BLKDEV_BACKEND
-       tristate "Block-device backend driver"
+       tristate "Xen block-device backend driver"
        depends on XEN_BACKEND
        help
          The block-device backend driver allows the kernel to export its
index 515bcd9..0feab26 100644 (file)
@@ -1829,10 +1829,10 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
 
        /* silently ignore cpu mask on UP kernel */
        if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
-               err = __bitmap_parse(sc.cpu_mask, 32, 0,
+               err = bitmap_parse(sc.cpu_mask, 32,
                                cpumask_bits(new_cpu_mask), nr_cpu_ids);
                if (err) {
-                       dev_warn(DEV, "__bitmap_parse() failed with %d\n", err);
+                       dev_warn(DEV, "bitmap_parse() failed with %d\n", err);
                        retcode = ERR_CPU_MASK_PARSE;
                        goto fail;
                }
index 76c8da7..4720c7a 100644 (file)
 #include <linux/kthread.h>
 #include <linux/splice.h>
 #include <linux/sysfs.h>
-
+#include <linux/miscdevice.h>
 #include <asm/uaccess.h>
 
-static LIST_HEAD(loop_devices);
-static DEFINE_MUTEX(loop_devices_mutex);
+static DEFINE_IDR(loop_index_idr);
+static DEFINE_MUTEX(loop_index_mutex);
 
 static int max_part;
 static int part_shift;
@@ -722,17 +722,10 @@ static inline int is_loop_device(struct file *file)
 static ssize_t loop_attr_show(struct device *dev, char *page,
                              ssize_t (*callback)(struct loop_device *, char *))
 {
-       struct loop_device *l, *lo = NULL;
-
-       mutex_lock(&loop_devices_mutex);
-       list_for_each_entry(l, &loop_devices, lo_list)
-               if (disk_to_dev(l->lo_disk) == dev) {
-                       lo = l;
-                       break;
-               }
-       mutex_unlock(&loop_devices_mutex);
+       struct gendisk *disk = dev_to_disk(dev);
+       struct loop_device *lo = disk->private_data;
 
-       return lo ? callback(lo, page) : -EIO;
+       return callback(lo, page);
 }
 
 #define LOOP_ATTR_RO(_name)                                            \
@@ -750,10 +743,10 @@ static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
        ssize_t ret;
        char *p = NULL;
 
-       mutex_lock(&lo->lo_ctl_mutex);
+       spin_lock_irq(&lo->lo_lock);
        if (lo->lo_backing_file)
                p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
-       mutex_unlock(&lo->lo_ctl_mutex);
+       spin_unlock_irq(&lo->lo_lock);
 
        if (IS_ERR_OR_NULL(p))
                ret = PTR_ERR(p);
@@ -1007,7 +1000,9 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
 
        kthread_stop(lo->lo_thread);
 
+       spin_lock_irq(&lo->lo_lock);
        lo->lo_backing_file = NULL;
+       spin_unlock_irq(&lo->lo_lock);
 
        loop_release_xfer(lo);
        lo->transfer = NULL;
@@ -1485,13 +1480,22 @@ static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
 
 static int lo_open(struct block_device *bdev, fmode_t mode)
 {
-       struct loop_device *lo = bdev->bd_disk->private_data;
+       struct loop_device *lo;
+       int err = 0;
+
+       mutex_lock(&loop_index_mutex);
+       lo = bdev->bd_disk->private_data;
+       if (!lo) {
+               err = -ENXIO;
+               goto out;
+       }
 
        mutex_lock(&lo->lo_ctl_mutex);
        lo->lo_refcnt++;
        mutex_unlock(&lo->lo_ctl_mutex);
-
-       return 0;
+out:
+       mutex_unlock(&loop_index_mutex);
+       return err;
 }
 
 static int lo_release(struct gendisk *disk, fmode_t mode)
@@ -1557,40 +1561,71 @@ int loop_register_transfer(struct loop_func_table *funcs)
        return 0;
 }
 
+static int unregister_transfer_cb(int id, void *ptr, void *data)
+{
+       struct loop_device *lo = ptr;
+       struct loop_func_table *xfer = data;
+
+       mutex_lock(&lo->lo_ctl_mutex);
+       if (lo->lo_encryption == xfer)
+               loop_release_xfer(lo);
+       mutex_unlock(&lo->lo_ctl_mutex);
+       return 0;
+}
+
 int loop_unregister_transfer(int number)
 {
        unsigned int n = number;
-       struct loop_device *lo;
        struct loop_func_table *xfer;
 
        if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
                return -EINVAL;
 
        xfer_funcs[n] = NULL;
-
-       list_for_each_entry(lo, &loop_devices, lo_list) {
-               mutex_lock(&lo->lo_ctl_mutex);
-
-               if (lo->lo_encryption == xfer)
-                       loop_release_xfer(lo);
-
-               mutex_unlock(&lo->lo_ctl_mutex);
-       }
-
+       idr_for_each(&loop_index_idr, &unregister_transfer_cb, xfer);
        return 0;
 }
 
 EXPORT_SYMBOL(loop_register_transfer);
 EXPORT_SYMBOL(loop_unregister_transfer);
 
-static struct loop_device *loop_alloc(int i)
+static int loop_add(struct loop_device **l, int i)
 {
        struct loop_device *lo;
        struct gendisk *disk;
+       int err;
 
        lo = kzalloc(sizeof(*lo), GFP_KERNEL);
-       if (!lo)
+       if (!lo) {
+               err = -ENOMEM;
                goto out;
+       }
+
+       err = idr_pre_get(&loop_index_idr, GFP_KERNEL);
+       if (err < 0)
+               goto out_free_dev;
+
+       if (i >= 0) {
+               int m;
+
+               /* create specific i in the index */
+               err = idr_get_new_above(&loop_index_idr, lo, i, &m);
+               if (err >= 0 && i != m) {
+                       idr_remove(&loop_index_idr, m);
+                       err = -EEXIST;
+               }
+       } else if (i == -1) {
+               int m;
+
+               /* get next free nr */
+               err = idr_get_new(&loop_index_idr, lo, &m);
+               if (err >= 0)
+                       i = m;
+       } else {
+               err = -EINVAL;
+       }
+       if (err < 0)
+               goto out_free_dev;
 
        lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
        if (!lo->lo_queue)
@@ -1611,81 +1646,158 @@ static struct loop_device *loop_alloc(int i)
        disk->private_data      = lo;
        disk->queue             = lo->lo_queue;
        sprintf(disk->disk_name, "loop%d", i);
-       return lo;
+       add_disk(disk);
+       *l = lo;
+       return lo->lo_number;
 
 out_free_queue:
        blk_cleanup_queue(lo->lo_queue);
 out_free_dev:
        kfree(lo);
 out:
-       return NULL;
+       return err;
 }
 
-static void loop_free(struct loop_device *lo)
+static void loop_remove(struct loop_device *lo)
 {
+       del_gendisk(lo->lo_disk);
        blk_cleanup_queue(lo->lo_queue);
        put_disk(lo->lo_disk);
-       list_del(&lo->lo_list);
        kfree(lo);
 }
 
-static struct loop_device *loop_init_one(int i)
+static int find_free_cb(int id, void *ptr, void *data)
+{
+       struct loop_device *lo = ptr;
+       struct loop_device **l = data;
+
+       if (lo->lo_state == Lo_unbound) {
+               *l = lo;
+               return 1;
+       }
+       return 0;
+}
+
+static int loop_lookup(struct loop_device **l, int i)
 {
        struct loop_device *lo;
+       int ret = -ENODEV;
 
-       list_for_each_entry(lo, &loop_devices, lo_list) {
-               if (lo->lo_number == i)
-                       return lo;
+       if (i < 0) {
+               int err;
+
+               err = idr_for_each(&loop_index_idr, &find_free_cb, &lo);
+               if (err == 1) {
+                       *l = lo;
+                       ret = lo->lo_number;
+               }
+               goto out;
        }
 
-       lo = loop_alloc(i);
+       /* lookup and return a specific i */
+       lo = idr_find(&loop_index_idr, i);
        if (lo) {
-               add_disk(lo->lo_disk);
-               list_add_tail(&lo->lo_list, &loop_devices);
+               *l = lo;
+               ret = lo->lo_number;
        }
-       return lo;
-}
-
-static void loop_del_one(struct loop_device *lo)
-{
-       del_gendisk(lo->lo_disk);
-       loop_free(lo);
+out:
+       return ret;
 }
 
 static struct kobject *loop_probe(dev_t dev, int *part, void *data)
 {
        struct loop_device *lo;
        struct kobject *kobj;
+       int err;
 
-       mutex_lock(&loop_devices_mutex);
-       lo = loop_init_one(MINOR(dev) >> part_shift);
-       kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
-       mutex_unlock(&loop_devices_mutex);
+       mutex_lock(&loop_index_mutex);
+       err = loop_lookup(&lo, MINOR(dev) >> part_shift);
+       if (err < 0)
+               err = loop_add(&lo, MINOR(dev) >> part_shift);
+       if (err < 0)
+               kobj = ERR_PTR(err);
+       else
+               kobj = get_disk(lo->lo_disk);
+       mutex_unlock(&loop_index_mutex);
 
        *part = 0;
        return kobj;
 }
 
+static long loop_control_ioctl(struct file *file, unsigned int cmd,
+                              unsigned long parm)
+{
+       struct loop_device *lo;
+       int ret = -ENOSYS;
+
+       mutex_lock(&loop_index_mutex);
+       switch (cmd) {
+       case LOOP_CTL_ADD:
+               ret = loop_lookup(&lo, parm);
+               if (ret >= 0) {
+                       ret = -EEXIST;
+                       break;
+               }
+               ret = loop_add(&lo, parm);
+               break;
+       case LOOP_CTL_REMOVE:
+               ret = loop_lookup(&lo, parm);
+               if (ret < 0)
+                       break;
+               mutex_lock(&lo->lo_ctl_mutex);
+               if (lo->lo_state != Lo_unbound) {
+                       ret = -EBUSY;
+                       mutex_unlock(&lo->lo_ctl_mutex);
+                       break;
+               }
+               if (lo->lo_refcnt > 0) {
+                       ret = -EBUSY;
+                       mutex_unlock(&lo->lo_ctl_mutex);
+                       break;
+               }
+               lo->lo_disk->private_data = NULL;
+               mutex_unlock(&lo->lo_ctl_mutex);
+               idr_remove(&loop_index_idr, lo->lo_number);
+               loop_remove(lo);
+               break;
+       case LOOP_CTL_GET_FREE:
+               ret = loop_lookup(&lo, -1);
+               if (ret >= 0)
+                       break;
+               ret = loop_add(&lo, -1);
+       }
+       mutex_unlock(&loop_index_mutex);
+
+       return ret;
+}
+
+static const struct file_operations loop_ctl_fops = {
+       .open           = nonseekable_open,
+       .unlocked_ioctl = loop_control_ioctl,
+       .compat_ioctl   = loop_control_ioctl,
+       .owner          = THIS_MODULE,
+       .llseek         = noop_llseek,
+};
+
+static struct miscdevice loop_misc = {
+       .minor          = LOOP_CTRL_MINOR,
+       .name           = "loop-control",
+       .fops           = &loop_ctl_fops,
+};
+
+MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
+MODULE_ALIAS("devname:loop-control");
+
 static int __init loop_init(void)
 {
        int i, nr;
        unsigned long range;
-       struct loop_device *lo, *next;
+       struct loop_device *lo;
+       int err;
 
-       /*
-        * loop module now has a feature to instantiate underlying device
-        * structure on-demand, provided that there is an access dev node.
-        * However, this will not work well with user space tool that doesn't
-        * know about such "feature".  In order to not break any existing
-        * tool, we do the following:
-        *
-        * (1) if max_loop is specified, create that many upfront, and this
-        *     also becomes a hard limit.
-        * (2) if max_loop is not specified, create 8 loop device on module
-        *     load, user can further extend loop device by create dev node
-        *     themselves and have kernel automatically instantiate actual
-        *     device on-demand.
-        */
+       err = misc_register(&loop_misc);
+       if (err < 0)
+               return err;
 
        part_shift = 0;
        if (max_part > 0) {
@@ -1708,57 +1820,60 @@ static int __init loop_init(void)
        if (max_loop > 1UL << (MINORBITS - part_shift))
                return -EINVAL;
 
+       /*
+        * If max_loop is specified, create that many devices upfront.
+        * This also becomes a hard limit. If max_loop is not specified,
+        * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+        * init time. Loop devices can be requested on-demand with the
+        * /dev/loop-control interface, or be instantiated by accessing
+        * a 'dead' device node.
+        */
        if (max_loop) {
                nr = max_loop;
                range = max_loop << part_shift;
        } else {
-               nr = 8;
+               nr = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
                range = 1UL << MINORBITS;
        }
 
        if (register_blkdev(LOOP_MAJOR, "loop"))
                return -EIO;
 
-       for (i = 0; i < nr; i++) {
-               lo = loop_alloc(i);
-               if (!lo)
-                       goto Enomem;
-               list_add_tail(&lo->lo_list, &loop_devices);
-       }
-
-       /* point of no return */
-
-       list_for_each_entry(lo, &loop_devices, lo_list)
-               add_disk(lo->lo_disk);
-
        blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
                                  THIS_MODULE, loop_probe, NULL, NULL);
 
+       /* pre-create number of devices given by config or max_loop */
+       mutex_lock(&loop_index_mutex);
+       for (i = 0; i < nr; i++)
+               loop_add(&lo, i);
+       mutex_unlock(&loop_index_mutex);
+
        printk(KERN_INFO "loop: module loaded\n");
        return 0;
+}
 
-Enomem:
-       printk(KERN_INFO "loop: out of memory\n");
-
-       list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-               loop_free(lo);
+static int loop_exit_cb(int id, void *ptr, void *data)
+{
+       struct loop_device *lo = ptr;
 
-       unregister_blkdev(LOOP_MAJOR, "loop");
-       return -ENOMEM;
+       loop_remove(lo);
+       return 0;
 }
 
 static void __exit loop_exit(void)
 {
        unsigned long range;
-       struct loop_device *lo, *next;
 
        range = max_loop ? max_loop << part_shift : 1UL << MINORBITS;
 
-       list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
-               loop_del_one(lo);
+       idr_for_each(&loop_index_idr, &loop_exit_cb, NULL);
+       idr_remove_all(&loop_index_idr);
+       idr_destroy(&loop_index_idr);
 
        blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
        unregister_blkdev(LOOP_MAJOR, "loop");
+
+       misc_deregister(&loop_misc);
 }
 
 module_init(loop_init);
index 773bfa7..ae3e167 100644 (file)
@@ -1184,6 +1184,7 @@ static struct of_device_id swim3_match[] =
        {
        .compatible     = "swim3"
        },
+       { /* end of list */ }
 };
 
 static struct macio_driver swim3_driver =
index b536a9c..9ea8c25 100644 (file)
@@ -123,8 +123,8 @@ static DEFINE_SPINLOCK(minor_lock);
 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
 #define EMULATED_HD_DISK_MINOR_OFFSET (0)
 #define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
-#define EMULATED_SD_DISK_MINOR_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET + (4 * 16))
-#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_HD_DISK_NAME_OFFSET + 4)
+#define EMULATED_SD_DISK_MINOR_OFFSET (0)
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
 
 #define DEV_NAME       "xvd"   /* name in /dev */
 
@@ -529,7 +529,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
                minor = BLKIF_MINOR_EXT(info->vdevice);
                nr_parts = PARTS_PER_EXT_DISK;
                offset = minor / nr_parts;
-               if (xen_hvm_domain() && offset <= EMULATED_HD_DISK_NAME_OFFSET + 4)
+               if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
                        printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
                                        "emulated IDE disks,\n\t choose an xvd device name"
                                        "from xvde on\n", info->vdevice);
index 75fb965..f997c27 100644 (file)
@@ -1929,11 +1929,17 @@ static int dvd_read_manufact(struct cdrom_device_info *cdi, dvd_struct *s,
                goto out;
 
        s->manufact.len = buf[0] << 8 | buf[1];
-       if (s->manufact.len < 0 || s->manufact.len > 2048) {
+       if (s->manufact.len < 0) {
                cdinfo(CD_WARNING, "Received invalid manufacture info length"
                                   " (%d)\n", s->manufact.len);
                ret = -EIO;
        } else {
+               if (s->manufact.len > 2048) {
+                       cdinfo(CD_WARNING, "Received invalid manufacture info "
+                                       "length (%d): truncating to 2048\n",
+                                       s->manufact.len);
+                       s->manufact.len = 2048;
+               }
                memcpy(s->manufact.value, &buf[4], s->manufact.len);
        }
 
index 04f1e7c..f6cf448 100644 (file)
@@ -1670,7 +1670,7 @@ static void i7core_mce_output_error(struct mem_ctl_info *mci,
        char *type, *optype, *err, *msg;
        unsigned long error = m->status & 0x1ff0000l;
        u32 optypenum = (m->status >> 4) & 0x07;
-       u32 core_err_cnt = (m->status >> 38) && 0x7fff;
+       u32 core_err_cnt = (m->status >> 38) & 0x7fff;
        u32 dimm = (m->misc >> 16) & 0x3;
        u32 channel = (m->misc >> 18) & 0x3;
        u32 syndrome = m->misc >> 32;
index e6ad3bb..4799393 100644 (file)
@@ -216,15 +216,33 @@ struct inbound_phy_packet_event {
        struct fw_cdev_event_phy_packet phy_packet;
 };
 
-static inline void __user *u64_to_uptr(__u64 value)
+#ifdef CONFIG_COMPAT
+static void __user *u64_to_uptr(u64 value)
+{
+       if (is_compat_task())
+               return compat_ptr(value);
+       else
+               return (void __user *)(unsigned long)value;
+}
+
+static u64 uptr_to_u64(void __user *ptr)
+{
+       if (is_compat_task())
+               return ptr_to_compat(ptr);
+       else
+               return (u64)(unsigned long)ptr;
+}
+#else
+static inline void __user *u64_to_uptr(u64 value)
 {
        return (void __user *)(unsigned long)value;
 }
 
-static inline __u64 uptr_to_u64(void __user *ptr)
+static inline u64 uptr_to_u64(void __user *ptr)
 {
-       return (__u64)(unsigned long)ptr;
+       return (u64)(unsigned long)ptr;
 }
+#endif /* CONFIG_COMPAT */
 
 static int fw_device_op_open(struct inode *inode, struct file *file)
 {
index 8ba7f79..f3b890d 100644 (file)
@@ -455,15 +455,20 @@ static struct device_attribute fw_device_attributes[] = {
 static int read_rom(struct fw_device *device,
                    int generation, int index, u32 *data)
 {
-       int rcode;
+       u64 offset = (CSR_REGISTER_BASE | CSR_CONFIG_ROM) + index * 4;
+       int i, rcode;
 
        /* device->node_id, accessed below, must not be older than generation */
        smp_rmb();
 
-       rcode = fw_run_transaction(device->card, TCODE_READ_QUADLET_REQUEST,
-                       device->node_id, generation, device->max_speed,
-                       (CSR_REGISTER_BASE | CSR_CONFIG_ROM) + index * 4,
-                       data, 4);
+       for (i = 10; i < 100; i += 10) {
+               rcode = fw_run_transaction(device->card,
+                               TCODE_READ_QUADLET_REQUEST, device->node_id,
+                               generation, device->max_speed, offset, data, 4);
+               if (rcode != RCODE_BUSY)
+                       break;
+               msleep(i);
+       }
        be32_to_cpus(data);
 
        return rcode;
index bcf792f..57cd3a4 100644 (file)
@@ -2179,8 +2179,13 @@ static int ohci_enable(struct fw_card *card,
                        ohci_driver_name, ohci)) {
                fw_error("Failed to allocate interrupt %d.\n", dev->irq);
                pci_disable_msi(dev);
-               dma_free_coherent(ohci->card.device, CONFIG_ROM_SIZE,
-                                 ohci->config_rom, ohci->config_rom_bus);
+
+               if (config_rom) {
+                       dma_free_coherent(ohci->card.device, CONFIG_ROM_SIZE,
+                                         ohci->next_config_rom,
+                                         ohci->next_config_rom_bus);
+                       ohci->next_config_rom = NULL;
+               }
                return -EIO;
        }
 
index a8ab626..3c395a5 100644 (file)
@@ -499,7 +499,7 @@ static int i915_interrupt_info(struct seq_file *m, void *data)
        seq_printf(m, "Interrupts received: %d\n",
                   atomic_read(&dev_priv->irq_received));
        for (i = 0; i < I915_NUM_RINGS; i++) {
-               if (IS_GEN6(dev)) {
+               if (IS_GEN6(dev) || IS_GEN7(dev)) {
                        seq_printf(m, "Graphics Interrupt mask (%s):    %08x\n",
                                   dev_priv->ring[i].name,
                                   I915_READ_IMR(&dev_priv->ring[i]));
index feb4f16..7916bd9 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/io-mapping.h>
 #include <linux/i2c.h>
 #include <drm/intel-gtt.h>
+#include <linux/backlight.h>
 
 /* General customization:
  */
@@ -690,6 +691,7 @@ typedef struct drm_i915_private {
        int child_dev_num;
        struct child_device_config *child_dev;
        struct drm_connector *int_lvds_connector;
+       struct drm_connector *int_edp_connector;
 
        bool mchbar_need_disable;
 
@@ -723,6 +725,8 @@ typedef struct drm_i915_private {
        /* list of fbdev register on this device */
        struct intel_fbdev *fbdev;
 
+       struct backlight_device *backlight;
+
        struct drm_property *broadcast_rgb_property;
        struct drm_property *force_audio_property;
 
index 02f96fd..9cbb0cd 100644 (file)
@@ -2058,8 +2058,10 @@ void intel_irq_init(struct drm_device *dev)
                dev->driver->get_vblank_counter = gm45_get_vblank_counter;
        }
 
-
-       dev->driver->get_vblank_timestamp = i915_get_vblank_timestamp;
+       if (drm_core_check_feature(dev, DRIVER_MODESET))
+               dev->driver->get_vblank_timestamp = i915_get_vblank_timestamp;
+       else
+               dev->driver->get_vblank_timestamp = NULL;
        dev->driver->get_scanout_position = i915_get_crtc_scanoutpos;
 
        if (IS_IVYBRIDGE(dev)) {
index d1331f7..542453f 100644 (file)
 # define MI_FLUSH_ENABLE                               (1 << 11)
 
 #define GFX_MODE       0x02520
+#define GFX_MODE_GEN7  0x0229c
 #define   GFX_RUN_LIST_ENABLE          (1<<15)
 #define   GFX_TLB_INVALIDATE_ALWAYS    (1<<13)
 #define   GFX_SURFACE_FAULT_ENABLE     (1<<12)
 #define   GFX_PSMI_GRANULARITY         (1<<10)
 #define   GFX_PPGTT_ENABLE             (1<<9)
 
+#define GFX_MODE_ENABLE(bit) (((bit) << 16) | (bit))
+#define GFX_MODE_DISABLE(bit) (((bit) << 16) | (0))
+
 #define SCPD0          0x0209c /* 915+ only */
 #define IER            0x020a0
 #define IIR            0x020a4
 #define   ADPA_PIPE_SELECT_MASK        (1<<30)
 #define   ADPA_PIPE_A_SELECT   0
 #define   ADPA_PIPE_B_SELECT   (1<<30)
+#define   ADPA_PIPE_SELECT(pipe) ((pipe) << 30)
 #define   ADPA_USE_VGA_HVPOLARITY (1<<15)
 #define   ADPA_SETS_HVPOLARITY 0
 #define   ADPA_VSYNC_CNTL_DISABLE (1<<11)
 /* Selects pipe B for LVDS data.  Must be set on pre-965. */
 #define   LVDS_PIPEB_SELECT            (1 << 30)
 #define   LVDS_PIPE_MASK               (1 << 30)
+#define   LVDS_PIPE(pipe)              ((pipe) << 30)
 /* LVDS dithering flag on 965/g4x platform */
 #define   LVDS_ENABLE_DITHER           (1 << 25)
 /* LVDS sync polarity flags. Set to invert (i.e. negative) */
 #define   LVDS_B0B3_POWER_DOWN         (0 << 2)
 #define   LVDS_B0B3_POWER_UP           (3 << 2)
 
-#define LVDS_PIPE_ENABLED(V, P) \
-       (((V) & (LVDS_PIPE_MASK | LVDS_PORT_EN)) == ((P) << 30 | LVDS_PORT_EN))
-
 /* Video Data Island Packet control */
 #define VIDEO_DIP_DATA         0x61178
 #define VIDEO_DIP_CTL          0x61170
 #define  ADPA_CRT_HOTPLUG_VOLREF_475MV  (1<<17)
 #define  ADPA_CRT_HOTPLUG_FORCE_TRIGGER (1<<16)
 
-#define ADPA_PIPE_ENABLED(V, P) \
-       (((V) & (ADPA_TRANS_SELECT_MASK | ADPA_DAC_ENABLE)) == ((P) << 30 | ADPA_DAC_ENABLE))
-
 /* or SDVOB */
 #define HDMIB   0xe1140
 #define  PORT_ENABLE    (1 << 31)
 #define  TRANSCODER_A   (0)
 #define  TRANSCODER_B   (1 << 30)
+#define  TRANSCODER(pipe)      ((pipe) << 30)
 #define  TRANSCODER_MASK   (1 << 30)
 #define  COLOR_FORMAT_8bpc      (0)
 #define  COLOR_FORMAT_12bpc     (3 << 26)
 #define  HSYNC_ACTIVE_HIGH      (1 << 3)
 #define  PORT_DETECTED          (1 << 2)
 
-#define HDMI_PIPE_ENABLED(V, P) \
-       (((V) & (TRANSCODER_MASK | PORT_ENABLE)) == ((P) << 30 | PORT_ENABLE))
-
 /* PCH SDVOB multiplex with HDMIB */
 #define PCH_SDVOB      HDMIB
 
 #define  PORT_TRANS_B_SEL_CPT  (1<<29)
 #define  PORT_TRANS_C_SEL_CPT  (2<<29)
 #define  PORT_TRANS_SEL_MASK   (3<<29)
+#define  PORT_TRANS_SEL_CPT(pipe)      ((pipe) << 29)
 
 #define TRANS_DP_CTL_A         0xe0300
 #define TRANS_DP_CTL_B         0xe1300
index 87677d6..f107423 100644 (file)
@@ -871,7 +871,8 @@ int i915_restore_state(struct drm_device *dev)
        }
        mutex_unlock(&dev->struct_mutex);
 
-       intel_init_clock_gating(dev);
+       if (drm_core_check_feature(dev, DRIVER_MODESET))
+               intel_init_clock_gating(dev);
 
        if (IS_IRONLAKE_M(dev)) {
                ironlake_enable_drps(dev);
index 35364e6..ee1d701 100644 (file)
@@ -980,8 +980,8 @@ static void assert_transcoder_disabled(struct drm_i915_private *dev_priv,
             pipe_name(pipe));
 }
 
-static bool dp_pipe_enabled(struct drm_i915_private *dev_priv, enum pipe pipe,
-                           int reg, u32 port_sel, u32 val)
+static bool dp_pipe_enabled(struct drm_i915_private *dev_priv,
+                           enum pipe pipe, u32 port_sel, u32 val)
 {
        if ((val & DP_PORT_EN) == 0)
                return false;
@@ -998,11 +998,58 @@ static bool dp_pipe_enabled(struct drm_i915_private *dev_priv, enum pipe pipe,
        return true;
 }
 
+static bool hdmi_pipe_enabled(struct drm_i915_private *dev_priv,
+                             enum pipe pipe, u32 val)
+{
+       if ((val & PORT_ENABLE) == 0)
+               return false;
+
+       if (HAS_PCH_CPT(dev_priv->dev)) {
+               if ((val & PORT_TRANS_SEL_MASK) != PORT_TRANS_SEL_CPT(pipe))
+                       return false;
+       } else {
+               if ((val & TRANSCODER_MASK) != TRANSCODER(pipe))
+                       return false;
+       }
+       return true;
+}
+
+static bool lvds_pipe_enabled(struct drm_i915_private *dev_priv,
+                             enum pipe pipe, u32 val)
+{
+       if ((val & LVDS_PORT_EN) == 0)
+               return false;
+
+       if (HAS_PCH_CPT(dev_priv->dev)) {
+               if ((val & PORT_TRANS_SEL_MASK) != PORT_TRANS_SEL_CPT(pipe))
+                       return false;
+       } else {
+               if ((val & LVDS_PIPE_MASK) != LVDS_PIPE(pipe))
+                       return false;
+       }
+       return true;
+}
+
+static bool adpa_pipe_enabled(struct drm_i915_private *dev_priv,
+                             enum pipe pipe, u32 val)
+{
+       if ((val & ADPA_DAC_ENABLE) == 0)
+               return false;
+       if (HAS_PCH_CPT(dev_priv->dev)) {
+               if ((val & PORT_TRANS_SEL_MASK) != PORT_TRANS_SEL_CPT(pipe))
+                       return false;
+       } else {
+               if ((val & ADPA_PIPE_SELECT_MASK) != ADPA_PIPE_SELECT(pipe))
+                       return false;
+       }
+       return true;
+}
+
 static void assert_pch_dp_disabled(struct drm_i915_private *dev_priv,
                                   enum pipe pipe, int reg, u32 port_sel)
 {
        u32 val = I915_READ(reg);
-       WARN(dp_pipe_enabled(dev_priv, pipe, reg, port_sel, val),
+       WARN(dp_pipe_enabled(dev_priv, pipe, port_sel, val),
             "PCH DP (0x%08x) enabled on transcoder %c, should be disabled\n",
             reg, pipe_name(pipe));
 }
@@ -1011,7 +1058,7 @@ static void assert_pch_hdmi_disabled(struct drm_i915_private *dev_priv,
                                     enum pipe pipe, int reg)
 {
        u32 val = I915_READ(reg);
-       WARN(HDMI_PIPE_ENABLED(val, pipe),
+       WARN(hdmi_pipe_enabled(dev_priv, val, pipe),
             "PCH DP (0x%08x) enabled on transcoder %c, should be disabled\n",
             reg, pipe_name(pipe));
 }
@@ -1028,13 +1075,13 @@ static void assert_pch_ports_disabled(struct drm_i915_private *dev_priv,
 
        reg = PCH_ADPA;
        val = I915_READ(reg);
-       WARN(ADPA_PIPE_ENABLED(val, pipe),
+       WARN(adpa_pipe_enabled(dev_priv, val, pipe),
             "PCH VGA enabled on transcoder %c, should be disabled\n",
             pipe_name(pipe));
 
        reg = PCH_LVDS;
        val = I915_READ(reg);
-       WARN(LVDS_PIPE_ENABLED(val, pipe),
+       WARN(lvds_pipe_enabled(dev_priv, val, pipe),
             "PCH LVDS enabled on transcoder %c, should be disabled\n",
             pipe_name(pipe));
 
@@ -1360,7 +1407,7 @@ static void disable_pch_dp(struct drm_i915_private *dev_priv,
                           enum pipe pipe, int reg, u32 port_sel)
 {
        u32 val = I915_READ(reg);
-       if (dp_pipe_enabled(dev_priv, pipe, reg, port_sel, val)) {
+       if (dp_pipe_enabled(dev_priv, pipe, port_sel, val)) {
                DRM_DEBUG_KMS("Disabling pch dp %x on pipe %d\n", reg, pipe);
                I915_WRITE(reg, val & ~DP_PORT_EN);
        }
@@ -1370,7 +1417,7 @@ static void disable_pch_hdmi(struct drm_i915_private *dev_priv,
                             enum pipe pipe, int reg)
 {
        u32 val = I915_READ(reg);
-       if (HDMI_PIPE_ENABLED(val, pipe)) {
+       if (hdmi_pipe_enabled(dev_priv, val, pipe)) {
                DRM_DEBUG_KMS("Disabling pch HDMI %x on pipe %d\n",
                              reg, pipe);
                I915_WRITE(reg, val & ~PORT_ENABLE);
@@ -1392,12 +1439,13 @@ static void intel_disable_pch_ports(struct drm_i915_private *dev_priv,
 
        reg = PCH_ADPA;
        val = I915_READ(reg);
-       if (ADPA_PIPE_ENABLED(val, pipe))
+       if (adpa_pipe_enabled(dev_priv, val, pipe))
                I915_WRITE(reg, val & ~ADPA_DAC_ENABLE);
 
        reg = PCH_LVDS;
        val = I915_READ(reg);
-       if (LVDS_PIPE_ENABLED(val, pipe)) {
+       if (lvds_pipe_enabled(dev_priv, val, pipe)) {
+               DRM_DEBUG_KMS("disable lvds on pipe %d val 0x%08x\n", pipe, val);
                I915_WRITE(reg, val & ~LVDS_PORT_EN);
                POSTING_READ(reg);
                udelay(100);
@@ -5049,6 +5097,81 @@ static int i9xx_crtc_mode_set(struct drm_crtc *crtc,
        return ret;
 }
 
+static void ironlake_update_pch_refclk(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       struct drm_mode_config *mode_config = &dev->mode_config;
+       struct drm_crtc *crtc;
+       struct intel_encoder *encoder;
+       struct intel_encoder *has_edp_encoder = NULL;
+       u32 temp;
+       bool has_lvds = false;
+
+       /* We need to take the global config into account */
+       list_for_each_entry(crtc, &mode_config->crtc_list, head) {
+               if (!crtc->enabled)
+                       continue;
+
+               list_for_each_entry(encoder, &mode_config->encoder_list,
+                                   base.head) {
+                       if (encoder->base.crtc != crtc)
+                               continue;
+
+                       switch (encoder->type) {
+                       case INTEL_OUTPUT_LVDS:
+                               has_lvds = true;
+                       case INTEL_OUTPUT_EDP:
+                               has_edp_encoder = encoder;
+                               break;
+                       }
+               }
+       }
+
+       /* Ironlake: try to setup display ref clock before DPLL
+        * enabling. This is only under driver's control after
+        * PCH B stepping, previous chipset stepping should be
+        * ignoring this setting.
+        */
+       temp = I915_READ(PCH_DREF_CONTROL);
+       /* Always enable nonspread source */
+       temp &= ~DREF_NONSPREAD_SOURCE_MASK;
+       temp |= DREF_NONSPREAD_SOURCE_ENABLE;
+       temp &= ~DREF_SSC_SOURCE_MASK;
+       temp |= DREF_SSC_SOURCE_ENABLE;
+       I915_WRITE(PCH_DREF_CONTROL, temp);
+
+       POSTING_READ(PCH_DREF_CONTROL);
+       udelay(200);
+
+       if (has_edp_encoder) {
+               if (intel_panel_use_ssc(dev_priv)) {
+                       temp |= DREF_SSC1_ENABLE;
+                       I915_WRITE(PCH_DREF_CONTROL, temp);
+
+                       POSTING_READ(PCH_DREF_CONTROL);
+                       udelay(200);
+               }
+               temp &= ~DREF_CPU_SOURCE_OUTPUT_MASK;
+
+               /* Enable CPU source on CPU attached eDP */
+               if (!intel_encoder_is_pch_edp(&has_edp_encoder->base)) {
+                       if (intel_panel_use_ssc(dev_priv))
+                               temp |= DREF_CPU_SOURCE_OUTPUT_DOWNSPREAD;
+                       else
+                               temp |= DREF_CPU_SOURCE_OUTPUT_NONSPREAD;
+               } else {
+                       /* Enable SSC on PCH eDP if needed */
+                       if (intel_panel_use_ssc(dev_priv)) {
+                               DRM_ERROR("enabling SSC on PCH\n");
+                               temp |= DREF_SUPERSPREAD_SOURCE_ENABLE;
+                       }
+               }
+               I915_WRITE(PCH_DREF_CONTROL, temp);
+               POSTING_READ(PCH_DREF_CONTROL);
+               udelay(200);
+       }
+}
+
 static int ironlake_crtc_mode_set(struct drm_crtc *crtc,
                                  struct drm_display_mode *mode,
                                  struct drm_display_mode *adjusted_mode,
@@ -5244,49 +5367,7 @@ static int ironlake_crtc_mode_set(struct drm_crtc *crtc,
        ironlake_compute_m_n(intel_crtc->bpp, lane, target_clock, link_bw,
                             &m_n);
 
-       /* Ironlake: try to setup display ref clock before DPLL
-        * enabling. This is only under driver's control after
-        * PCH B stepping, previous chipset stepping should be
-        * ignoring this setting.
-        */
-       temp = I915_READ(PCH_DREF_CONTROL);
-       /* Always enable nonspread source */
-       temp &= ~DREF_NONSPREAD_SOURCE_MASK;
-       temp |= DREF_NONSPREAD_SOURCE_ENABLE;
-       temp &= ~DREF_SSC_SOURCE_MASK;
-       temp |= DREF_SSC_SOURCE_ENABLE;
-       I915_WRITE(PCH_DREF_CONTROL, temp);
-
-       POSTING_READ(PCH_DREF_CONTROL);
-       udelay(200);
-
-       if (has_edp_encoder) {
-               if (intel_panel_use_ssc(dev_priv)) {
-                       temp |= DREF_SSC1_ENABLE;
-                       I915_WRITE(PCH_DREF_CONTROL, temp);
-
-                       POSTING_READ(PCH_DREF_CONTROL);
-                       udelay(200);
-               }
-               temp &= ~DREF_CPU_SOURCE_OUTPUT_MASK;
-
-               /* Enable CPU source on CPU attached eDP */
-               if (!intel_encoder_is_pch_edp(&has_edp_encoder->base)) {
-                       if (intel_panel_use_ssc(dev_priv))
-                               temp |= DREF_CPU_SOURCE_OUTPUT_DOWNSPREAD;
-                       else
-                               temp |= DREF_CPU_SOURCE_OUTPUT_NONSPREAD;
-               } else {
-                       /* Enable SSC on PCH eDP if needed */
-                       if (intel_panel_use_ssc(dev_priv)) {
-                               DRM_ERROR("enabling SSC on PCH\n");
-                               temp |= DREF_SUPERSPREAD_SOURCE_ENABLE;
-                       }
-               }
-               I915_WRITE(PCH_DREF_CONTROL, temp);
-               POSTING_READ(PCH_DREF_CONTROL);
-               udelay(200);
-       }
+       ironlake_update_pch_refclk(dev);
 
        fp = clock.n << 16 | clock.m1 << 8 | clock.m2;
        if (has_reduced_clock)
index 0feae90..44fef5e 100644 (file)
@@ -1841,6 +1841,11 @@ done:
 static void
 intel_dp_destroy (struct drm_connector *connector)
 {
+       struct drm_device *dev = connector->dev;
+
+       if (intel_dpd_is_edp(dev))
+               intel_panel_destroy_backlight(dev);
+
        drm_sysfs_connector_remove(connector);
        drm_connector_cleanup(connector);
        kfree(connector);
@@ -2072,6 +2077,8 @@ intel_dp_init(struct drm_device *dev, int output_reg)
                                        DRM_MODE_TYPE_PREFERRED;
                        }
                }
+               dev_priv->int_edp_connector = connector;
+               intel_panel_setup_backlight(dev);
        }
 
        intel_dp_add_properties(intel_dp, connector);
index 7b330e7..0b2ee9d 100644 (file)
@@ -297,9 +297,10 @@ extern void intel_pch_panel_fitting(struct drm_device *dev,
 extern u32 intel_panel_get_max_backlight(struct drm_device *dev);
 extern u32 intel_panel_get_backlight(struct drm_device *dev);
 extern void intel_panel_set_backlight(struct drm_device *dev, u32 level);
-extern void intel_panel_setup_backlight(struct drm_device *dev);
+extern int intel_panel_setup_backlight(struct drm_device *dev);
 extern void intel_panel_enable_backlight(struct drm_device *dev);
 extern void intel_panel_disable_backlight(struct drm_device *dev);
+extern void intel_panel_destroy_backlight(struct drm_device *dev);
 extern enum drm_connector_status intel_panel_detect(struct drm_device *dev);
 
 extern void intel_crtc_load_lut(struct drm_crtc *crtc);
index 2e8ddfc..31da77f 100644 (file)
@@ -72,14 +72,16 @@ static void intel_lvds_enable(struct intel_lvds *intel_lvds)
 {
        struct drm_device *dev = intel_lvds->base.base.dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
-       u32 ctl_reg, lvds_reg;
+       u32 ctl_reg, lvds_reg, stat_reg;
 
        if (HAS_PCH_SPLIT(dev)) {
                ctl_reg = PCH_PP_CONTROL;
                lvds_reg = PCH_LVDS;
+               stat_reg = PCH_PP_STATUS;
        } else {
                ctl_reg = PP_CONTROL;
                lvds_reg = LVDS;
+               stat_reg = PP_STATUS;
        }
 
        I915_WRITE(lvds_reg, I915_READ(lvds_reg) | LVDS_PORT_EN);
@@ -94,17 +96,16 @@ static void intel_lvds_enable(struct intel_lvds *intel_lvds)
                DRM_DEBUG_KMS("applying panel-fitter: %x, %x\n",
                              intel_lvds->pfit_control,
                              intel_lvds->pfit_pgm_ratios);
-               if (wait_for((I915_READ(PP_STATUS) & PP_ON) == 0, 1000)) {
-                       DRM_ERROR("timed out waiting for panel to power off\n");
-               } else {
-                       I915_WRITE(PFIT_PGM_RATIOS, intel_lvds->pfit_pgm_ratios);
-                       I915_WRITE(PFIT_CONTROL, intel_lvds->pfit_control);
-                       intel_lvds->pfit_dirty = false;
-               }
+
+               I915_WRITE(PFIT_PGM_RATIOS, intel_lvds->pfit_pgm_ratios);
+               I915_WRITE(PFIT_CONTROL, intel_lvds->pfit_control);
+               intel_lvds->pfit_dirty = false;
        }
 
        I915_WRITE(ctl_reg, I915_READ(ctl_reg) | POWER_TARGET_ON);
        POSTING_READ(lvds_reg);
+       if (wait_for((I915_READ(stat_reg) & PP_ON) != 0, 1000))
+               DRM_ERROR("timed out waiting for panel to power on\n");
 
        intel_panel_enable_backlight(dev);
 }
@@ -113,24 +114,25 @@ static void intel_lvds_disable(struct intel_lvds *intel_lvds)
 {
        struct drm_device *dev = intel_lvds->base.base.dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
-       u32 ctl_reg, lvds_reg;
+       u32 ctl_reg, lvds_reg, stat_reg;
 
        if (HAS_PCH_SPLIT(dev)) {
                ctl_reg = PCH_PP_CONTROL;
                lvds_reg = PCH_LVDS;
+               stat_reg = PCH_PP_STATUS;
        } else {
                ctl_reg = PP_CONTROL;
                lvds_reg = LVDS;
+               stat_reg = PP_STATUS;
        }
 
        intel_panel_disable_backlight(dev);
 
        I915_WRITE(ctl_reg, I915_READ(ctl_reg) & ~POWER_TARGET_ON);
+       if (wait_for((I915_READ(stat_reg) & PP_ON) == 0, 1000))
+               DRM_ERROR("timed out waiting for panel to power off\n");
 
        if (intel_lvds->pfit_control) {
-               if (wait_for((I915_READ(PP_STATUS) & PP_ON) == 0, 1000))
-                       DRM_ERROR("timed out waiting for panel to power off\n");
-
                I915_WRITE(PFIT_CONTROL, 0);
                intel_lvds->pfit_dirty = true;
        }
@@ -398,53 +400,21 @@ out:
 
 static void intel_lvds_prepare(struct drm_encoder *encoder)
 {
-       struct drm_device *dev = encoder->dev;
-       struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_lvds *intel_lvds = to_intel_lvds(encoder);
 
-       /* We try to do the minimum that is necessary in order to unlock
-        * the registers for mode setting.
-        *
-        * On Ironlake, this is quite simple as we just set the unlock key
-        * and ignore all subtleties. (This may cause some issues...)
-        *
+       /*
         * Prior to Ironlake, we must disable the pipe if we want to adjust
         * the panel fitter. However at all other times we can just reset
         * the registers regardless.
         */
-
-       if (HAS_PCH_SPLIT(dev)) {
-               I915_WRITE(PCH_PP_CONTROL,
-                          I915_READ(PCH_PP_CONTROL) | PANEL_UNLOCK_REGS);
-       } else if (intel_lvds->pfit_dirty) {
-               I915_WRITE(PP_CONTROL,
-                          (I915_READ(PP_CONTROL) | PANEL_UNLOCK_REGS)
-                          & ~POWER_TARGET_ON);
-       } else {
-               I915_WRITE(PP_CONTROL,
-                          I915_READ(PP_CONTROL) | PANEL_UNLOCK_REGS);
-       }
+       if (!HAS_PCH_SPLIT(encoder->dev) && intel_lvds->pfit_dirty)
+               intel_lvds_disable(intel_lvds);
 }
 
 static void intel_lvds_commit(struct drm_encoder *encoder)
 {
-       struct drm_device *dev = encoder->dev;
-       struct drm_i915_private *dev_priv = dev->dev_private;
        struct intel_lvds *intel_lvds = to_intel_lvds(encoder);
 
-       /* Undo any unlocking done in prepare to prevent accidental
-        * adjustment of the registers.
-        */
-       if (HAS_PCH_SPLIT(dev)) {
-               u32 val = I915_READ(PCH_PP_CONTROL);
-               if ((val & PANEL_UNLOCK_REGS) == PANEL_UNLOCK_REGS)
-                       I915_WRITE(PCH_PP_CONTROL, val & 0x3);
-       } else {
-               u32 val = I915_READ(PP_CONTROL);
-               if ((val & PANEL_UNLOCK_REGS) == PANEL_UNLOCK_REGS)
-                       I915_WRITE(PP_CONTROL, val & 0x3);
-       }
-
        /* Always do a full power on as we do not know what state
         * we were left in.
         */
@@ -582,6 +552,8 @@ static void intel_lvds_destroy(struct drm_connector *connector)
        struct drm_device *dev = connector->dev;
        struct drm_i915_private *dev_priv = dev->dev_private;
 
+       intel_panel_destroy_backlight(dev);
+
        if (dev_priv->lid_notifier.notifier_call)
                acpi_lid_notifier_unregister(&dev_priv->lid_notifier);
        drm_sysfs_connector_remove(connector);
@@ -1040,6 +1012,19 @@ out:
                pwm = I915_READ(BLC_PWM_PCH_CTL1);
                pwm |= PWM_PCH_ENABLE;
                I915_WRITE(BLC_PWM_PCH_CTL1, pwm);
+               /*
+                * Unlock registers and just
+                * leave them unlocked
+                */
+               I915_WRITE(PCH_PP_CONTROL,
+                          I915_READ(PCH_PP_CONTROL) | PANEL_UNLOCK_REGS);
+       } else {
+               /*
+                * Unlock registers and just
+                * leave them unlocked
+                */
+               I915_WRITE(PP_CONTROL,
+                          I915_READ(PP_CONTROL) | PANEL_UNLOCK_REGS);
        }
        dev_priv->lid_notifier.notifier_call = intel_lid_notify;
        if (acpi_lid_notifier_register(&dev_priv->lid_notifier)) {
@@ -1049,6 +1034,9 @@ out:
        /* keep the LVDS connector */
        dev_priv->int_lvds_connector = connector;
        drm_sysfs_connector_add(connector);
+
+       intel_panel_setup_backlight(dev);
+
        return true;
 
 failed:
index b7c5ddb..b8e8158 100644 (file)
@@ -227,7 +227,6 @@ void intel_opregion_asle_intr(struct drm_device *dev)
        asle->aslc = asle_stat;
 }
 
-/* Only present on Ironlake+ */
 void intel_opregion_gse_intr(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
index 05f500c..a9e0c7b 100644 (file)
@@ -277,7 +277,7 @@ void intel_panel_enable_backlight(struct drm_device *dev)
        dev_priv->backlight_enabled = true;
 }
 
-void intel_panel_setup_backlight(struct drm_device *dev)
+static void intel_panel_init_backlight(struct drm_device *dev)
 {
        struct drm_i915_private *dev_priv = dev->dev_private;
 
@@ -309,3 +309,73 @@ intel_panel_detect(struct drm_device *dev)
 
        return connector_status_unknown;
 }
+
+#ifdef CONFIG_BACKLIGHT_CLASS_DEVICE
+static int intel_panel_update_status(struct backlight_device *bd)
+{
+       struct drm_device *dev = bl_get_data(bd);
+       intel_panel_set_backlight(dev, bd->props.brightness);
+       return 0;
+}
+
+static int intel_panel_get_brightness(struct backlight_device *bd)
+{
+       struct drm_device *dev = bl_get_data(bd);
+       return intel_panel_get_backlight(dev);
+}
+
+static const struct backlight_ops intel_panel_bl_ops = {
+       .update_status = intel_panel_update_status,
+       .get_brightness = intel_panel_get_brightness,
+};
+
+int intel_panel_setup_backlight(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       struct backlight_properties props;
+       struct drm_connector *connector;
+
+       intel_panel_init_backlight(dev);
+
+       if (dev_priv->int_lvds_connector)
+               connector = dev_priv->int_lvds_connector;
+       else if (dev_priv->int_edp_connector)
+               connector = dev_priv->int_edp_connector;
+       else
+               return -ENODEV;
+
+       props.type = BACKLIGHT_RAW;
+       props.max_brightness = intel_panel_get_max_backlight(dev);
+       dev_priv->backlight =
+               backlight_device_register("intel_backlight",
+                                         &connector->kdev, dev,
+                                         &intel_panel_bl_ops, &props);
+
+       if (IS_ERR(dev_priv->backlight)) {
+               DRM_ERROR("Failed to register backlight: %ld\n",
+                         PTR_ERR(dev_priv->backlight));
+               dev_priv->backlight = NULL;
+               return -ENODEV;
+       }
+       dev_priv->backlight->props.brightness = intel_panel_get_backlight(dev);
+       return 0;
+}
+
+void intel_panel_destroy_backlight(struct drm_device *dev)
+{
+       struct drm_i915_private *dev_priv = dev->dev_private;
+       if (dev_priv->backlight)
+               backlight_device_unregister(dev_priv->backlight);
+}
+#else
+int intel_panel_setup_backlight(struct drm_device *dev)
+{
+       intel_panel_init_backlight(dev);
+       return 0;
+}
+
+void intel_panel_destroy_backlight(struct drm_device *dev)
+{
+       return;
+}
+#endif
index 47b9b27..c30626e 100644 (file)
@@ -290,6 +290,10 @@ static int init_render_ring(struct intel_ring_buffer *ring)
                if (IS_GEN6(dev) || IS_GEN7(dev))
                        mode |= MI_FLUSH_ENABLE << 16 | MI_FLUSH_ENABLE;
                I915_WRITE(MI_MODE, mode);
+               if (IS_GEN7(dev))
+                       I915_WRITE(GFX_MODE_GEN7,
+                                  GFX_MODE_DISABLE(GFX_TLB_INVALIDATE_ALWAYS) |
+                                  GFX_MODE_ENABLE(GFX_REPLAY_MODE));
        }
 
        if (INTEL_INFO(dev)->gen >= 6) {
index 645b84b..7ad43c6 100644 (file)
@@ -613,6 +613,18 @@ static bool radeon_dp_get_link_status(struct radeon_connector *radeon_connector,
        return true;
 }
 
+bool radeon_dp_needs_link_train(struct radeon_connector *radeon_connector)
+{
+       u8 link_status[DP_LINK_STATUS_SIZE];
+       struct radeon_connector_atom_dig *dig = radeon_connector->con_priv;
+
+       if (!radeon_dp_get_link_status(radeon_connector, link_status))
+               return false;
+       if (dp_channel_eq_ok(link_status, dig->dp_lane_count))
+               return false;
+       return true;
+}
+
 struct radeon_dp_link_train_info {
        struct radeon_device *rdev;
        struct drm_encoder *encoder;
index 14dce9f..fb5fa08 100644 (file)
@@ -743,7 +743,7 @@ static void evergreen_program_watermarks(struct radeon_device *rdev,
                    !evergreen_average_bandwidth_vs_available_bandwidth(&wm) ||
                    !evergreen_check_latency_hiding(&wm) ||
                    (rdev->disp_priority == 2)) {
-                       DRM_INFO("force priority to high\n");
+                       DRM_DEBUG_KMS("force priority to high\n");
                        priority_a_cnt |= PRIORITY_ALWAYS_ON;
                        priority_b_cnt |= PRIORITY_ALWAYS_ON;
                }
index 6d6b5f1..4f0c1ec 100644 (file)
@@ -60,18 +60,20 @@ void radeon_connector_hotplug(struct drm_connector *connector)
 
        radeon_hpd_set_polarity(rdev, radeon_connector->hpd.hpd);
 
-       /* powering up/down the eDP panel generates hpd events which
-        * can interfere with modesetting.
-        */
-       if (connector->connector_type == DRM_MODE_CONNECTOR_eDP)
+       /* if the connector is already off, don't turn it back on */
+       if (connector->dpms != DRM_MODE_DPMS_ON)
                return;
 
-       /* pre-r600 did not always have the hpd pins mapped accurately to connectors */
-       if (rdev->family >= CHIP_R600) {
-               if (radeon_hpd_sense(rdev, radeon_connector->hpd.hpd))
+       /* just deal with DP (not eDP) here. */
+       if (connector->connector_type == DRM_MODE_CONNECTOR_DisplayPort) {
+               int saved_dpms = connector->dpms;
+
+               if (radeon_hpd_sense(rdev, radeon_connector->hpd.hpd) &&
+                   radeon_dp_needs_link_train(radeon_connector))
                        drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON);
                else
                        drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF);
+               connector->dpms = saved_dpms;
        }
 }
 
@@ -464,6 +466,16 @@ static bool radeon_connector_needs_extended_probe(struct radeon_device *dev,
                    (supported_device == ATOM_DEVICE_DFP2_SUPPORT))
                        return true;
        }
+       /* TOSHIBA Satellite L300D with ATI Mobility Radeon x1100
+        * (RS690M) sends data to i2c bus for a HDMI connector that
+        * is not implemented */
+       if ((dev->pdev->device == 0x791f) &&
+           (dev->pdev->subsystem_vendor == 0x1179) &&
+           (dev->pdev->subsystem_device == 0xff68)) {
+               if ((connector_type == DRM_MODE_CONNECTOR_HDMIA) &&
+                   (supported_device == ATOM_DEVICE_DFP2_SUPPORT))
+                       return true;
+       }
 
        /* Default: no EDID header probe required for DDC probing */
        return false;
@@ -474,11 +486,19 @@ static void radeon_fixup_lvds_native_mode(struct drm_encoder *encoder,
 {
        struct radeon_encoder *radeon_encoder = to_radeon_encoder(encoder);
        struct drm_display_mode *native_mode = &radeon_encoder->native_mode;
+       struct drm_display_mode *t, *mode;
+
+       /* If the EDID preferred mode doesn't match the native mode, use it */
+       list_for_each_entry_safe(mode, t, &connector->probed_modes, head) {
+               if (mode->type & DRM_MODE_TYPE_PREFERRED) {
+                       if (mode->hdisplay != native_mode->hdisplay ||
+                           mode->vdisplay != native_mode->vdisplay)
+                               memcpy(native_mode, mode, sizeof(*mode));
+               }
+       }
 
        /* Try to get native mode details from EDID if necessary */
        if (!native_mode->clock) {
-               struct drm_display_mode *t, *mode;
-
                list_for_each_entry_safe(mode, t, &connector->probed_modes, head) {
                        if (mode->hdisplay == native_mode->hdisplay &&
                            mode->vdisplay == native_mode->vdisplay) {
@@ -489,6 +509,7 @@ static void radeon_fixup_lvds_native_mode(struct drm_encoder *encoder,
                        }
                }
        }
+
        if (!native_mode->clock) {
                DRM_DEBUG_KMS("No LVDS native mode details, disabling RMX\n");
                radeon_encoder->rmx_type = RMX_OFF;
index 440e6ec..b51e157 100644 (file)
@@ -32,6 +32,7 @@
 #include <drm/radeon_drm.h>
 #include <linux/vgaarb.h>
 #include <linux/vga_switcheroo.h>
+#include <linux/efi.h>
 #include "radeon_reg.h"
 #include "radeon.h"
 #include "atom.h"
@@ -300,6 +301,8 @@ void radeon_vram_location(struct radeon_device *rdev, struct radeon_mc *mc, u64
                mc->mc_vram_size = mc->aper_size;
        }
        mc->vram_end = mc->vram_start + mc->mc_vram_size - 1;
+       if (radeon_vram_limit && radeon_vram_limit < mc->real_vram_size)
+               mc->real_vram_size = radeon_vram_limit;
        dev_info(rdev->dev, "VRAM: %lluM 0x%016llX - 0x%016llX (%lluM used)\n",
                        mc->mc_vram_size >> 20, mc->vram_start,
                        mc->vram_end, mc->real_vram_size >> 20);
@@ -348,6 +351,9 @@ bool radeon_card_posted(struct radeon_device *rdev)
 {
        uint32_t reg;
 
+       if (efi_enabled && rdev->pdev->subsystem_vendor == PCI_VENDOR_ID_APPLE)
+               return false;
+
        /* first check CRTCs */
        if (ASIC_IS_DCE41(rdev)) {
                reg = RREG32(EVERGREEN_CRTC_CONTROL + EVERGREEN_CRTC0_REGISTER_OFFSET) |
index b293487..319d85d 100644 (file)
@@ -2323,6 +2323,9 @@ radeon_add_atom_encoder(struct drm_device *dev,
        default:
                encoder->possible_crtcs = 0x3;
                break;
+       case 4:
+               encoder->possible_crtcs = 0xf;
+               break;
        case 6:
                encoder->possible_crtcs = 0x3f;
                break;
index d09031c..68820f5 100644 (file)
@@ -479,6 +479,7 @@ extern void radeon_dp_set_link_config(struct drm_connector *connector,
                                      struct drm_display_mode *mode);
 extern void radeon_dp_link_train(struct drm_encoder *encoder,
                                 struct drm_connector *connector);
+extern bool radeon_dp_needs_link_train(struct radeon_connector *radeon_connector);
 extern u8 radeon_dp_getsinktype(struct radeon_connector *radeon_connector);
 extern bool radeon_dp_getdpcd(struct radeon_connector *radeon_connector);
 extern void atombios_dig_encoder_setup(struct drm_encoder *encoder, int action, int panel_mode);
index dee4a0c..602fa35 100644 (file)
@@ -40,10 +40,14 @@ void radeon_test_moves(struct radeon_device *rdev)
        size = 1024 * 1024;
 
        /* Number of tests =
-        * (Total GTT - IB pool - writeback page - ring buffer) / test size
+        * (Total GTT - IB pool - writeback page - ring buffers) / test size
         */
-       n = ((u32)(rdev->mc.gtt_size - RADEON_IB_POOL_SIZE*64*1024 - RADEON_GPU_PAGE_SIZE -
-            rdev->cp.ring_size)) / size;
+       n = rdev->mc.gtt_size - RADEON_IB_POOL_SIZE*64*1024 - rdev->cp.ring_size;
+       if (rdev->wb.wb_obj)
+               n -= RADEON_GPU_PAGE_SIZE;
+       if (rdev->ih.ring_obj)
+               n -= rdev->ih.ring_size;
+       n /= size;
 
        gtt_obj = kzalloc(n * sizeof(*gtt_obj), GFP_KERNEL);
        if (!gtt_obj) {
@@ -132,9 +136,15 @@ void radeon_test_moves(struct radeon_device *rdev)
                     gtt_start++, vram_start++) {
                        if (*vram_start != gtt_start) {
                                DRM_ERROR("Incorrect GTT->VRAM copy %d: Got 0x%p, "
-                                         "expected 0x%p (GTT map 0x%p-0x%p)\n",
-                                         i, *vram_start, gtt_start, gtt_map,
-                                         gtt_end);
+                                         "expected 0x%p (GTT/VRAM offset "
+                                         "0x%16llx/0x%16llx)\n",
+                                         i, *vram_start, gtt_start,
+                                         (unsigned long long)
+                                         (gtt_addr - rdev->mc.gtt_start +
+                                          (void*)gtt_start - gtt_map),
+                                         (unsigned long long)
+                                         (vram_addr - rdev->mc.vram_start +
+                                          (void*)gtt_start - gtt_map));
                                radeon_bo_kunmap(vram_obj);
                                goto out_cleanup;
                        }
@@ -175,9 +185,15 @@ void radeon_test_moves(struct radeon_device *rdev)
                     gtt_start++, vram_start++) {
                        if (*gtt_start != vram_start) {
                                DRM_ERROR("Incorrect VRAM->GTT copy %d: Got 0x%p, "
-                                         "expected 0x%p (VRAM map 0x%p-0x%p)\n",
-                                         i, *gtt_start, vram_start, vram_map,
-                                         vram_end);
+                                         "expected 0x%p (VRAM/GTT offset "
+                                         "0x%16llx/0x%16llx)\n",
+                                         i, *gtt_start, vram_start,
+                                         (unsigned long long)
+                                         (vram_addr - rdev->mc.vram_start +
+                                          (void*)vram_start - vram_map),
+                                         (unsigned long long)
+                                         (gtt_addr - rdev->mc.gtt_start +
+                                          (void*)vram_start - vram_map));
                                radeon_bo_kunmap(gtt_obj[i]);
                                goto out_cleanup;
                        }
index 60125dd..9b86fb0 100644 (file)
@@ -450,6 +450,29 @@ static int radeon_ttm_io_mem_reserve(struct ttm_bo_device *bdev, struct ttm_mem_
                        return -EINVAL;
                mem->bus.base = rdev->mc.aper_base;
                mem->bus.is_iomem = true;
+#ifdef __alpha__
+               /*
+                * Alpha: use bus.addr to hold the ioremap() return,
+                * so we can modify bus.base below.
+                */
+               if (mem->placement & TTM_PL_FLAG_WC)
+                       mem->bus.addr =
+                               ioremap_wc(mem->bus.base + mem->bus.offset,
+                                          mem->bus.size);
+               else
+                       mem->bus.addr =
+                               ioremap_nocache(mem->bus.base + mem->bus.offset,
+                                               mem->bus.size);
+
+               /*
+                * Alpha: Use just the bus offset plus
+                * the hose/domain memory base for bus.base.
+                * It then can be used to build PTEs for VRAM
+                * access, as done in ttm_bo_vm_fault().
+                */
+               mem->bus.base = (mem->bus.base & 0x0ffffffffUL) +
+                       rdev->ddev->hose->dense_mem_base;
+#endif
                break;
        default:
                return -EINVAL;
index 56619f6..a4d38d8 100644 (file)
@@ -353,8 +353,10 @@ static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc)
 
                ret = ttm_tt_set_user(bo->ttm, current,
                                      bo->buffer_start, bo->num_pages);
-               if (unlikely(ret != 0))
+               if (unlikely(ret != 0)) {
                        ttm_tt_destroy(bo->ttm);
+                       bo->ttm = NULL;
+               }
                break;
        default:
                printk(KERN_ERR TTM_PFX "Illegal buffer object type\n");
@@ -390,10 +392,12 @@ static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo,
         * Create and bind a ttm if required.
         */
 
-       if (!(new_man->flags & TTM_MEMTYPE_FLAG_FIXED) && (bo->ttm == NULL)) {
-               ret = ttm_bo_add_ttm(bo, false);
-               if (ret)
-                       goto out_err;
+       if (!(new_man->flags & TTM_MEMTYPE_FLAG_FIXED)) {
+               if (bo->ttm == NULL) {
+                       ret = ttm_bo_add_ttm(bo, false);
+                       if (ret)
+                               goto out_err;
+               }
 
                ret = ttm_tt_set_placement_caching(bo->ttm, mem->placement);
                if (ret)
index 77dbf40..ae3c6f5 100644 (file)
@@ -635,13 +635,13 @@ int ttm_bo_move_accel_cleanup(struct ttm_buffer_object *bo,
                if (ret)
                        return ret;
 
-               ttm_bo_free_old_node(bo);
                if ((man->flags & TTM_MEMTYPE_FLAG_FIXED) &&
                    (bo->ttm != NULL)) {
                        ttm_tt_unbind(bo->ttm);
                        ttm_tt_destroy(bo->ttm);
                        bo->ttm = NULL;
                }
+               ttm_bo_free_old_node(bo);
        } else {
                /**
                 * This should help pipeline ordinary buffer moves.
index 1a409c5..c316294 100644 (file)
@@ -432,13 +432,15 @@ static int aem_read_sensor(struct aem_data *data, u8 elt, u8 reg,
        aem_send_message(ipmi);
 
        res = wait_for_completion_timeout(&ipmi->read_complete, IPMI_TIMEOUT);
-       if (!res)
-               return -ETIMEDOUT;
+       if (!res) {
+               res = -ETIMEDOUT;
+               goto out;
+       }
 
        if (ipmi->rx_result || ipmi->rx_msg_len != rs_size ||
            memcmp(&rs_resp->id, &system_x_id, sizeof(system_x_id))) {
-               kfree(rs_resp);
-               return -ENOENT;
+               res = -ENOENT;
+               goto out;
        }
 
        switch (size) {
@@ -463,8 +465,11 @@ static int aem_read_sensor(struct aem_data *data, u8 elt, u8 reg,
                break;
        }
        }
+       res = 0;
 
-       return 0;
+out:
+       kfree(rs_resp);
+       return res;
 }
 
 /* Update AEM energy registers */
index d4bc114..ac254fb 100644 (file)
@@ -161,6 +161,17 @@ static int lm25066_write_word_data(struct i2c_client *client, int page, int reg,
        return ret;
 }
 
+static int lm25066_write_byte(struct i2c_client *client, int page, u8 value)
+{
+       if (page > 1)
+               return -EINVAL;
+
+       if (page == 0)
+               return pmbus_write_byte(client, 0, value);
+
+       return 0;
+}
+
 static int lm25066_probe(struct i2c_client *client,
                          const struct i2c_device_id *id)
 {
@@ -204,6 +215,7 @@ static int lm25066_probe(struct i2c_client *client,
 
        info->read_word_data = lm25066_read_word_data;
        info->write_word_data = lm25066_write_word_data;
+       info->write_byte = lm25066_write_byte;
 
        switch (id->driver_data) {
        case lm25066:
index 0808d98..a6ae20f 100644 (file)
@@ -325,6 +325,7 @@ struct pmbus_driver_info {
        int (*read_word_data)(struct i2c_client *client, int page, int reg);
        int (*write_word_data)(struct i2c_client *client, int page, int reg,
                               u16 word);
+       int (*write_byte)(struct i2c_client *client, int page, u8 value);
        /*
         * The identify function determines supported PMBus functionality.
         * This function is only necessary if a chip driver supports multiple
index 5c1b6cf..a561c3a 100644 (file)
@@ -182,6 +182,24 @@ int pmbus_write_byte(struct i2c_client *client, int page, u8 value)
 }
 EXPORT_SYMBOL_GPL(pmbus_write_byte);
 
+/*
+ * _pmbus_write_byte() is similar to pmbus_write_byte(), but checks if
+ * a device specific mapping funcion exists and calls it if necessary.
+ */
+static int _pmbus_write_byte(struct i2c_client *client, int page, u8 value)
+{
+       struct pmbus_data *data = i2c_get_clientdata(client);
+       const struct pmbus_driver_info *info = data->info;
+       int status;
+
+       if (info->write_byte) {
+               status = info->write_byte(client, page, value);
+               if (status != -ENODATA)
+                       return status;
+       }
+       return pmbus_write_byte(client, page, value);
+}
+
 int pmbus_write_word_data(struct i2c_client *client, u8 page, u8 reg, u16 word)
 {
        int rv;
@@ -281,7 +299,7 @@ static int _pmbus_read_byte_data(struct i2c_client *client, int page, int reg)
 
 static void pmbus_clear_fault_page(struct i2c_client *client, int page)
 {
-       pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
+       _pmbus_write_byte(client, page, PMBUS_CLEAR_FAULTS);
 }
 
 void pmbus_clear_faults(struct i2c_client *client)
index 43f89ba..fe89c46 100644 (file)
@@ -717,11 +717,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
 {
        struct ipoib_dev_priv *priv = netdev_priv(dev);
        struct ipoib_neigh *neigh;
-       struct neighbour *n;
+       struct neighbour *n = NULL;
        unsigned long flags;
 
-       n = dst_get_neighbour(skb_dst(skb));
-       if (likely(skb_dst(skb) && n)) {
+       if (likely(skb_dst(skb)))
+               n = dst_get_neighbour(skb_dst(skb));
+
+       if (likely(n)) {
                if (unlikely(!*to_ipoib_neigh(n))) {
                        ipoib_path_lookup(skb, dev);
                        return NETDEV_TX_OK;
index 8db008d..9c61b9c 100644 (file)
@@ -101,13 +101,17 @@ iscsi_iser_recv(struct iscsi_conn *conn,
 
        /* verify PDU length */
        datalen = ntoh24(hdr->dlength);
-       if (datalen != rx_data_len) {
-               printk(KERN_ERR "iscsi_iser: datalen %d (hdr) != %d (IB) \n",
-                      datalen, rx_data_len);
+       if (datalen > rx_data_len || (datalen + 4) < rx_data_len) {
+               iser_err("wrong datalen %d (hdr), %d (IB)\n",
+                       datalen, rx_data_len);
                rc = ISCSI_ERR_DATALEN;
                goto error;
        }
 
+       if (datalen != rx_data_len)
+               iser_dbg("aligned datalen (%d) hdr, %d (IB)\n",
+                       datalen, rx_data_len);
+
        /* read AHS */
        ahslen = hdr->hlength * 4;
 
index 342cbc1..db6f3ce 100644 (file)
@@ -89,7 +89,7 @@
        } while (0)
 
 #define SHIFT_4K       12
-#define SIZE_4K        (1UL << SHIFT_4K)
+#define SIZE_4K        (1ULL << SHIFT_4K)
 #define MASK_4K        (~(SIZE_4K-1))
 
                                        /* support up to 512KB in one RDMA */
index 5745b7f..f299de6 100644 (file)
@@ -412,7 +412,7 @@ int iser_send_control(struct iscsi_conn *conn,
                memcpy(iser_conn->ib_conn->login_buf, task->data,
                                                        task->data_count);
                tx_dsg->addr    = iser_conn->ib_conn->login_dma;
-               tx_dsg->length  = data_seg_len;
+               tx_dsg->length  = task->data_count;
                tx_dsg->lkey    = device->mr->lkey;
                mdesc->num_sge = 2;
        }
index efec413..68cd05b 100644 (file)
@@ -33,7 +33,7 @@ EXPORT_SYMBOL_GPL(cb710_pci_update_config_reg);
 static int __devinit cb710_pci_configure(struct pci_dev *pdev)
 {
        unsigned int devfn = PCI_DEVFN(PCI_SLOT(pdev->devfn), 0);
-       struct pci_dev *pdev0 = pci_get_slot(pdev->bus, devfn);
+       struct pci_dev *pdev0;
        u32 val;
 
        cb710_pci_update_config_reg(pdev, 0x48,
@@ -43,6 +43,7 @@ static int __devinit cb710_pci_configure(struct pci_dev *pdev)
        if (val & 0x80000000)
                return 0;
 
+       pdev0 = pci_get_slot(pdev->bus, devfn);
        if (!pdev0)
                return -ENODEV;
 
index 006a5e9..2bf229a 100644 (file)
@@ -224,7 +224,7 @@ static void mmc_test_prepare_mrq(struct mmc_test_card *test,
 static int mmc_test_busy(struct mmc_command *cmd)
 {
        return !(cmd->resp[0] & R1_READY_FOR_DATA) ||
-               (R1_CURRENT_STATE(cmd->resp[0]) == 7);
+               (R1_CURRENT_STATE(cmd->resp[0]) == R1_STATE_PRG);
 }
 
 /*
@@ -2900,7 +2900,7 @@ static const struct file_operations mmc_test_fops_testlist = {
        .release        = single_release,
 };
 
-static void mmc_test_free_file_test(struct mmc_card *card)
+static void mmc_test_free_dbgfs_file(struct mmc_card *card)
 {
        struct mmc_test_dbgfs_file *df, *dfs;
 
@@ -2917,34 +2917,21 @@ static void mmc_test_free_file_test(struct mmc_card *card)
        mutex_unlock(&mmc_test_lock);
 }
 
-static int mmc_test_register_file_test(struct mmc_card *card)
+static int __mmc_test_register_dbgfs_file(struct mmc_card *card,
+       const char *name, mode_t mode, const struct file_operations *fops)
 {
        struct dentry *file = NULL;
        struct mmc_test_dbgfs_file *df;
-       int ret = 0;
-
-       mutex_lock(&mmc_test_lock);
-
-       if (card->debugfs_root)
-               file = debugfs_create_file("test", S_IWUSR | S_IRUGO,
-                       card->debugfs_root, card, &mmc_test_fops_test);
-
-       if (IS_ERR_OR_NULL(file)) {
-               dev_err(&card->dev,
-                       "Can't create test. Perhaps debugfs is disabled.\n");
-               ret = -ENODEV;
-               goto err;
-       }
 
        if (card->debugfs_root)
-               file = debugfs_create_file("testlist", S_IRUGO,
-                       card->debugfs_root, card, &mmc_test_fops_testlist);
+               file = debugfs_create_file(name, mode, card->debugfs_root,
+                       card, fops);
 
        if (IS_ERR_OR_NULL(file)) {
                dev_err(&card->dev,
-                       "Can't create testlist. Perhaps debugfs is disabled.\n");
-               ret = -ENODEV;
-               goto err;
+                       "Can't create %s. Perhaps debugfs is disabled.\n",
+                       name);
+               return -ENODEV;
        }
 
        df = kmalloc(sizeof(struct mmc_test_dbgfs_file), GFP_KERNEL);
@@ -2952,14 +2939,31 @@ static int mmc_test_register_file_test(struct mmc_card *card)
                debugfs_remove(file);
                dev_err(&card->dev,
                        "Can't allocate memory for internal usage.\n");
-               ret = -ENOMEM;
-               goto err;
+               return -ENOMEM;
        }
 
        df->card = card;
        df->file = file;
 
        list_add(&df->link, &mmc_test_file_test);
+       return 0;
+}
+
+static int mmc_test_register_dbgfs_file(struct mmc_card *card)
+{
+       int ret;
+
+       mutex_lock(&mmc_test_lock);
+
+       ret = __mmc_test_register_dbgfs_file(card, "test", S_IWUSR | S_IRUGO,
+               &mmc_test_fops_test);
+       if (ret)
+               goto err;
+
+       ret = __mmc_test_register_dbgfs_file(card, "testlist", S_IRUGO,
+               &mmc_test_fops_testlist);
+       if (ret)
+               goto err;
 
 err:
        mutex_unlock(&mmc_test_lock);
@@ -2974,7 +2978,7 @@ static int mmc_test_probe(struct mmc_card *card)
        if (!mmc_card_mmc(card) && !mmc_card_sd(card))
                return -ENODEV;
 
-       ret = mmc_test_register_file_test(card);
+       ret = mmc_test_register_dbgfs_file(card);
        if (ret)
                return ret;
 
@@ -2986,7 +2990,7 @@ static int mmc_test_probe(struct mmc_card *card)
 static void mmc_test_remove(struct mmc_card *card)
 {
        mmc_test_free_result(card);
-       mmc_test_free_file_test(card);
+       mmc_test_free_dbgfs_file(card);
 }
 
 static struct mmc_driver mmc_driver = {
@@ -3006,7 +3010,7 @@ static void __exit mmc_test_exit(void)
 {
        /* Clear stalled data if card is still plugged */
        mmc_test_free_result(NULL);
-       mmc_test_free_file_test(NULL);
+       mmc_test_free_dbgfs_file(NULL);
 
        mmc_unregister_driver(&mmc_driver);
 }
index 89bdeae..91a0a74 100644 (file)
@@ -1502,7 +1502,7 @@ static int mmc_do_erase(struct mmc_card *card, unsigned int from,
                        goto out;
                }
        } while (!(cmd.resp[0] & R1_READY_FOR_DATA) ||
-                R1_CURRENT_STATE(cmd.resp[0]) == 7);
+                R1_CURRENT_STATE(cmd.resp[0]) == R1_STATE_PRG);
 out:
        return err;
 }
index aa7d1d7..5700b1c 100644 (file)
@@ -259,7 +259,7 @@ static int mmc_read_ext_csd(struct mmc_card *card, u8 *ext_csd)
        }
 
        card->ext_csd.rev = ext_csd[EXT_CSD_REV];
-       if (card->ext_csd.rev > 5) {
+       if (card->ext_csd.rev > 6) {
                printk(KERN_ERR "%s: unrecognised EXT_CSD revision %d\n",
                        mmc_hostname(card->host), card->ext_csd.rev);
                err = -EINVAL;
index 845ce7c..770c3d0 100644 (file)
@@ -407,7 +407,7 @@ int mmc_switch(struct mmc_card *card, u8 set, u8 index, u8 value,
                        break;
                if (mmc_host_is_spi(card->host))
                        break;
-       } while (R1_CURRENT_STATE(status) == 7);
+       } while (R1_CURRENT_STATE(status) == R1_STATE_PRG);
 
        if (mmc_host_is_spi(card->host)) {
                if (status & R1_SPI_ILLEGAL_COMMAND)
index 77f0b6b..ff0f714 100644 (file)
@@ -62,7 +62,7 @@ struct idmac_desc {
 
        u32             des1;   /* Buffer sizes */
 #define IDMAC_SET_BUFFER1_SIZE(d, s) \
-       ((d)->des1 = ((d)->des1 & 0x03ffc000) | ((s) & 0x3fff))
+       ((d)->des1 = ((d)->des1 & 0x03ffe000) | ((s) & 0x1fff))
 
        u32             des2;   /* buffer 1 physical address */
 
@@ -699,7 +699,7 @@ static void dw_mci_set_ios(struct mmc_host *mmc, struct mmc_ios *ios)
        }
 
        /* DDR mode set */
-       if (ios->ddr) {
+       if (ios->timing == MMC_TIMING_UHS_DDR50) {
                regs = mci_readl(slot->host, UHS_REG);
                regs |= (0x1 << slot->id) << 16;
                mci_writel(slot->host, UHS_REG, regs);
@@ -1646,7 +1646,7 @@ static int __init dw_mci_init_slot(struct dw_mci *host, unsigned int id)
                        mmc->caps |= MMC_CAP_4_BIT_DATA;
 
        if (host->pdata->quirks & DW_MCI_QUIRK_HIGHSPEED)
-               mmc->caps |= MMC_CAP_SD_HIGHSPEED;
+               mmc->caps |= MMC_CAP_SD_HIGHSPEED | MMC_CAP_MMC_HIGHSPEED;
 
 #ifdef CONFIG_MMC_DW_IDMAC
        mmc->max_segs = host->ring_size;
index 9ebfb4b..0e9780f 100644 (file)
@@ -27,6 +27,7 @@
 #include "sdhci-pltfm.h"
 #include "sdhci-esdhc.h"
 
+#define        SDHCI_CTRL_D3CD                 0x08
 /* VENDOR SPEC register */
 #define SDHCI_VENDOR_SPEC              0xC0
 #define  SDHCI_VENDOR_SPEC_SDIO_QUIRK  0x00000002
@@ -141,13 +142,32 @@ static void esdhc_writel_le(struct sdhci_host *host, u32 val, int reg)
        struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
        struct pltfm_imx_data *imx_data = pltfm_host->priv;
        struct esdhc_platform_data *boarddata = &imx_data->boarddata;
-
-       if (unlikely((reg == SDHCI_INT_ENABLE || reg == SDHCI_SIGNAL_ENABLE)
-                       && (boarddata->cd_type == ESDHC_CD_GPIO)))
-               /*
-                * these interrupts won't work with a custom card_detect gpio
-                */
-               val &= ~(SDHCI_INT_CARD_REMOVE | SDHCI_INT_CARD_INSERT);
+       u32 data;
+
+       if (unlikely(reg == SDHCI_INT_ENABLE || reg == SDHCI_SIGNAL_ENABLE)) {
+               if (boarddata->cd_type == ESDHC_CD_GPIO)
+                       /*
+                        * These interrupts won't work with a custom
+                        * card_detect gpio (only applied to mx25/35)
+                        */
+                       val &= ~(SDHCI_INT_CARD_REMOVE | SDHCI_INT_CARD_INSERT);
+
+               if (val & SDHCI_INT_CARD_INT) {
+                       /*
+                        * Clear and then set D3CD bit to avoid missing the
+                        * card interrupt.  This is a eSDHC controller problem
+                        * so we need to apply the following workaround: clear
+                        * and set D3CD bit will make eSDHC re-sample the card
+                        * interrupt. In case a card interrupt was lost,
+                        * re-sample it by the following steps.
+                        */
+                       data = readl(host->ioaddr + SDHCI_HOST_CONTROL);
+                       data &= ~SDHCI_CTRL_D3CD;
+                       writel(data, host->ioaddr + SDHCI_HOST_CONTROL);
+                       data |= SDHCI_CTRL_D3CD;
+                       writel(data, host->ioaddr + SDHCI_HOST_CONTROL);
+               }
+       }
 
        if (unlikely((imx_data->flags & ESDHC_FLAG_MULTIBLK_NO_INT)
                                && (reg == SDHCI_INT_STATUS)
@@ -217,8 +237,10 @@ static void esdhc_writeb_le(struct sdhci_host *host, u8 val, int reg)
                 */
                return;
        case SDHCI_HOST_CONTROL:
-               /* FSL messed up here, so we can just keep those two */
-               new_val = val & (SDHCI_CTRL_LED | SDHCI_CTRL_4BITBUS);
+               /* FSL messed up here, so we can just keep those three */
+               new_val = val & (SDHCI_CTRL_LED | \
+                               SDHCI_CTRL_4BITBUS | \
+                               SDHCI_CTRL_D3CD);
                /* ensure the endianess */
                new_val |= ESDHC_HOST_CONTROL_LE;
                /* DMA mode bits are shifted */
index 4198dbb..fc7e4a5 100644 (file)
@@ -195,7 +195,8 @@ static int __devinit sdhci_pxav3_probe(struct platform_device *pdev)
        clk_enable(clk);
 
        host->quirks = SDHCI_QUIRK_BROKEN_TIMEOUT_VAL
-               | SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC;
+               | SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC
+               | SDHCI_QUIRK_32BIT_ADMA_SIZE;
 
        /* enable 1/8V DDR capable */
        host->mmc->caps |= MMC_CAP_1_8V_DDR;
index 460ffaf..2bd7bf4 100644 (file)
@@ -19,6 +19,7 @@
 #include <linux/clk.h>
 #include <linux/io.h>
 #include <linux/gpio.h>
+#include <linux/module.h>
 
 #include <linux/mmc/host.h>
 
@@ -502,6 +503,9 @@ static int __devinit sdhci_s3c_probe(struct platform_device *pdev)
        /* This host supports the Auto CMD12 */
        host->quirks |= SDHCI_QUIRK_MULTIBLOCK_READ_ACMD12;
 
+       /* Samsung SoCs need BROKEN_ADMA_ZEROLEN_DESC */
+       host->quirks |= SDHCI_QUIRK_BROKEN_ADMA_ZEROLEN_DESC;
+
        if (pdata->cd_type == S3C_SDHCI_CD_NONE ||
            pdata->cd_type == S3C_SDHCI_CD_PERMANENT)
                host->quirks |= SDHCI_QUIRK_BROKEN_CARD_DETECTION;
index c31a334..0e02cc1 100644 (file)
@@ -628,12 +628,11 @@ static u8 sdhci_calc_timeout(struct sdhci_host *host, struct mmc_command *cmd)
        /* timeout in us */
        if (!data)
                target_timeout = cmd->cmd_timeout_ms * 1000;
-       else
-               target_timeout = data->timeout_ns / 1000 +
-                       data->timeout_clks / host->clock;
-
-       if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK)
-               host->timeout_clk = host->clock / 1000;
+       else {
+               target_timeout = data->timeout_ns / 1000;
+               if (host->clock)
+                       target_timeout += data->timeout_clks / host->clock;
+       }
 
        /*
         * Figure out needed cycles.
@@ -645,7 +644,6 @@ static u8 sdhci_calc_timeout(struct sdhci_host *host, struct mmc_command *cmd)
         *     =>
         *     (1) / (2) > 2^6
         */
-       BUG_ON(!host->timeout_clk);
        count = 0;
        current_timeout = (1 << 13) * 1000 / host->timeout_clk;
        while (current_timeout < target_timeout) {
@@ -1867,9 +1865,6 @@ static void sdhci_tasklet_finish(unsigned long param)
 
        del_timer(&host->timer);
 
-       if (host->version >= SDHCI_SPEC_300)
-               del_timer(&host->tuning_timer);
-
        mrq = host->mrq;
 
        /*
@@ -2461,22 +2456,6 @@ int sdhci_add_host(struct sdhci_host *host)
                host->max_clk = host->ops->get_max_clock(host);
        }
 
-       host->timeout_clk =
-               (caps[0] & SDHCI_TIMEOUT_CLK_MASK) >> SDHCI_TIMEOUT_CLK_SHIFT;
-       if (host->timeout_clk == 0) {
-               if (host->ops->get_timeout_clock) {
-                       host->timeout_clk = host->ops->get_timeout_clock(host);
-               } else if (!(host->quirks &
-                               SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK)) {
-                       printk(KERN_ERR
-                              "%s: Hardware doesn't specify timeout clock "
-                              "frequency.\n", mmc_hostname(mmc));
-                       return -ENODEV;
-               }
-       }
-       if (caps[0] & SDHCI_TIMEOUT_CLK_UNIT)
-               host->timeout_clk *= 1000;
-
        /*
         * In case of Host Controller v3.00, find out whether clock
         * multiplier is supported.
@@ -2509,10 +2488,26 @@ int sdhci_add_host(struct sdhci_host *host)
        } else
                mmc->f_min = host->max_clk / SDHCI_MAX_DIV_SPEC_200;
 
+       host->timeout_clk =
+               (caps[0] & SDHCI_TIMEOUT_CLK_MASK) >> SDHCI_TIMEOUT_CLK_SHIFT;
+       if (host->timeout_clk == 0) {
+               if (host->ops->get_timeout_clock) {
+                       host->timeout_clk = host->ops->get_timeout_clock(host);
+               } else if (!(host->quirks &
+                               SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK)) {
+                       printk(KERN_ERR
+                              "%s: Hardware doesn't specify timeout clock "
+                              "frequency.\n", mmc_hostname(mmc));
+                       return -ENODEV;
+               }
+       }
+       if (caps[0] & SDHCI_TIMEOUT_CLK_UNIT)
+               host->timeout_clk *= 1000;
+
        if (host->quirks & SDHCI_QUIRK_DATA_TIMEOUT_USES_SDCLK)
-               mmc->max_discard_to = (1 << 27) / (mmc->f_max / 1000);
-       else
-               mmc->max_discard_to = (1 << 27) / host->timeout_clk;
+               host->timeout_clk = mmc->f_max / 1000;
+
+       mmc->max_discard_to = (1 << 27) / host->timeout_clk;
 
        mmc->caps |= MMC_CAP_SDIO_IRQ | MMC_CAP_ERASE | MMC_CAP_CMD23;
 
index 8d185de..44a9668 100644 (file)
@@ -27,7 +27,6 @@
 static int tmio_mmc_suspend(struct platform_device *dev, pm_message_t state)
 {
        const struct mfd_cell *cell = mfd_get_cell(dev);
-       struct mmc_host *mmc = platform_get_drvdata(dev);
        int ret;
 
        ret = tmio_mmc_host_suspend(&dev->dev);
@@ -42,7 +41,6 @@ static int tmio_mmc_suspend(struct platform_device *dev, pm_message_t state)
 static int tmio_mmc_resume(struct platform_device *dev)
 {
        const struct mfd_cell *cell = mfd_get_cell(dev);
-       struct mmc_host *mmc = platform_get_drvdata(dev);
        int ret = 0;
 
        /* Tell the MFD core we are ready to be enabled */
index 749fdf0..753b21a 100644 (file)
@@ -158,47 +158,6 @@ static void program_hpp_type2(struct pci_dev *dev, struct hpp_type2 *hpp)
         */
 }
 
-/* Program PCIE MaxPayload setting on device: ensure parent maxpayload <= device */
-static int pci_set_payload(struct pci_dev *dev)
-{
-       int pos, ppos;
-       u16 pctl, psz;
-       u16 dctl, dsz, dcap, dmax;
-       struct pci_dev *parent;
-
-       parent = dev->bus->self;
-       pos = pci_find_capability(dev, PCI_CAP_ID_EXP);
-       if (!pos)
-               return 0;
-
-       /* Read Device MaxPayload capability and setting */
-       pci_read_config_word(dev, pos + PCI_EXP_DEVCTL, &dctl);
-       pci_read_config_word(dev, pos + PCI_EXP_DEVCAP, &dcap);
-       dsz = (dctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5;
-       dmax = (dcap & PCI_EXP_DEVCAP_PAYLOAD);
-
-       /* Read Parent MaxPayload setting */
-       ppos = pci_find_capability(parent, PCI_CAP_ID_EXP);
-       if (!ppos)
-               return 0;
-       pci_read_config_word(parent, ppos + PCI_EXP_DEVCTL, &pctl);
-       psz = (pctl &  PCI_EXP_DEVCTL_PAYLOAD) >> 5;
-
-       /* If parent payload > device max payload -> error
-        * If parent payload > device payload -> set speed
-        * If parent payload <= device payload -> do nothing
-        */
-       if (psz > dmax)
-               return -1;
-       else if (psz > dsz) {
-               dev_info(&dev->dev, "Setting MaxPayload to %d\n", 128 << psz);
-               pci_write_config_word(dev, pos + PCI_EXP_DEVCTL,
-                                     (dctl & ~PCI_EXP_DEVCTL_PAYLOAD) +
-                                     (psz << 5));
-       }
-       return 0;
-}
-
 void pci_configure_slot(struct pci_dev *dev)
 {
        struct pci_dev *cdev;
@@ -210,9 +169,7 @@ void pci_configure_slot(struct pci_dev *dev)
                        (dev->class >> 8) == PCI_CLASS_BRIDGE_PCI)))
                return;
 
-       ret = pci_set_payload(dev);
-       if (ret)
-               dev_warn(&dev->dev, "could not set device max payload\n");
+       pcie_bus_configure_settings(dev->bus, dev->bus->self->pcie_mpss);
 
        memset(&hpp, 0, sizeof(hpp));
        ret = pci_get_hp_params(dev, &hpp);
index c94d37e..f092993 100644 (file)
@@ -55,7 +55,7 @@ struct device_node * __weak pcibios_get_phb_of_node(struct pci_bus *bus)
         */
        if (bus->bridge->of_node)
                return of_node_get(bus->bridge->of_node);
-       if (bus->bridge->parent->of_node)
+       if (bus->bridge->parent && bus->bridge->parent->of_node)
                return of_node_get(bus->bridge->parent->of_node);
        return NULL;
 }
index 08a95b3..0ce6742 100644 (file)
@@ -77,6 +77,8 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
 unsigned long pci_hotplug_io_size  = DEFAULT_HOTPLUG_IO_SIZE;
 unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
 
+enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_PERFORMANCE;
+
 /*
  * The default CLS is used if arch didn't set CLS explicitly and not
  * all pci devices agree on the same value.  Arch can override either
@@ -3222,6 +3224,67 @@ out:
 }
 EXPORT_SYMBOL(pcie_set_readrq);
 
+/**
+ * pcie_get_mps - get PCI Express maximum payload size
+ * @dev: PCI device to query
+ *
+ * Returns maximum payload size in bytes
+ *    or appropriate error value.
+ */
+int pcie_get_mps(struct pci_dev *dev)
+{
+       int ret, cap;
+       u16 ctl;
+
+       cap = pci_pcie_cap(dev);
+       if (!cap)
+               return -EINVAL;
+
+       ret = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
+       if (!ret)
+               ret = 128 << ((ctl & PCI_EXP_DEVCTL_PAYLOAD) >> 5);
+
+       return ret;
+}
+
+/**
+ * pcie_set_mps - set PCI Express maximum payload size
+ * @dev: PCI device to query
+ * @mps: maximum payload size in bytes
+ *    valid values are 128, 256, 512, 1024, 2048, 4096
+ *
+ * If possible sets maximum payload size
+ */
+int pcie_set_mps(struct pci_dev *dev, int mps)
+{
+       int cap, err = -EINVAL;
+       u16 ctl, v;
+
+       if (mps < 128 || mps > 4096 || !is_power_of_2(mps))
+               goto out;
+
+       v = ffs(mps) - 8;
+       if (v > dev->pcie_mpss) 
+               goto out;
+       v <<= 5;
+
+       cap = pci_pcie_cap(dev);
+       if (!cap)
+               goto out;
+
+       err = pci_read_config_word(dev, cap + PCI_EXP_DEVCTL, &ctl);
+       if (err)
+               goto out;
+
+       if ((ctl & PCI_EXP_DEVCTL_PAYLOAD) != v) {
+               ctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
+               ctl |= v;
+               err = pci_write_config_word(dev, cap + PCI_EXP_DEVCTL, ctl);
+       }
+out:
+       return err;
+}
+
 /**
  * pci_select_bars - Make BAR mask from the type of resource
  * @dev: the PCI device for which BAR mask is made
@@ -3505,6 +3568,10 @@ static int __init pci_setup(char *str)
                                pci_hotplug_io_size = memparse(str + 9, &str);
                        } else if (!strncmp(str, "hpmemsize=", 10)) {
                                pci_hotplug_mem_size = memparse(str + 10, &str);
+                       } else if (!strncmp(str, "pcie_bus_safe", 13)) {
+                               pcie_bus_config = PCIE_BUS_SAFE;
+                       } else if (!strncmp(str, "pcie_bus_perf", 13)) {
+                               pcie_bus_config = PCIE_BUS_PERFORMANCE;
                        } else {
                                printk(KERN_ERR "PCI: Unknown option `%s'\n",
                                                str);
index c8cee76..b74084e 100644 (file)
@@ -283,6 +283,8 @@ static inline int pci_iov_bus_range(struct pci_bus *bus)
 
 #endif /* CONFIG_PCI_IOV */
 
+extern unsigned long pci_cardbus_resource_alignment(struct resource *);
+
 static inline resource_size_t pci_resource_alignment(struct pci_dev *dev,
                                         struct resource *res)
 {
@@ -292,6 +294,8 @@ static inline resource_size_t pci_resource_alignment(struct pci_dev *dev,
        if (resno >= PCI_IOV_RESOURCES && resno <= PCI_IOV_RESOURCE_END)
                return pci_sriov_resource_alignment(dev, resno);
 #endif
+       if (dev->class >> 8  == PCI_CLASS_BRIDGE_CARDBUS)
+               return pci_cardbus_resource_alignment(res);
        return resource_alignment(res);
 }
 
index 795c902..8473727 100644 (file)
@@ -856,6 +856,8 @@ void set_pcie_port_type(struct pci_dev *pdev)
        pdev->pcie_cap = pos;
        pci_read_config_word(pdev, pos + PCI_EXP_FLAGS, &reg16);
        pdev->pcie_type = (reg16 & PCI_EXP_FLAGS_TYPE) >> 4;
+       pci_read_config_word(pdev, pos + PCI_EXP_DEVCAP, &reg16);
+       pdev->pcie_mpss = reg16 & PCI_EXP_DEVCAP_PAYLOAD;
 }
 
 void set_pcie_hotplug_bridge(struct pci_dev *pdev)
@@ -1326,6 +1328,150 @@ int pci_scan_slot(struct pci_bus *bus, int devfn)
        return nr;
 }
 
+static int pcie_find_smpss(struct pci_dev *dev, void *data)
+{
+       u8 *smpss = data;
+
+       if (!pci_is_pcie(dev))
+               return 0;
+
+       /* For PCIE hotplug enabled slots not connected directly to a
+        * PCI-E root port, there can be problems when hotplugging
+        * devices.  This is due to the possibility of hotplugging a
+        * device into the fabric with a smaller MPS that the devices
+        * currently running have configured.  Modifying the MPS on the
+        * running devices could cause a fatal bus error due to an
+        * incoming frame being larger than the newly configured MPS.
+        * To work around this, the MPS for the entire fabric must be
+        * set to the minimum size.  Any devices hotplugged into this
+        * fabric will have the minimum MPS set.  If the PCI hotplug
+        * slot is directly connected to the root port and there are not
+        * other devices on the fabric (which seems to be the most
+        * common case), then this is not an issue and MPS discovery
+        * will occur as normal.
+        */
+       if (dev->is_hotplug_bridge && (!list_is_singular(&dev->bus->devices) ||
+           dev->bus->self->pcie_type != PCI_EXP_TYPE_ROOT_PORT))
+               *smpss = 0;
+
+       if (*smpss > dev->pcie_mpss)
+               *smpss = dev->pcie_mpss;
+
+       return 0;
+}
+
+static void pcie_write_mps(struct pci_dev *dev, int mps)
+{
+       int rc, dev_mpss;
+
+       dev_mpss = 128 << dev->pcie_mpss;
+
+       if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
+               if (dev->bus->self) {
+                       dev_dbg(&dev->bus->dev, "Bus MPSS %d\n",
+                               128 << dev->bus->self->pcie_mpss);
+
+                       /* For "MPS Force Max", the assumption is made that
+                        * downstream communication will never be larger than
+                        * the MRRS.  So, the MPS only needs to be configured
+                        * for the upstream communication.  This being the case,
+                        * walk from the top down and set the MPS of the child
+                        * to that of the parent bus.
+                        */
+                       mps = 128 << dev->bus->self->pcie_mpss;
+                       if (mps > dev_mpss)
+                               dev_warn(&dev->dev, "MPS configured higher than"
+                                        " maximum supported by the device.  If"
+                                        " a bus issue occurs, try running with"
+                                        " pci=pcie_bus_safe.\n");
+               }
+
+               dev->pcie_mpss = ffs(mps) - 8;
+       }
+
+       rc = pcie_set_mps(dev, mps);
+       if (rc)
+               dev_err(&dev->dev, "Failed attempting to set the MPS\n");
+}
+
+static void pcie_write_mrrs(struct pci_dev *dev, int mps)
+{
+       int rc, mrrs;
+
+       if (pcie_bus_config == PCIE_BUS_PERFORMANCE) {
+               int dev_mpss = 128 << dev->pcie_mpss;
+
+               /* For Max performance, the MRRS must be set to the largest
+                * supported value.  However, it cannot be configured larger
+                * than the MPS the device or the bus can support.  This assumes
+                * that the largest MRRS available on the device cannot be
+                * smaller than the device MPSS.
+                */
+               mrrs = mps < dev_mpss ? mps : dev_mpss;
+       } else
+               /* In the "safe" case, configure the MRRS for fairness on the
+                * bus by making all devices have the same size
+                */
+               mrrs = mps;
+
+
+       /* MRRS is a R/W register.  Invalid values can be written, but a
+        * subsiquent read will verify if the value is acceptable or not.
+        * If the MRRS value provided is not acceptable (e.g., too large),
+        * shrink the value until it is acceptable to the HW.
+        */
+       while (mrrs != pcie_get_readrq(dev) && mrrs >= 128) {
+               rc = pcie_set_readrq(dev, mrrs);
+               if (rc)
+                       dev_err(&dev->dev, "Failed attempting to set the MRRS\n");
+
+               mrrs /= 2;
+       }
+}
+
+static int pcie_bus_configure_set(struct pci_dev *dev, void *data)
+{
+       int mps = 128 << *(u8 *)data;
+
+       if (!pci_is_pcie(dev))
+               return 0;
+
+       dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
+                pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
+
+       pcie_write_mps(dev, mps);
+       pcie_write_mrrs(dev, mps);
+
+       dev_info(&dev->dev, "Dev MPS %d MPSS %d MRRS %d\n",
+                pcie_get_mps(dev), 128<<dev->pcie_mpss, pcie_get_readrq(dev));
+
+       return 0;
+}
+
+/* pcie_bus_configure_mps requires that pci_walk_bus work in a top-down,
+ * parents then children fashion.  If this changes, then this code will not
+ * work as designed.
+ */
+void pcie_bus_configure_settings(struct pci_bus *bus, u8 mpss)
+{
+       u8 smpss = mpss;
+
+       if (!bus->self)
+               return;
+
+       if (!pci_is_pcie(bus->self))
+               return;
+
+       if (pcie_bus_config == PCIE_BUS_SAFE) {
+               pcie_find_smpss(bus->self, &smpss);
+               pci_walk_bus(bus, pcie_find_smpss, &smpss);
+       }
+
+       pcie_bus_configure_set(bus->self, &smpss);
+       pci_walk_bus(bus, pcie_bus_configure_set, &smpss);
+}
+EXPORT_SYMBOL_GPL(pcie_bus_configure_settings);
+
 unsigned int __devinit pci_scan_child_bus(struct pci_bus *bus)
 {
        unsigned int devfn, pass, max = bus->secondary;
index 8a1d3c7..784da9d 100644 (file)
@@ -34,6 +34,7 @@ struct resource_list_x {
        resource_size_t start;
        resource_size_t end;
        resource_size_t add_size;
+       resource_size_t min_align;
        unsigned long flags;
 };
 
@@ -65,7 +66,7 @@ void pci_realloc(void)
  */
 static void add_to_list(struct resource_list_x *head,
                 struct pci_dev *dev, struct resource *res,
-                resource_size_t add_size)
+                resource_size_t add_size, resource_size_t min_align)
 {
        struct resource_list_x *list = head;
        struct resource_list_x *ln = list->next;
@@ -84,13 +85,16 @@ static void add_to_list(struct resource_list_x *head,
        tmp->end = res->end;
        tmp->flags = res->flags;
        tmp->add_size = add_size;
+       tmp->min_align = min_align;
        list->next = tmp;
 }
 
 static void add_to_failed_list(struct resource_list_x *head,
                                struct pci_dev *dev, struct resource *res)
 {
-       add_to_list(head, dev, res, 0);
+       add_to_list(head, dev, res,
+                       0 /* dont care */,
+                       0 /* dont care */);
 }
 
 static void __dev_sort_resources(struct pci_dev *dev,
@@ -121,18 +125,18 @@ static inline void reset_resource(struct resource *res)
 }
 
 /**
- * adjust_resources_sorted() - satisfy any additional resource requests
+ * reassign_resources_sorted() - satisfy any additional resource requests
  *
- * @add_head : head of the list tracking requests requiring additional
+ * @realloc_head : head of the list tracking requests requiring additional
  *             resources
  * @head     : head of the list tracking requests with allocated
  *             resources
  *
- * Walk through each element of the add_head and try to procure
+ * Walk through each element of the realloc_head and try to procure
  * additional resources for the element, provided the element
  * is in the head list.
  */
-static void adjust_resources_sorted(struct resource_list_x *add_head,
+static void reassign_resources_sorted(struct resource_list_x *realloc_head,
                struct resource_list *head)
 {
        struct resource *res;
@@ -141,8 +145,8 @@ static void adjust_resources_sorted(struct resource_list_x *add_head,
        resource_size_t add_size;
        int idx;
 
-       prev = add_head;
-       for (list = add_head->next; list;) {
+       prev = realloc_head;
+       for (list = realloc_head->next; list;) {
                res = list->res;
                /* skip resource that has been reset */
                if (!res->flags)
@@ -159,13 +163,17 @@ static void adjust_resources_sorted(struct resource_list_x *add_head,
 
                idx = res - &list->dev->resource[0];
                add_size=list->add_size;
-               if (!resource_size(res) && add_size) {
-                        res->end = res->start + add_size - 1;
-                        if(pci_assign_resource(list->dev, idx))
+               if (!resource_size(res)) {
+                       res->start = list->start;
+                       res->end = res->start + add_size - 1;
+                       if(pci_assign_resource(list->dev, idx))
                                reset_resource(res);
-               } else if (add_size) {
-                       adjust_resource(res, res->start,
-                               resource_size(res) + add_size);
+               } else {
+                       resource_size_t align = list->min_align;
+                       res->flags |= list->flags & (IORESOURCE_STARTALIGN|IORESOURCE_SIZEALIGN);
+                       if (pci_reassign_resource(list->dev, idx, add_size, align))
+                               dev_printk(KERN_DEBUG, &list->dev->dev, "failed to add optional resources res=%pR\n",
+                                                       res);
                }
 out:
                tmp = list;
@@ -210,16 +218,16 @@ static void assign_requested_resources_sorted(struct resource_list *head,
 }
 
 static void __assign_resources_sorted(struct resource_list *head,
-                                struct resource_list_x *add_head,
+                                struct resource_list_x *realloc_head,
                                 struct resource_list_x *fail_head)
 {
        /* Satisfy the must-have resource requests */
        assign_requested_resources_sorted(head, fail_head);
 
-       /* Try to satisfy any additional nice-to-have resource
+       /* Try to satisfy any additional optional resource
                requests */
-       if (add_head)
-               adjust_resources_sorted(add_head, head);
+       if (realloc_head)
+               reassign_resources_sorted(realloc_head, head);
        free_list(resource_list, head);
 }
 
@@ -235,7 +243,7 @@ static void pdev_assign_resources_sorted(struct pci_dev *dev,
 }
 
 static void pbus_assign_resources_sorted(const struct pci_bus *bus,
-                                        struct resource_list_x *add_head,
+                                        struct resource_list_x *realloc_head,
                                         struct resource_list_x *fail_head)
 {
        struct pci_dev *dev;
@@ -245,7 +253,7 @@ static void pbus_assign_resources_sorted(const struct pci_bus *bus,
        list_for_each_entry(dev, &bus->devices, bus_list)
                __dev_sort_resources(dev, &head);
 
-       __assign_resources_sorted(&head, add_head, fail_head);
+       __assign_resources_sorted(&head, realloc_head, fail_head);
 }
 
 void pci_setup_cardbus(struct pci_bus *bus)
@@ -540,13 +548,27 @@ static resource_size_t calculate_memsize(resource_size_t size,
        return size;
 }
 
+static resource_size_t get_res_add_size(struct resource_list_x *realloc_head,
+                                       struct resource *res)
+{
+       struct resource_list_x *list;
+
+       /* check if it is in realloc_head list */
+       for (list = realloc_head->next; list && list->res != res;
+                       list = list->next);
+       if (list)
+               return list->add_size;
+
+       return 0;
+}
+
 /**
  * pbus_size_io() - size the io window of a given bus
  *
  * @bus : the bus
  * @min_size : the minimum io window that must to be allocated
  * @add_size : additional optional io window
- * @add_head : track the additional io window on this list
+ * @realloc_head : track the additional io window on this list
  *
  * Sizing the IO windows of the PCI-PCI bridge is trivial,
  * since these windows have 4K granularity and the IO ranges
@@ -554,11 +576,12 @@ static resource_size_t calculate_memsize(resource_size_t size,
  * We must be careful with the ISA aliasing though.
  */
 static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
-               resource_size_t add_size, struct resource_list_x *add_head)
+               resource_size_t add_size, struct resource_list_x *realloc_head)
 {
        struct pci_dev *dev;
        struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
        unsigned long size = 0, size0 = 0, size1 = 0;
+       resource_size_t children_add_size = 0;
 
        if (!b_res)
                return;
@@ -579,11 +602,16 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
                                size += r_size;
                        else
                                size1 += r_size;
+
+                       if (realloc_head)
+                               children_add_size += get_res_add_size(realloc_head, r);
                }
        }
        size0 = calculate_iosize(size, min_size, size1,
                        resource_size(b_res), 4096);
-       size1 = (!add_head || (add_head && !add_size)) ? size0 :
+       if (children_add_size > add_size)
+               add_size = children_add_size;
+       size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
                calculate_iosize(size, min_size+add_size, size1,
                        resource_size(b_res), 4096);
        if (!size0 && !size1) {
@@ -598,8 +626,8 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
        b_res->start = 4096;
        b_res->end = b_res->start + size0 - 1;
        b_res->flags |= IORESOURCE_STARTALIGN;
-       if (size1 > size0 && add_head)
-               add_to_list(add_head, bus->self, b_res, size1-size0);
+       if (size1 > size0 && realloc_head)
+               add_to_list(realloc_head, bus->self, b_res, size1-size0, 4096);
 }
 
 /**
@@ -608,7 +636,7 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
  * @bus : the bus
  * @min_size : the minimum memory window that must to be allocated
  * @add_size : additional optional memory window
- * @add_head : track the additional memory window on this list
+ * @realloc_head : track the additional memory window on this list
  *
  * Calculate the size of the bus and minimal alignment which
  * guarantees that all child resources fit in this size.
@@ -616,7 +644,7 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
 static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
                         unsigned long type, resource_size_t min_size,
                        resource_size_t add_size,
-                       struct resource_list_x *add_head)
+                       struct resource_list_x *realloc_head)
 {
        struct pci_dev *dev;
        resource_size_t min_align, align, size, size0, size1;
@@ -624,6 +652,7 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
        int order, max_order;
        struct resource *b_res = find_free_bus_resource(bus, type);
        unsigned int mem64_mask = 0;
+       resource_size_t children_add_size = 0;
 
        if (!b_res)
                return 0;
@@ -645,6 +674,16 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
                        if (r->parent || (r->flags & mask) != type)
                                continue;
                        r_size = resource_size(r);
+#ifdef CONFIG_PCI_IOV
+                       /* put SRIOV requested res to the optional list */
+                       if (realloc_head && i >= PCI_IOV_RESOURCES &&
+                                       i <= PCI_IOV_RESOURCE_END) {
+                               r->end = r->start - 1;
+                               add_to_list(realloc_head, dev, r, r_size, 0/* dont' care */);
+                               children_add_size += r_size;
+                               continue;
+                       }
+#endif
                        /* For bridges size != alignment */
                        align = pci_resource_alignment(dev, r);
                        order = __ffs(align) - 20;
@@ -665,6 +704,9 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
                        if (order > max_order)
                                max_order = order;
                        mem64_mask &= r->flags & IORESOURCE_MEM_64;
+
+                       if (realloc_head)
+                               children_add_size += get_res_add_size(realloc_head, r);
                }
        }
        align = 0;
@@ -681,7 +723,9 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
                align += aligns[order];
        }
        size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
-       size1 = (!add_head || (add_head && !add_size)) ? size0 :
+       if (children_add_size > add_size)
+               add_size = children_add_size;
+       size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
                calculate_memsize(size, min_size+add_size, 0,
                                resource_size(b_res), min_align);
        if (!size0 && !size1) {
@@ -695,12 +739,22 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
        b_res->start = min_align;
        b_res->end = size0 + min_align - 1;
        b_res->flags |= IORESOURCE_STARTALIGN | mem64_mask;
-       if (size1 > size0 && add_head)
-               add_to_list(add_head, bus->self, b_res, size1-size0);
+       if (size1 > size0 && realloc_head)
+               add_to_list(realloc_head, bus->self, b_res, size1-size0, min_align);
        return 1;
 }
 
-static void pci_bus_size_cardbus(struct pci_bus *bus)
+unsigned long pci_cardbus_resource_alignment(struct resource *res)
+{
+       if (res->flags & IORESOURCE_IO)
+               return pci_cardbus_io_size;
+       if (res->flags & IORESOURCE_MEM)
+               return pci_cardbus_mem_size;
+       return 0;
+}
+
+static void pci_bus_size_cardbus(struct pci_bus *bus,
+                       struct resource_list_x *realloc_head)
 {
        struct pci_dev *bridge = bus->self;
        struct resource *b_res = &bridge->resource[PCI_BRIDGE_RESOURCES];
@@ -711,12 +765,14 @@ static void pci_bus_size_cardbus(struct pci_bus *bus)
         * a fixed amount of bus space for CardBus bridges.
         */
        b_res[0].start = 0;
-       b_res[0].end = pci_cardbus_io_size - 1;
        b_res[0].flags |= IORESOURCE_IO | IORESOURCE_SIZEALIGN;
+       if (realloc_head)
+               add_to_list(realloc_head, bridge, b_res, pci_cardbus_io_size, 0 /* dont care */);
 
        b_res[1].start = 0;
-       b_res[1].end = pci_cardbus_io_size - 1;
        b_res[1].flags |= IORESOURCE_IO | IORESOURCE_SIZEALIGN;
+       if (realloc_head)
+               add_to_list(realloc_head, bridge, b_res+1, pci_cardbus_io_size, 0 /* dont care */);
 
        /*
         * Check whether prefetchable memory is supported
@@ -736,21 +792,31 @@ static void pci_bus_size_cardbus(struct pci_bus *bus)
         */
        if (ctrl & PCI_CB_BRIDGE_CTL_PREFETCH_MEM0) {
                b_res[2].start = 0;
-               b_res[2].end = pci_cardbus_mem_size - 1;
                b_res[2].flags |= IORESOURCE_MEM | IORESOURCE_PREFETCH | IORESOURCE_SIZEALIGN;
+               if (realloc_head)
+                       add_to_list(realloc_head, bridge, b_res+2, pci_cardbus_mem_size, 0 /* dont care */);
 
                b_res[3].start = 0;
-               b_res[3].end = pci_cardbus_mem_size - 1;
                b_res[3].flags |= IORESOURCE_MEM | IORESOURCE_SIZEALIGN;
+               if (realloc_head)
+                       add_to_list(realloc_head, bridge, b_res+3, pci_cardbus_mem_size, 0 /* dont care */);
        } else {
                b_res[3].start = 0;
-               b_res[3].end = pci_cardbus_mem_size * 2 - 1;
                b_res[3].flags |= IORESOURCE_MEM | IORESOURCE_SIZEALIGN;
+               if (realloc_head)
+                       add_to_list(realloc_head, bridge, b_res+3, pci_cardbus_mem_size * 2, 0 /* dont care */);
        }
+
+       /* set the size of the resource to zero, so that the resource does not
+        * get assigned during required-resource allocation cycle but gets assigned
+        * during the optional-resource allocation cycle.
+        */
+       b_res[0].start = b_res[1].start = b_res[2].start = b_res[3].start = 1;
+       b_res[0].end = b_res[1].end = b_res[2].end = b_res[3].end = 0;
 }
 
 void __ref __pci_bus_size_bridges(struct pci_bus *bus,
-                       struct resource_list_x *add_head)
+                       struct resource_list_x *realloc_head)
 {
        struct pci_dev *dev;
        unsigned long mask, prefmask;
@@ -763,12 +829,12 @@ void __ref __pci_bus_size_bridges(struct pci_bus *bus,
 
                switch (dev->class >> 8) {
                case PCI_CLASS_BRIDGE_CARDBUS:
-                       pci_bus_size_cardbus(b);
+                       pci_bus_size_cardbus(b, realloc_head);
                        break;
 
                case PCI_CLASS_BRIDGE_PCI:
                default:
-                       __pci_bus_size_bridges(b, add_head);
+                       __pci_bus_size_bridges(b, realloc_head);
                        break;
                }
        }
@@ -792,7 +858,7 @@ void __ref __pci_bus_size_bridges(struct pci_bus *bus,
                 * Follow thru
                 */
        default:
-               pbus_size_io(bus, 0, additional_io_size, add_head);
+               pbus_size_io(bus, 0, additional_io_size, realloc_head);
                /* If the bridge supports prefetchable range, size it
                   separately. If it doesn't, or its prefetchable window
                   has already been allocated by arch code, try
@@ -800,11 +866,11 @@ void __ref __pci_bus_size_bridges(struct pci_bus *bus,
                   resources. */
                mask = IORESOURCE_MEM;
                prefmask = IORESOURCE_MEM | IORESOURCE_PREFETCH;
-               if (pbus_size_mem(bus, prefmask, prefmask, 0, additional_mem_size, add_head))
+               if (pbus_size_mem(bus, prefmask, prefmask, 0, additional_mem_size, realloc_head))
                        mask = prefmask; /* Success, size non-prefetch only. */
                else
                        additional_mem_size += additional_mem_size;
-               pbus_size_mem(bus, mask, IORESOURCE_MEM, 0, additional_mem_size, add_head);
+               pbus_size_mem(bus, mask, IORESOURCE_MEM, 0, additional_mem_size, realloc_head);
                break;
        }
 }
@@ -816,20 +882,20 @@ void __ref pci_bus_size_bridges(struct pci_bus *bus)
 EXPORT_SYMBOL(pci_bus_size_bridges);
 
 static void __ref __pci_bus_assign_resources(const struct pci_bus *bus,
-                                        struct resource_list_x *add_head,
+                                        struct resource_list_x *realloc_head,
                                         struct resource_list_x *fail_head)
 {
        struct pci_bus *b;
        struct pci_dev *dev;
 
-       pbus_assign_resources_sorted(bus, add_head, fail_head);
+       pbus_assign_resources_sorted(bus, realloc_head, fail_head);
 
        list_for_each_entry(dev, &bus->devices, bus_list) {
                b = dev->subordinate;
                if (!b)
                        continue;
 
-               __pci_bus_assign_resources(b, add_head, fail_head);
+               __pci_bus_assign_resources(b, realloc_head, fail_head);
 
                switch (dev->class >> 8) {
                case PCI_CLASS_BRIDGE_PCI:
@@ -1039,7 +1105,7 @@ void __init
 pci_assign_unassigned_resources(void)
 {
        struct pci_bus *bus;
-       struct resource_list_x add_list; /* list of resources that
+       struct resource_list_x realloc_list; /* list of resources that
                                        want additional resources */
        int tried_times = 0;
        enum release_type rel_type = leaf_only;
@@ -1052,7 +1118,7 @@ pci_assign_unassigned_resources(void)
 
 
        head.next = NULL;
-       add_list.next = NULL;
+       realloc_list.next = NULL;
 
        pci_try_num = max_depth + 1;
        printk(KERN_DEBUG "PCI: max bus depth: %d pci_try_num: %d\n",
@@ -1062,12 +1128,12 @@ again:
        /* Depth first, calculate sizes and alignments of all
           subordinate buses. */
        list_for_each_entry(bus, &pci_root_buses, node)
-               __pci_bus_size_bridges(bus, &add_list);
+               __pci_bus_size_bridges(bus, &realloc_list);
 
        /* Depth last, allocate resources and update the hardware. */
        list_for_each_entry(bus, &pci_root_buses, node)
-               __pci_bus_assign_resources(bus, &add_list, &head);
-       BUG_ON(add_list.next);
+               __pci_bus_assign_resources(bus, &realloc_list, &head);
+       BUG_ON(realloc_list.next);
        tried_times++;
 
        /* any device complain? */
index 319f359..51a9095 100644 (file)
@@ -128,16 +128,16 @@ void pci_disable_bridge_window(struct pci_dev *dev)
 }
 #endif /* CONFIG_PCI_QUIRKS */
 
+
+
 static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev,
-                                int resno)
+               int resno, resource_size_t size, resource_size_t align)
 {
        struct resource *res = dev->resource + resno;
-       resource_size_t size, min, align;
+       resource_size_t min;
        int ret;
 
-       size = resource_size(res);
        min = (res->flags & IORESOURCE_IO) ? PCIBIOS_MIN_IO : PCIBIOS_MIN_MEM;
-       align = pci_resource_alignment(dev, res);
 
        /* First, try exact prefetching match.. */
        ret = pci_bus_alloc_resource(bus, res, size, align, min,
@@ -154,56 +154,101 @@ static int __pci_assign_resource(struct pci_bus *bus, struct pci_dev *dev,
                ret = pci_bus_alloc_resource(bus, res, size, align, min, 0,
                                             pcibios_align_resource, dev);
        }
+       return ret;
+}
 
-       if (ret < 0 && dev->fw_addr[resno]) {
-               struct resource *root, *conflict;
-               resource_size_t start, end;
+static int pci_revert_fw_address(struct resource *res, struct pci_dev *dev, 
+               int resno, resource_size_t size)
+{
+       struct resource *root, *conflict;
+       resource_size_t start, end;
+       int ret = 0;
 
-               /*
-                * If we failed to assign anything, let's try the address
-                * where firmware left it.  That at least has a chance of
-                * working, which is better than just leaving it disabled.
-                */
+       if (res->flags & IORESOURCE_IO)
+               root = &ioport_resource;
+       else
+               root = &iomem_resource;
+
+       start = res->start;
+       end = res->end;
+       res->start = dev->fw_addr[resno];
+       res->end = res->start + size - 1;
+       dev_info(&dev->dev, "BAR %d: trying firmware assignment %pR\n",
+                resno, res);
+       conflict = request_resource_conflict(root, res);
+       if (conflict) {
+               dev_info(&dev->dev,
+                        "BAR %d: %pR conflicts with %s %pR\n", resno,
+                        res, conflict->name, conflict);
+               res->start = start;
+               res->end = end;
+               ret = 1;
+       }
+       return ret;
+}
+
+static int _pci_assign_resource(struct pci_dev *dev, int resno, int size, resource_size_t min_align)
+{
+       struct resource *res = dev->resource + resno;
+       struct pci_bus *bus;
+       int ret;
+       char *type;
 
-               if (res->flags & IORESOURCE_IO)
-                       root = &ioport_resource;
+       bus = dev->bus;
+       while ((ret = __pci_assign_resource(bus, dev, resno, size, min_align))) {
+               if (!bus->parent || !bus->self->transparent)
+                       break;
+               bus = bus->parent;
+       }
+
+       if (ret) {
+               if (res->flags & IORESOURCE_MEM)
+                       if (res->flags & IORESOURCE_PREFETCH)
+                               type = "mem pref";
+                       else
+                               type = "mem";
+               else if (res->flags & IORESOURCE_IO)
+                       type = "io";
                else
-                       root = &iomem_resource;
-
-               start = res->start;
-               end = res->end;
-               res->start = dev->fw_addr[resno];
-               res->end = res->start + size - 1;
-               dev_info(&dev->dev, "BAR %d: trying firmware assignment %pR\n",
-                        resno, res);
-               conflict = request_resource_conflict(root, res);
-               if (conflict) {
-                       dev_info(&dev->dev,
-                                "BAR %d: %pR conflicts with %s %pR\n", resno,
-                                res, conflict->name, conflict);
-                       res->start = start;
-                       res->end = end;
-               } else
-                       ret = 0;
+                       type = "unknown";
+               dev_info(&dev->dev,
+                        "BAR %d: can't assign %s (size %#llx)\n",
+                        resno, type, (unsigned long long) resource_size(res));
        }
 
+       return ret;
+}
+
+int pci_reassign_resource(struct pci_dev *dev, int resno, resource_size_t addsize,
+                       resource_size_t min_align)
+{
+       struct resource *res = dev->resource + resno;
+       resource_size_t new_size;
+       int ret;
+
+       if (!res->parent) {
+               dev_info(&dev->dev, "BAR %d: can't reassign an unassigned resouce %pR "
+                        "\n", resno, res);
+               return -EINVAL;
+       }
+
+       new_size = resource_size(res) + addsize + min_align;
+       ret = _pci_assign_resource(dev, resno, new_size, min_align);
        if (!ret) {
                res->flags &= ~IORESOURCE_STARTALIGN;
                dev_info(&dev->dev, "BAR %d: assigned %pR\n", resno, res);
                if (resno < PCI_BRIDGE_RESOURCES)
                        pci_update_resource(dev, resno);
        }
-
        return ret;
 }
 
 int pci_assign_resource(struct pci_dev *dev, int resno)
 {
        struct resource *res = dev->resource + resno;
-       resource_size_t align;
+       resource_size_t align, size;
        struct pci_bus *bus;
        int ret;
-       char *type;
 
        align = pci_resource_alignment(dev, res);
        if (!align) {
@@ -213,34 +258,27 @@ int pci_assign_resource(struct pci_dev *dev, int resno)
        }
 
        bus = dev->bus;
-       while ((ret = __pci_assign_resource(bus, dev, resno))) {
-               if (bus->parent && bus->self->transparent)
-                       bus = bus->parent;
-               else
-                       bus = NULL;
-               if (bus)
-                       continue;
-               break;
-       }
+       size = resource_size(res);
+       ret = _pci_assign_resource(dev, resno, size, align);
 
-       if (ret) {
-               if (res->flags & IORESOURCE_MEM)
-                       if (res->flags & IORESOURCE_PREFETCH)
-                               type = "mem pref";
-                       else
-                               type = "mem";
-               else if (res->flags & IORESOURCE_IO)
-                       type = "io";
-               else
-                       type = "unknown";
-               dev_info(&dev->dev,
-                        "BAR %d: can't assign %s (size %#llx)\n",
-                        resno, type, (unsigned long long) resource_size(res));
-       }
+       /*
+        * If we failed to assign anything, let's try the address
+        * where firmware left it.  That at least has a chance of
+        * working, which is better than just leaving it disabled.
+        */
+       if (ret < 0 && dev->fw_addr[resno])
+               ret = pci_revert_fw_address(res, dev, resno, size);
 
+       if (!ret) {
+               res->flags &= ~IORESOURCE_STARTALIGN;
+               dev_info(&dev->dev, "BAR %d: assigned %pR\n", resno, res);
+               if (resno < PCI_BRIDGE_RESOURCES)
+                       pci_update_resource(dev, resno);
+       }
        return ret;
 }
 
+
 /* Sort resources by alignment */
 void pdev_sort_resources(struct pci_dev *dev, struct resource_list *head)
 {
index 7106b49..ffc5033 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #include <linux/err.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/platform_device.h>
 #include <linux/power_supply.h>
index cc21fa2..ef8efad 100644 (file)
@@ -20,6 +20,7 @@
  */
 
 #include <linux/err.h>
+#include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/platform_device.h>
 #include <linux/power_supply.h>
index a675e31..d32d0d7 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/s3c_adc_battery.h>
 #include <linux/errno.h>
 #include <linux/init.h>
+#include <linux/module.h>
 
 #include <plat/adc.h>
 
index 3195dbd..44e91e5 100644 (file)
@@ -639,7 +639,7 @@ EXPORT_SYMBOL_GPL(rtc_irq_unregister);
 static int rtc_update_hrtimer(struct rtc_device *rtc, int enabled)
 {
        /*
-        * We unconditionally cancel the timer here, because otherwise
+        * We always cancel the timer here first, because otherwise
         * we could run into BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
         * when we manage to start the timer before the callback
         * returns HRTIMER_RESTART.
@@ -708,7 +708,7 @@ int rtc_irq_set_freq(struct rtc_device *rtc, struct rtc_task *task, int freq)
        int err = 0;
        unsigned long flags;
 
-       if (freq <= 0 || freq > 5000)
+       if (freq <= 0 || freq > RTC_MAX_FREQ)
                return -EINVAL;
 retry:
        spin_lock_irqsave(&rtc->irq_task_lock, flags);
index 02e17c9..fd211f3 100644 (file)
@@ -711,10 +711,11 @@ struct mdfld_dsi_encoder *mdfld_dsi_dbi_init(struct drm_device *dev,
        /* Create drm encoder object */
        connector = &dsi_connector->base.base;
        encoder = &dbi_output->base.base;
+       /* Review this if we ever get MIPI-HDMI bridges or similar */
        drm_encoder_init(dev,
                        encoder,
                        p_funcs->encoder_funcs,
-                       DRM_MODE_ENCODER_MIPI);
+                       DRM_MODE_ENCODER_LVDS);
        drm_encoder_helper_add(encoder, p_funcs->encoder_helper_funcs);
 
        /* Attach to given connector */
index dc6242c..f0fa986 100644 (file)
@@ -42,9 +42,6 @@
 #include "mdfld_dsi_output.h"
 #include "mdfld_output.h"
 
-#define DRM_MODE_ENCODER_MIPI  5
-
-
 /*
  * DBI encoder which inherits from mdfld_dsi_encoder
  */
index 6e03a91..e685f12 100644 (file)
@@ -777,10 +777,15 @@ struct mdfld_dsi_encoder *mdfld_dsi_dpi_init(struct drm_device *dev,
        /* Create drm encoder object */
        connector = &dsi_connector->base.base;
        encoder = &dpi_output->base.base;
+       /*
+        * On existing hardware this will be a panel of some form,
+        * if future devices also have HDMI bridges this will need
+        * revisiting
+        */
        drm_encoder_init(dev,
                        encoder,
                        p_funcs->encoder_funcs,
-                       DRM_MODE_ENCODER_MIPI);
+                       DRM_MODE_ENCODER_LVDS);
        drm_encoder_helper_add(encoder,
                                p_funcs->encoder_helper_funcs);
        
index 7536095..9050c0f 100644 (file)
@@ -955,7 +955,9 @@ void mdfld_dsi_output_init(struct drm_device *dev,
        psb_output->type = (pipe == 0) ? INTEL_OUTPUT_MIPI : INTEL_OUTPUT_MIPI2;
 
        connector = &psb_output->base;
-       drm_connector_init(dev, connector, &mdfld_dsi_connector_funcs, DRM_MODE_CONNECTOR_MIPI);
+       /* Revisit type if MIPI/HDMI bridges ever appear on Medfield */
+       drm_connector_init(dev, connector, &mdfld_dsi_connector_funcs,
+                                               DRM_MODE_CONNECTOR_LVDS);
        drm_connector_helper_add(connector, &mdfld_dsi_connector_helper_funcs);
        
        connector->display_info.subpixel_order = SubPixelHorizontalRGB;
index 38165e8..09e9687 100644 (file)
@@ -21,8 +21,6 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
-#define DRM_MODE_ENCODER_MIPI  5
-
 /* Medfield DSI controller registers */
 
 #define MIPIA_DEVICE_READY_REG                         0xb000
index 72f487a..fd4732d 100644 (file)
@@ -35,7 +35,6 @@
 
 /* Append new drm mode definition here, align with libdrm definition */
 #define DRM_MODE_SCALE_NO_SCALE        2
-#define DRM_MODE_CONNECTOR_MIPI         15
 
 enum {
        CHIP_PSB_8108 = 0,              /* Poulsbo */
index 1b4afd8..6ea852e 100644 (file)
@@ -70,6 +70,7 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
+#include <linux/module.h>
 #include <linux/workqueue.h>
 #include <xen/balloon.h>
 #include <xen/tmem.h>
index 54b8c28..720d885 100644 (file)
@@ -474,17 +474,22 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
                befs_data_stream *data = &befs_ino->i_data.ds;
                befs_off_t len = data->size;
 
-               befs_debug(sb, "Follow long symlink");
-
-               link = kmalloc(len, GFP_NOFS);
-               if (!link) {
-                       link = ERR_PTR(-ENOMEM);
-               } else if (befs_read_lsymlink(sb, data, link, len) != len) {
-                       kfree(link);
-                       befs_error(sb, "Failed to read entire long symlink");
+               if (len == 0) {
+                       befs_error(sb, "Long symlink with illegal length");
                        link = ERR_PTR(-EIO);
                } else {
-                       link[len - 1] = '\0';
+                       befs_debug(sb, "Follow long symlink");
+
+                       link = kmalloc(len, GFP_NOFS);
+                       if (!link) {
+                               link = ERR_PTR(-ENOMEM);
+                       } else if (befs_read_lsymlink(sb, data, link, len) != len) {
+                               kfree(link);
+                               befs_error(sb, "Failed to read entire long symlink");
+                               link = ERR_PTR(-EIO);
+                       } else {
+                               link[len - 1] = '\0';
+                       }
                }
        } else {
                link = befs_ino->i_data.symlink;
index 0469263..03912c5 100644 (file)
@@ -1415,17 +1415,15 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)            \
 static inline u##bits btrfs_##name(struct extent_buffer *eb)           \
 {                                                                      \
-       type *p = kmap_atomic(eb->first_page, KM_USER0);                \
+       type *p = page_address(eb->first_page);                         \
        u##bits res = le##bits##_to_cpu(p->member);                     \
-       kunmap_atomic(p, KM_USER0);                                     \
        return res;                                                     \
 }                                                                      \
 static inline void btrfs_set_##name(struct extent_buffer *eb,          \
                                    u##bits val)                        \
 {                                                                      \
-       type *p = kmap_atomic(eb->first_page, KM_USER0);                \
+       type *p = page_address(eb->first_page);                         \
        p->member = cpu_to_le##bits(val);                               \
-       kunmap_atomic(p, KM_USER0);                                     \
 }
 
 #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)             \
@@ -2367,8 +2365,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv, int update_ref);
+void btrfs_drop_snapshot(struct btrfs_root *root,
+                        struct btrfs_block_rsv *block_rsv, int update_ref);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
index 66bac22..f5be06a 100644 (file)
@@ -1782,6 +1782,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 
 
                for (i = 0; i < multi->num_stripes; i++, stripe++) {
+                       if (!stripe->dev->can_discard)
+                               continue;
+
                        ret = btrfs_issue_discard(stripe->dev->bdev,
                                                  stripe->physical,
                                                  stripe->length);
@@ -1789,11 +1792,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                                discarded_bytes += stripe->length;
                        else if (ret != -EOPNOTSUPP)
                                break;
+
+                       /*
+                        * Just in case we get back EOPNOTSUPP for some reason,
+                        * just ignore the return value so we don't screw up
+                        * people calling discard_extent.
+                        */
+                       ret = 0;
                }
                kfree(multi);
        }
-       if (discarded_bytes && ret == -EOPNOTSUPP)
-               ret = 0;
 
        if (actual_bytes)
                *actual_bytes = discarded_bytes;
@@ -6269,8 +6277,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
  * also make sure backrefs for the shared block and all lower level
  * blocks are properly updated.
  */
-int btrfs_drop_snapshot(struct btrfs_root *root,
-                       struct btrfs_block_rsv *block_rsv, int update_ref)
+void btrfs_drop_snapshot(struct btrfs_root *root,
+                        struct btrfs_block_rsv *block_rsv, int update_ref)
 {
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
@@ -6283,13 +6291,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
        int level;
 
        path = btrfs_alloc_path();
-       if (!path)
-               return -ENOMEM;
+       if (!path) {
+               err = -ENOMEM;
+               goto out;
+       }
 
        wc = kzalloc(sizeof(*wc), GFP_NOFS);
        if (!wc) {
                btrfs_free_path(path);
-               return -ENOMEM;
+               err = -ENOMEM;
+               goto out;
        }
 
        trans = btrfs_start_transaction(tree_root, 0);
@@ -6318,7 +6329,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                path->lowest_level = 0;
                if (ret < 0) {
                        err = ret;
-                       goto out;
+                       goto out_free;
                }
                WARN_ON(ret > 0);
 
@@ -6425,11 +6436,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
                free_extent_buffer(root->commit_root);
                kfree(root);
        }
-out:
+out_free:
        btrfs_end_transaction_throttle(trans, tree_root);
        kfree(wc);
        btrfs_free_path(path);
-       return err;
+out:
+       if (err)
+               btrfs_std_error(root->fs_info, err);
+       return;
 }
 
 /*
@@ -6720,6 +6734,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
        struct btrfs_space_info *space_info;
        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
        struct btrfs_device *device;
+       u64 min_free;
+       u64 dev_min = 1;
+       u64 dev_nr = 0;
+       int index;
        int full = 0;
        int ret = 0;
 
@@ -6729,8 +6747,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
        if (!block_group)
                return -1;
 
+       min_free = btrfs_block_group_used(&block_group->item);
+
        /* no bytes used, we're good */
-       if (!btrfs_block_group_used(&block_group->item))
+       if (!min_free)
                goto out;
 
        space_info = block_group->space_info;
@@ -6746,10 +6766,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
         * all of the extents from this block group.  If we can, we're good
         */
        if ((space_info->total_bytes != block_group->key.offset) &&
-          (space_info->bytes_used + space_info->bytes_reserved +
-           space_info->bytes_pinned + space_info->bytes_readonly +
-           btrfs_block_group_used(&block_group->item) <
-           space_info->total_bytes)) {
+           (space_info->bytes_used + space_info->bytes_reserved +
+            space_info->bytes_pinned + space_info->bytes_readonly +
+            min_free < space_info->total_bytes)) {
                spin_unlock(&space_info->lock);
                goto out;
        }
@@ -6766,9 +6785,31 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
        if (full)
                goto out;
 
+       /*
+        * index:
+        *      0: raid10
+        *      1: raid1
+        *      2: dup
+        *      3: raid0
+        *      4: single
+        */
+       index = get_block_group_index(block_group);
+       if (index == 0) {
+               dev_min = 4;
+               /* Divide by 2 */
+               min_free >>= 1;
+       } else if (index == 1) {
+               dev_min = 2;
+       } else if (index == 2) {
+               /* Multiply by 2 */
+               min_free <<= 1;
+       } else if (index == 3) {
+               dev_min = fs_devices->rw_devices;
+               do_div(min_free, dev_min);
+       }
+
        mutex_lock(&root->fs_info->chunk_mutex);
        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
-               u64 min_free = btrfs_block_group_used(&block_group->item);
                u64 dev_offset;
 
                /*
@@ -6779,7 +6820,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
                        ret = find_free_dev_extent(NULL, device, min_free,
                                                   &dev_offset, NULL);
                        if (!ret)
+                               dev_nr++;
+
+                       if (dev_nr >= dev_min)
                                break;
+
                        ret = -1;
                }
        }
index 658d669..e7872e4 100644 (file)
@@ -150,6 +150,8 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        spin_lock(&root->fs_info->defrag_inodes_lock);
        if (!BTRFS_I(inode)->in_defrag)
                __btrfs_add_inode_defrag(inode, defrag);
+       else
+               kfree(defrag);
        spin_unlock(&root->fs_info->defrag_inodes_lock);
        return 0;
 }
@@ -1638,11 +1640,15 @@ static long btrfs_fallocate(struct file *file, int mode,
 
        cur_offset = alloc_start;
        while (1) {
+               u64 actual_end;
+
                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      alloc_end - cur_offset, 0);
                BUG_ON(IS_ERR_OR_NULL(em));
                last_byte = min(extent_map_end(em), alloc_end);
+               actual_end = min_t(u64, extent_map_end(em), offset + len);
                last_byte = (last_byte + mask) & ~mask;
+
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
@@ -1655,6 +1661,16 @@ static long btrfs_fallocate(struct file *file, int mode,
                                free_extent_map(em);
                                break;
                        }
+               } else if (actual_end > inode->i_size &&
+                          !(mode & FALLOC_FL_KEEP_SIZE)) {
+                       /*
+                        * We didn't need to allocate any more space, but we
+                        * still extended the size of the file so we need to
+                        * update i_size.
+                        */
+                       inode->i_ctime = CURRENT_TIME;
+                       i_size_write(inode, actual_end);
+                       btrfs_ordered_update_i_size(inode, actual_end, NULL);
                }
                free_extent_map(em);
 
@@ -1804,10 +1820,14 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
                }
        }
 
-       if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
-               return -EINVAL;
-       if (offset > inode->i_sb->s_maxbytes)
-               return -EINVAL;
+       if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
+               ret = -EINVAL;
+               goto out;
+       }
+       if (offset > inode->i_sb->s_maxbytes) {
+               ret = -EINVAL;
+               goto out;
+       }
 
        /* Special lock needed here? */
        if (offset != file->f_pos) {
index 6377713..6a265b9 100644 (file)
@@ -1168,9 +1168,9 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
                div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
 }
 
-static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
-                             struct btrfs_free_space *info, u64 offset,
-                             u64 bytes)
+static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+                                      struct btrfs_free_space *info,
+                                      u64 offset, u64 bytes)
 {
        unsigned long start, count;
 
@@ -1181,6 +1181,13 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
        bitmap_clear(info->bitmap, start, count);
 
        info->bytes -= bytes;
+}
+
+static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
+                             struct btrfs_free_space *info, u64 offset,
+                             u64 bytes)
+{
+       __bitmap_clear_bits(ctl, info, offset, bytes);
        ctl->free_space -= bytes;
 }
 
@@ -1984,7 +1991,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
                return 0;
 
        ret = search_start;
-       bitmap_clear_bits(ctl, entry, ret, bytes);
+       __bitmap_clear_bits(ctl, entry, ret, bytes);
 
        return ret;
 }
@@ -2039,7 +2046,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
                                continue;
                        }
                } else {
-
                        ret = entry->offset;
 
                        entry->offset += bytes;
index 15fceef..0ccc743 100644 (file)
@@ -7354,11 +7354,15 @@ static int btrfs_set_page_dirty(struct page *page)
 static int btrfs_permission(struct inode *inode, int mask)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+       umode_t mode = inode->i_mode;
 
-       if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
-               return -EROFS;
-       if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
-               return -EACCES;
+       if (mask & MAY_WRITE &&
+           (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
+               if (btrfs_root_readonly(root))
+                       return -EROFS;
+               if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
+                       return -EACCES;
+       }
        return generic_permission(inode, mask);
 }
 
index 7cf0133..970977a 100644 (file)
@@ -2236,6 +2236,10 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                btrfs_wait_ordered_range(src, off, len);
        }
 
+       /* truncate page cache pages from target inode range */
+       truncate_inode_pages_range(&inode->i_data, off,
+                                  ALIGN(off + len, PAGE_CACHE_SIZE) - 1);
+
        /* clone data */
        key.objectid = btrfs_ino(src);
        key.type = BTRFS_EXTENT_DATA_KEY;
index babee65..786639f 100644 (file)
@@ -799,14 +799,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
                                  struct extent_buffer *eb, int slot,
                                  struct btrfs_key *key)
 {
-       struct inode *dir;
-       int ret;
        struct btrfs_inode_ref *ref;
+       struct btrfs_dir_item *di;
+       struct inode *dir;
        struct inode *inode;
-       char *name;
-       int namelen;
        unsigned long ref_ptr;
        unsigned long ref_end;
+       char *name;
+       int namelen;
+       int ret;
        int search_done = 0;
 
        /*
@@ -909,6 +910,25 @@ again:
        }
        btrfs_release_path(path);
 
+       /* look for a conflicting sequence number */
+       di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
+                                        btrfs_inode_ref_index(eb, ref),
+                                        name, namelen, 0);
+       if (di && !IS_ERR(di)) {
+               ret = drop_one_dir_item(trans, root, path, dir, di);
+               BUG_ON(ret);
+       }
+       btrfs_release_path(path);
+
+       /* look for a conflicing name */
+       di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
+                                  name, namelen, 0);
+       if (di && !IS_ERR(di)) {
+               ret = drop_one_dir_item(trans, root, path, dir, di);
+               BUG_ON(ret);
+       }
+       btrfs_release_path(path);
+
 insert:
        /* insert our name */
        ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
index 53875ae..f2a4cc7 100644 (file)
@@ -142,6 +142,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
+       int sync_pending = 0;
        struct blk_plug plug;
 
        /*
@@ -229,6 +230,22 @@ loop_lock:
 
                BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 
+               /*
+                * if we're doing the sync list, record that our
+                * plug has some sync requests on it
+                *
+                * If we're doing the regular list and there are
+                * sync requests sitting around, unplug before
+                * we add more
+                */
+               if (pending_bios == &device->pending_sync_bios) {
+                       sync_pending = 1;
+               } else if (sync_pending) {
+                       blk_finish_plug(&plug);
+                       blk_start_plug(&plug);
+                       sync_pending = 0;
+               }
+
                submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
@@ -500,6 +517,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                        fs_devices->rw_devices--;
                }
 
+               if (device->can_discard)
+                       fs_devices->num_can_discard--;
+
                new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
                BUG_ON(!new_device);
                memcpy(new_device, device, sizeof(*new_device));
@@ -508,6 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
                new_device->bdev = NULL;
                new_device->writeable = 0;
                new_device->in_fs_metadata = 0;
+               new_device->can_discard = 0;
                list_replace_rcu(&device->dev_list, &new_device->dev_list);
 
                call_rcu(&device->rcu, free_device);
@@ -547,6 +568,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
 {
+       struct request_queue *q;
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
@@ -603,6 +625,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                        seeding = 0;
                }
 
+               q = bdev_get_queue(bdev);
+               if (blk_queue_discard(q)) {
+                       device->can_discard = 1;
+                       fs_devices->num_can_discard++;
+               }
+
                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;
@@ -835,6 +863,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
 
        max_hole_start = search_start;
        max_hole_size = 0;
+       hole_size = 0;
 
        if (search_start >= search_end) {
                ret = -ENOSPC;
@@ -917,7 +946,14 @@ next:
                cond_resched();
        }
 
-       hole_size = search_end- search_start;
+       /*
+        * At this point, search_start should be the end of
+        * allocated dev extents, and when shrinking the device,
+        * search_end may be smaller than search_start.
+        */
+       if (search_end > search_start)
+               hole_size = search_end - search_start;
+
        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
@@ -1543,6 +1579,7 @@ error:
 
 int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 {
+       struct request_queue *q;
        struct btrfs_trans_handle *trans;
        struct btrfs_device *device;
        struct block_device *bdev;
@@ -1612,6 +1649,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 
        lock_chunks(root);
 
+       q = bdev_get_queue(bdev);
+       if (blk_queue_discard(q))
+               device->can_discard = 1;
        device->writeable = 1;
        device->work.func = pending_bios_fn;
        generate_random_uuid(device->uuid);
@@ -1647,6 +1687,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        root->fs_info->fs_devices->num_devices++;
        root->fs_info->fs_devices->open_devices++;
        root->fs_info->fs_devices->rw_devices++;
+       if (device->can_discard)
+               root->fs_info->fs_devices->num_can_discard++;
        root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
        if (!blk_queue_nonrot(bdev_get_queue(bdev)))
@@ -2413,9 +2455,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
                        total_avail = device->total_bytes - device->bytes_used;
                else
                        total_avail = 0;
-               /* avail is off by max(alloc_start, 1MB), but that is the same
-                * for all devices, so it doesn't hurt the sorting later on
-                */
+
+               /* If there is no space on this device, skip it. */
+               if (total_avail == 0)
+                       continue;
 
                ret = find_free_dev_extent(trans, device,
                                           max_stripe_size * dev_stripes,
index 7c12d61..6d866db 100644 (file)
@@ -48,6 +48,7 @@ struct btrfs_device {
        int writeable;
        int in_fs_metadata;
        int missing;
+       int can_discard;
 
        spinlock_t io_lock;
 
@@ -104,6 +105,7 @@ struct btrfs_fs_devices {
        u64 rw_devices;
        u64 missing_devices;
        u64 total_rw_bytes;
+       u64 num_can_discard;
        struct block_device *latest_bdev;
 
        /* all of the devices in the FS, protected by a mutex
index 2fe3cf1..6d40656 100644 (file)
@@ -176,7 +176,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 
 #ifdef CONFIG_CIFS_STATS2
                        seq_printf(m, " In Send: %d In MaxReq Wait: %d",
-                               atomic_read(&server->inSend),
+                               atomic_read(&server->in_send),
                                atomic_read(&server->num_waiters));
 #endif
 
index 21de1d6..d0f59fa 100644 (file)
@@ -991,24 +991,6 @@ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
        return pntsd;
 }
 
-static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
-               struct cifs_ntsd *pnntsd, u32 acllen)
-{
-       int xid, rc;
-       struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
-
-       if (IS_ERR(tlink))
-               return PTR_ERR(tlink);
-
-       xid = GetXid();
-       rc = CIFSSMBSetCIFSACL(xid, tlink_tcon(tlink), fid, pnntsd, acllen);
-       FreeXid(xid);
-       cifs_put_tlink(tlink);
-
-       cFYI(DBG2, "SetCIFSACL rc = %d", rc);
-       return rc;
-}
-
 static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
                struct cifs_ntsd *pnntsd, u32 acllen)
 {
@@ -1047,18 +1029,10 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
                                struct inode *inode, const char *path)
 {
        struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-       struct cifsFileInfo *open_file;
-       int rc;
 
        cFYI(DBG2, "set ACL for %s from mode 0x%x", path, inode->i_mode);
 
-       open_file = find_readable_file(CIFS_I(inode), true);
-       if (!open_file)
-               return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
-
-       rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
-       cifsFileInfo_put(open_file);
-       return rc;
+       return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
 }
 
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
index cb71dc1..95da802 100644 (file)
@@ -125,5 +125,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "1.74"
+#define CIFS_VERSION   "1.75"
 #endif                         /* _CIFSFS_H */
index 38ce6d4..95dad9d 100644 (file)
@@ -291,7 +291,7 @@ struct TCP_Server_Info {
        struct fscache_cookie   *fscache; /* client index cache cookie */
 #endif
 #ifdef CONFIG_CIFS_STATS2
-       atomic_t inSend; /* requests trying to send */
+       atomic_t in_send; /* requests trying to send */
        atomic_t num_waiters;   /* blocked waiting to get in sendrecv */
 #endif
 };
@@ -672,12 +672,54 @@ struct mid_q_entry {
        bool multiEnd:1;        /* both received */
 };
 
-struct oplock_q_entry {
-       struct list_head qhead;
-       struct inode *pinode;
-       struct cifs_tcon *tcon;
-       __u16 netfid;
-};
+/*     Make code in transport.c a little cleaner by moving
+       update of optional stats into function below */
+#ifdef CONFIG_CIFS_STATS2
+
+static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
+{
+       atomic_inc(&server->in_send);
+}
+
+static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
+{
+       atomic_dec(&server->in_send);
+}
+
+static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
+{
+       atomic_inc(&server->num_waiters);
+}
+
+static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
+{
+       atomic_dec(&server->num_waiters);
+}
+
+static inline void cifs_save_when_sent(struct mid_q_entry *mid)
+{
+       mid->when_sent = jiffies;
+}
+#else
+static inline void cifs_in_send_inc(struct TCP_Server_Info *server)
+{
+}
+static inline void cifs_in_send_dec(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_num_waiters_inc(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_num_waiters_dec(struct TCP_Server_Info *server)
+{
+}
+
+static inline void cifs_save_when_sent(struct mid_q_entry *mid)
+{
+}
+#endif
 
 /* for pending dnotify requests */
 struct dir_notify_req {
index 80c2e3a..633c246 100644 (file)
@@ -2878,7 +2878,8 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
        kfree(volume_info->username);
        kzfree(volume_info->password);
        kfree(volume_info->UNC);
-       kfree(volume_info->UNCip);
+       if (volume_info->UNCip != volume_info->UNC + 2)
+               kfree(volume_info->UNCip);
        kfree(volume_info->domainname);
        kfree(volume_info->iocharset);
        kfree(volume_info->prepath);
index ae576fb..72d448b 100644 (file)
@@ -105,8 +105,8 @@ cifs_bp_rename_retry:
        }
        rcu_read_unlock();
        if (namelen != dfsplen || read_seqretry(&rename_lock, seq)) {
-               cERROR(1, "did not end path lookup where expected namelen is %d",
-                       namelen);
+               cFYI(1, "did not end path lookup where expected. namelen=%d "
+                       "dfsplen=%d", namelen, dfsplen);
                /* presumably this is only possible if racing with a rename
                of one of the parent directories  (we can not lock the dentries
                above us to prevent this, but retrying should be harmless) */
index c1b9c4b..10ca6b2 100644 (file)
@@ -266,15 +266,11 @@ static int wait_for_free_request(struct TCP_Server_Info *server,
        while (1) {
                if (atomic_read(&server->inFlight) >= cifs_max_pending) {
                        spin_unlock(&GlobalMid_Lock);
-#ifdef CONFIG_CIFS_STATS2
-                       atomic_inc(&server->num_waiters);
-#endif
+                       cifs_num_waiters_inc(server);
                        wait_event(server->request_q,
                                   atomic_read(&server->inFlight)
                                     < cifs_max_pending);
-#ifdef CONFIG_CIFS_STATS2
-                       atomic_dec(&server->num_waiters);
-#endif
+                       cifs_num_waiters_dec(server);
                        spin_lock(&GlobalMid_Lock);
                } else {
                        if (server->tcpStatus == CifsExiting) {
@@ -381,15 +377,13 @@ cifs_call_async(struct TCP_Server_Info *server, struct kvec *iov,
        mid->callback = callback;
        mid->callback_data = cbdata;
        mid->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
-       atomic_inc(&server->inSend);
-#endif
+
+       cifs_in_send_inc(server);
        rc = smb_sendv(server, iov, nvec);
-#ifdef CONFIG_CIFS_STATS2
-       atomic_dec(&server->inSend);
-       mid->when_sent = jiffies;
-#endif
+       cifs_in_send_dec(server);
+       cifs_save_when_sent(mid);
        mutex_unlock(&server->srv_mutex);
+
        if (rc)
                goto out_err;
 
@@ -575,14 +569,10 @@ SendReceive2(const unsigned int xid, struct cifs_ses *ses,
        }
 
        midQ->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
-       atomic_inc(&ses->server->inSend);
-#endif
+       cifs_in_send_inc(ses->server);
        rc = smb_sendv(ses->server, iov, n_vec);
-#ifdef CONFIG_CIFS_STATS2
-       atomic_dec(&ses->server->inSend);
-       midQ->when_sent = jiffies;
-#endif
+       cifs_in_send_dec(ses->server);
+       cifs_save_when_sent(midQ);
 
        mutex_unlock(&ses->server->srv_mutex);
 
@@ -703,14 +693,11 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
        }
 
        midQ->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
-       atomic_inc(&ses->server->inSend);
-#endif
+
+       cifs_in_send_inc(ses->server);
        rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
-#ifdef CONFIG_CIFS_STATS2
-       atomic_dec(&ses->server->inSend);
-       midQ->when_sent = jiffies;
-#endif
+       cifs_in_send_dec(ses->server);
+       cifs_save_when_sent(midQ);
        mutex_unlock(&ses->server->srv_mutex);
 
        if (rc < 0)
@@ -843,14 +830,10 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
        }
 
        midQ->midState = MID_REQUEST_SUBMITTED;
-#ifdef CONFIG_CIFS_STATS2
-       atomic_inc(&ses->server->inSend);
-#endif
+       cifs_in_send_inc(ses->server);
        rc = smb_send(ses->server, in_buf, be32_to_cpu(in_buf->smb_buf_length));
-#ifdef CONFIG_CIFS_STATS2
-       atomic_dec(&ses->server->inSend);
-       midQ->when_sent = jiffies;
-#endif
+       cifs_in_send_dec(ses->server);
+       cifs_save_when_sent(midQ);
        mutex_unlock(&ses->server->srv_mutex);
 
        if (rc < 0) {
index bb85757..5802fa1 100644 (file)
@@ -289,10 +289,10 @@ static inline int ext4_should_order_data(struct inode *inode)
 
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-       if (!S_ISREG(inode->i_mode))
-               return 0;
        if (EXT4_JOURNAL(inode) == NULL)
                return 1;
+       if (!S_ISREG(inode->i_mode))
+               return 0;
        if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
                return 0;
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
index b8602cd..0962642 100644 (file)
@@ -800,12 +800,17 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
        }
 
 retry:
-       if (rw == READ && ext4_should_dioread_nolock(inode))
+       if (rw == READ && ext4_should_dioread_nolock(inode)) {
+               if (unlikely(!list_empty(&ei->i_completed_io_list))) {
+                       mutex_lock(&inode->i_mutex);
+                       ext4_flush_completed_IO(inode);
+                       mutex_unlock(&inode->i_mutex);
+               }
                ret = __blockdev_direct_IO(rw, iocb, inode,
                                 inode->i_sb->s_bdev, iov,
                                 offset, nr_segs,
                                 ext4_get_block, NULL, NULL, 0);
-       else {
+       else {
                ret = blockdev_direct_IO(rw, iocb, inode, iov,
                                 offset, nr_segs, ext4_get_block);
 
index d47264c..c4da98a 100644 (file)
@@ -120,6 +120,12 @@ void ext4_evict_inode(struct inode *inode)
        int err;
 
        trace_ext4_evict_inode(inode);
+
+       mutex_lock(&inode->i_mutex);
+       ext4_flush_completed_IO(inode);
+       mutex_unlock(&inode->i_mutex);
+       ext4_ioend_wait(inode);
+
        if (inode->i_nlink) {
                /*
                 * When journalling data dirty buffers are tracked only in the
@@ -983,6 +989,8 @@ static int ext4_journalled_write_end(struct file *file,
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
 
+       BUG_ON(!ext4_handle_valid(handle));
+
        if (copied < len) {
                if (!PageUptodate(page))
                        copied = 0;
@@ -1283,7 +1291,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd,
                        else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT))
                                err = ext4_bio_write_page(&io_submit, page,
                                                          len, mpd->wbc);
-                       else
+                       else if (buffer_uninit(page_bufs)) {
+                               ext4_set_bh_endio(page_bufs, inode);
+                               err = block_write_full_page_endio(page,
+                                       noalloc_get_block_write,
+                                       mpd->wbc, ext4_end_io_buffer_write);
+                       } else
                                err = block_write_full_page(page,
                                        noalloc_get_block_write, mpd->wbc);
 
@@ -1699,6 +1712,8 @@ static int __ext4_journalled_writepage(struct page *page,
                goto out;
        }
 
+       BUG_ON(!ext4_handle_valid(handle));
+
        ret = walk_page_buffers(handle, page_bufs, 0, len, NULL,
                                do_journal_get_write_access);
 
@@ -2668,8 +2683,15 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
                goto out;
        }
 
-       io_end->flag = EXT4_IO_END_UNWRITTEN;
+       /*
+        * It may be over-defensive here to check EXT4_IO_END_UNWRITTEN now,
+        * but being more careful is always safe for the future change.
+        */
        inode = io_end->inode;
+       if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+               io_end->flag |= EXT4_IO_END_UNWRITTEN;
+               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+       }
 
        /* Add the io_end to per-inode completed io list*/
        spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
index 430c401..78839af 100644 (file)
@@ -334,8 +334,10 @@ submit_and_retry:
        if ((io_end->num_io_pages >= MAX_IO_PAGES) &&
            (io_end->pages[io_end->num_io_pages-1] != io_page))
                goto submit_and_retry;
-       if (buffer_uninit(bh))
-               io->io_end->flag |= EXT4_IO_END_UNWRITTEN;
+       if (buffer_uninit(bh) && !(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+               io_end->flag |= EXT4_IO_END_UNWRITTEN;
+               atomic_inc(&EXT4_I(inode)->i_aiodio_unwritten);
+       }
        io->io_end->size += bh->b_size;
        io->io_next_block++;
        ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
index 4687fea..44d0c8d 100644 (file)
@@ -919,7 +919,6 @@ static void ext4_i_callback(struct rcu_head *head)
 
 static void ext4_destroy_inode(struct inode *inode)
 {
-       ext4_ioend_wait(inode);
        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
                ext4_msg(inode->i_sb, KERN_ERR,
                         "Inode %lu (%p): orphan list check failed!",
index 4ad6473..5efbd5d 100644 (file)
@@ -1231,7 +1231,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
        struct super_block *sb = dir->i_sb;
        struct msdos_sb_info *sbi = MSDOS_SB(sb);
        struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
-       struct msdos_dir_entry *de;
+       struct msdos_dir_entry *uninitialized_var(de);
        int err, free_slots, i, nr_bhs;
        loff_t pos, i_pos;
 
index 5942fec..1726d73 100644 (file)
@@ -1188,9 +1188,9 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
 out:
        /* UTF-8 doesn't provide FAT semantics */
        if (!strcmp(opts->iocharset, "utf8")) {
-               fat_msg(sb, KERN_ERR, "utf8 is not a recommended IO charset"
+               fat_msg(sb, KERN_WARNING, "utf8 is not a recommended IO charset"
                       " for FAT filesystems, filesystem will be "
-                      "case sensitive!\n");
+                      "case sensitive!");
        }
 
        /* If user doesn't specify allow_utime, it's initialized from dmask. */
@@ -1367,6 +1367,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
        sbi->free_clusters = -1;        /* Don't know yet */
        sbi->free_clus_valid = 0;
        sbi->prev_free = FAT_START_ENT;
+       sb->s_maxbytes = 0xffffffff;
 
        if (!sbi->fat_length && b->fat32_length) {
                struct fat_boot_fsinfo *fsinfo;
@@ -1377,8 +1378,6 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
                sbi->fat_length = le32_to_cpu(b->fat32_length);
                sbi->root_cluster = le32_to_cpu(b->root_cluster);
 
-               sb->s_maxbytes = 0xffffffff;
-
                /* MC - if info_sector is 0, don't multiply by 0 */
                sbi->fsinfo_sector = le16_to_cpu(b->info_sector);
                if (sbi->fsinfo_sector == 0)
index adcf92d..7971f37 100644 (file)
@@ -68,7 +68,7 @@ int jfs_umount(struct super_block *sb)
                /*
                 * Wait for outstanding transactions to be written to log:
                 */
-               jfs_flush_journal(log, 1);
+               jfs_flush_journal(log, 2);
 
        /*
         * close fileset inode allocation map (aka fileset inode)
@@ -146,7 +146,7 @@ int jfs_umount_rw(struct super_block *sb)
         *
         * remove file system from log active file system list.
         */
-       jfs_flush_journal(log, 1);
+       jfs_flush_journal(log, 2);
 
        /*
         * Make sure all metadata makes it to disk
index e56564d..9561c8f 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/namei.h>
 #include <linux/bio.h>         /* struct bio */
 #include <linux/buffer_head.h> /* various write calls */
+#include <linux/prefetch.h>
 
 #include "blocklayout.h"
 
index b257383..07df5f1 100644 (file)
@@ -38,6 +38,7 @@ enum nfs4_callback_opnum {
 struct cb_process_state {
        __be32                  drc_status;
        struct nfs_client       *clp;
+       int                     slotid;
 };
 
 struct cb_compound_hdr_arg {
@@ -166,7 +167,6 @@ extern unsigned nfs4_callback_layoutrecall(
        void *dummy, struct cb_process_state *cps);
 
 extern void nfs4_check_drain_bc_complete(struct nfs4_session *ses);
-extern void nfs4_cb_take_slot(struct nfs_client *clp);
 
 struct cb_devicenotifyitem {
        uint32_t                cbd_notify_type;
index 74780f9..43926ad 100644 (file)
@@ -348,7 +348,7 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
        /* Normal */
        if (likely(args->csa_sequenceid == slot->seq_nr + 1)) {
                slot->seq_nr++;
-               return htonl(NFS4_OK);
+               goto out_ok;
        }
 
        /* Replay */
@@ -367,11 +367,14 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args)
        /* Wraparound */
        if (args->csa_sequenceid == 1 && (slot->seq_nr + 1) == 0) {
                slot->seq_nr = 1;
-               return htonl(NFS4_OK);
+               goto out_ok;
        }
 
        /* Misordered request */
        return htonl(NFS4ERR_SEQ_MISORDERED);
+out_ok:
+       tbl->highest_used_slotid = args->csa_slotid;
+       return htonl(NFS4_OK);
 }
 
 /*
@@ -433,26 +436,37 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
                              struct cb_sequenceres *res,
                              struct cb_process_state *cps)
 {
+       struct nfs4_slot_table *tbl;
        struct nfs_client *clp;
        int i;
        __be32 status = htonl(NFS4ERR_BADSESSION);
 
-       cps->clp = NULL;
-
        clp = nfs4_find_client_sessionid(args->csa_addr, &args->csa_sessionid);
        if (clp == NULL)
                goto out;
 
+       tbl = &clp->cl_session->bc_slot_table;
+
+       spin_lock(&tbl->slot_tbl_lock);
        /* state manager is resetting the session */
        if (test_bit(NFS4_SESSION_DRAINING, &clp->cl_session->session_state)) {
-               status = NFS4ERR_DELAY;
+               spin_unlock(&tbl->slot_tbl_lock);
+               status = htonl(NFS4ERR_DELAY);
+               /* Return NFS4ERR_BADSESSION if we're draining the session
+                * in order to reset it.
+                */
+               if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state))
+                       status = htonl(NFS4ERR_BADSESSION);
                goto out;
        }
 
        status = validate_seqid(&clp->cl_session->bc_slot_table, args);
+       spin_unlock(&tbl->slot_tbl_lock);
        if (status)
                goto out;
 
+       cps->slotid = args->csa_slotid;
+
        /*
         * Check for pending referring calls.  If a match is found, a
         * related callback was received before the response to the original
@@ -469,7 +483,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
        res->csr_slotid = args->csa_slotid;
        res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
        res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1;
-       nfs4_cb_take_slot(clp);
 
 out:
        cps->clp = clp; /* put in nfs4_callback_compound */
index c6c86a7..918ad64 100644 (file)
@@ -754,26 +754,15 @@ static void nfs4_callback_free_slot(struct nfs4_session *session)
         * Let the state manager know callback processing done.
         * A single slot, so highest used slotid is either 0 or -1
         */
-       tbl->highest_used_slotid--;
+       tbl->highest_used_slotid = -1;
        nfs4_check_drain_bc_complete(session);
        spin_unlock(&tbl->slot_tbl_lock);
 }
 
-static void nfs4_cb_free_slot(struct nfs_client *clp)
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
 {
-       if (clp && clp->cl_session)
-               nfs4_callback_free_slot(clp->cl_session);
-}
-
-/* A single slot, so highest used slotid is either 0 or -1 */
-void nfs4_cb_take_slot(struct nfs_client *clp)
-{
-       struct nfs4_slot_table *tbl = &clp->cl_session->bc_slot_table;
-
-       spin_lock(&tbl->slot_tbl_lock);
-       tbl->highest_used_slotid++;
-       BUG_ON(tbl->highest_used_slotid != 0);
-       spin_unlock(&tbl->slot_tbl_lock);
+       if (cps->slotid != -1)
+               nfs4_callback_free_slot(cps->clp->cl_session);
 }
 
 #else /* CONFIG_NFS_V4_1 */
@@ -784,7 +773,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
        return htonl(NFS4ERR_MINOR_VERS_MISMATCH);
 }
 
-static void nfs4_cb_free_slot(struct nfs_client *clp)
+static void nfs4_cb_free_slot(struct cb_process_state *cps)
 {
 }
 #endif /* CONFIG_NFS_V4_1 */
@@ -866,6 +855,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
        struct cb_process_state cps = {
                .drc_status = 0,
                .clp = NULL,
+               .slotid = -1,
        };
        unsigned int nops = 0;
 
@@ -906,7 +896,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r
 
        *hdr_res.status = status;
        *hdr_res.nops = htonl(nops);
-       nfs4_cb_free_slot(cps.clp);
+       nfs4_cb_free_slot(&cps);
        nfs_put_client(cps.clp);
        dprintk("%s: done, status = %u\n", __func__, ntohl(status));
        return rpc_success;
index 9383ca7..d0cda12 100644 (file)
@@ -479,7 +479,6 @@ static int _io_check(struct objio_state *ios, bool is_write)
        for (i = 0; i <  ios->numdevs; i++) {
                struct osd_sense_info osi;
                struct osd_request *or = ios->per_dev[i].or;
-               unsigned dev;
                int ret;
 
                if (!or)
@@ -500,9 +499,8 @@ static int _io_check(struct objio_state *ios, bool is_write)
 
                        continue; /* we recovered */
                }
-               dev = ios->per_dev[i].dev;
-               objlayout_io_set_result(&ios->ol_state, dev,
-                                       &ios->layout->comps[dev].oc_object_id,
+               objlayout_io_set_result(&ios->ol_state, i,
+                                       &ios->layout->comps[i].oc_object_id,
                                        osd_pri_2_pnfs_err(osi.osd_err_pri),
                                        ios->per_dev[i].offset,
                                        ios->per_dev[i].length,
@@ -589,22 +587,19 @@ static void _calc_stripe_info(struct objio_state *ios, u64 file_offset,
 }
 
 static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg,
-               unsigned pgbase, struct _objio_per_comp *per_dev, int cur_len,
+               unsigned pgbase, struct _objio_per_comp *per_dev, int len,
                gfp_t gfp_flags)
 {
        unsigned pg = *cur_pg;
+       int cur_len = len;
        struct request_queue *q =
                        osd_request_queue(_io_od(ios, per_dev->dev));
 
-       per_dev->length += cur_len;
-
        if (per_dev->bio == NULL) {
-               unsigned stripes = ios->layout->num_comps /
-                                                    ios->layout->mirrors_p1;
-               unsigned pages_in_stripe = stripes *
+               unsigned pages_in_stripe = ios->layout->group_width *
                                      (ios->layout->stripe_unit / PAGE_SIZE);
                unsigned bio_size = (ios->ol_state.nr_pages + pages_in_stripe) /
-                                   stripes;
+                                   ios->layout->group_width;
 
                if (BIO_MAX_PAGES_KMALLOC < bio_size)
                        bio_size = BIO_MAX_PAGES_KMALLOC;
@@ -632,6 +627,7 @@ static int _add_stripe_unit(struct objio_state *ios,  unsigned *cur_pg,
        }
        BUG_ON(cur_len);
 
+       per_dev->length += len;
        *cur_pg = pg;
        return 0;
 }
@@ -650,7 +646,7 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
        int ret = 0;
 
        while (length) {
-               struct _objio_per_comp *per_dev = &ios->per_dev[dev];
+               struct _objio_per_comp *per_dev = &ios->per_dev[dev - first_dev];
                unsigned cur_len, page_off = 0;
 
                if (!per_dev->length) {
@@ -670,8 +666,8 @@ static int _prepare_one_group(struct objio_state *ios, u64 length,
                                cur_len = stripe_unit;
                        }
 
-                       if (max_comp < dev)
-                               max_comp = dev;
+                       if (max_comp < dev - first_dev)
+                               max_comp = dev - first_dev;
                } else {
                        cur_len = stripe_unit;
                }
@@ -806,7 +802,7 @@ static int _read_mirrors(struct objio_state *ios, unsigned cur_comp)
        struct _objio_per_comp *per_dev = &ios->per_dev[cur_comp];
        unsigned dev = per_dev->dev;
        struct pnfs_osd_object_cred *cred =
-                       &ios->layout->comps[dev];
+                       &ios->layout->comps[cur_comp];
        struct osd_obj_id obj = {
                .partition = cred->oc_object_id.oid_partition_id,
                .id = cred->oc_object_id.oid_object_id,
@@ -904,7 +900,7 @@ static int _write_mirrors(struct objio_state *ios, unsigned cur_comp)
        for (; cur_comp < last_comp; ++cur_comp, ++dev) {
                struct osd_request *or = NULL;
                struct pnfs_osd_object_cred *cred =
-                                       &ios->layout->comps[dev];
+                                       &ios->layout->comps[cur_comp];
                struct osd_obj_id obj = {
                        .partition = cred->oc_object_id.oid_partition_id,
                        .id = cred->oc_object_id.oid_object_id,
index 16fc758..b3918f7 100644 (file)
@@ -170,6 +170,9 @@ int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout,
        p = _osd_xdr_decode_data_map(p, &layout->olo_map);
        layout->olo_comps_index = be32_to_cpup(p++);
        layout->olo_num_comps = be32_to_cpup(p++);
+       dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__,
+               layout->olo_comps_index, layout->olo_num_comps);
+
        iter->total_comps = layout->olo_num_comps;
        return 0;
 }
index 75bb316..427a4e8 100644 (file)
 # Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 #
 
-ccflags-y := -I$(src) -I$(src)/linux-2.6
-ccflags-$(CONFIG_XFS_DEBUG) += -g
+ccflags-y += -I$(src)                  # needed for trace events
 
-XFS_LINUX := linux-2.6
+ccflags-$(CONFIG_XFS_DEBUG) += -g
 
 obj-$(CONFIG_XFS_FS)           += xfs.o
 
-xfs-y                          += linux-2.6/xfs_trace.o
-
-xfs-$(CONFIG_XFS_QUOTA)                += $(addprefix quota/, \
-                                  xfs_dquot.o \
-                                  xfs_dquot_item.o \
-                                  xfs_trans_dquot.o \
-                                  xfs_qm_syscalls.o \
-                                  xfs_qm_bhv.o \
-                                  xfs_qm.o)
-xfs-$(CONFIG_XFS_QUOTA)                += linux-2.6/xfs_quotaops.o
-
-ifeq ($(CONFIG_XFS_QUOTA),y)
-xfs-$(CONFIG_PROC_FS)          += quota/xfs_qm_stats.o
-endif
-
-xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o
-xfs-$(CONFIG_XFS_POSIX_ACL)    += $(XFS_LINUX)/xfs_acl.o
-xfs-$(CONFIG_PROC_FS)          += $(XFS_LINUX)/xfs_stats.o
-xfs-$(CONFIG_SYSCTL)           += $(XFS_LINUX)/xfs_sysctl.o
-xfs-$(CONFIG_COMPAT)           += $(XFS_LINUX)/xfs_ioctl32.o
+# this one should be compiled first, as the tracing macros can easily blow up
+xfs-y                          += xfs_trace.o
 
+# highlevel code
+xfs-y                          += xfs_aops.o \
+                                  xfs_bit.o \
+                                  xfs_buf.o \
+                                  xfs_dfrag.o \
+                                  xfs_discard.o \
+                                  xfs_error.o \
+                                  xfs_export.o \
+                                  xfs_file.o \
+                                  xfs_filestream.o \
+                                  xfs_fsops.o \
+                                  xfs_fs_subr.o \
+                                  xfs_globals.o \
+                                  xfs_iget.o \
+                                  xfs_ioctl.o \
+                                  xfs_iomap.o \
+                                  xfs_iops.o \
+                                  xfs_itable.o \
+                                  xfs_message.o \
+                                  xfs_mru_cache.o \
+                                  xfs_super.o \
+                                  xfs_sync.o \
+                                  xfs_xattr.o \
+                                  xfs_rename.o \
+                                  xfs_rw.o \
+                                  xfs_utils.o \
+                                  xfs_vnodeops.o \
+                                  kmem.o \
+                                  uuid.o
 
+# code shared with libxfs
 xfs-y                          += xfs_alloc.o \
                                   xfs_alloc_btree.o \
                                   xfs_attr.o \
                                   xfs_attr_leaf.o \
-                                  xfs_bit.o \
                                   xfs_bmap.o \
                                   xfs_bmap_btree.o \
                                   xfs_btree.o \
-                                  xfs_buf_item.o \
                                   xfs_da_btree.o \
                                   xfs_dir2.o \
                                   xfs_dir2_block.o \
@@ -61,49 +70,37 @@ xfs-y                               += xfs_alloc.o \
                                   xfs_dir2_leaf.o \
                                   xfs_dir2_node.o \
                                   xfs_dir2_sf.o \
-                                  xfs_error.o \
-                                  xfs_extfree_item.o \
-                                  xfs_filestream.o \
-                                  xfs_fsops.o \
                                   xfs_ialloc.o \
                                   xfs_ialloc_btree.o \
-                                  xfs_iget.o \
                                   xfs_inode.o \
-                                  xfs_inode_item.o \
-                                  xfs_iomap.o \
-                                  xfs_itable.o \
-                                  xfs_dfrag.o \
-                                  xfs_log.o \
-                                  xfs_log_cil.o \
                                   xfs_log_recover.o \
                                   xfs_mount.o \
-                                  xfs_mru_cache.o \
-                                  xfs_rename.o \
-                                  xfs_trans.o \
+                                  xfs_trans.o
+
+# low-level transaction/log code
+xfs-y                          += xfs_log.o \
+                                  xfs_log_cil.o \
+                                  xfs_buf_item.o \
+                                  xfs_extfree_item.o \
+                                  xfs_inode_item.o \
                                   xfs_trans_ail.o \
                                   xfs_trans_buf.o \
                                   xfs_trans_extfree.o \
                                   xfs_trans_inode.o \
-                                  xfs_utils.o \
-                                  xfs_vnodeops.o \
-                                  xfs_rw.o
-
-# Objects in linux/
-xfs-y                          += $(addprefix $(XFS_LINUX)/, \
-                                  kmem.o \
-                                  xfs_aops.o \
-                                  xfs_buf.o \
-                                  xfs_discard.o \
-                                  xfs_export.o \
-                                  xfs_file.o \
-                                  xfs_fs_subr.o \
-                                  xfs_globals.o \
-                                  xfs_ioctl.o \
-                                  xfs_iops.o \
-                                  xfs_message.o \
-                                  xfs_super.o \
-                                  xfs_sync.o \
-                                  xfs_xattr.o)
 
-# Objects in support/
-xfs-y                          += support/uuid.o
+# optional features
+xfs-$(CONFIG_XFS_QUOTA)                += xfs_dquot.o \
+                                  xfs_dquot_item.o \
+                                  xfs_trans_dquot.o \
+                                  xfs_qm_syscalls.o \
+                                  xfs_qm_bhv.o \
+                                  xfs_qm.o \
+                                  xfs_quotaops.o
+ifeq ($(CONFIG_XFS_QUOTA),y)
+xfs-$(CONFIG_PROC_FS)          += xfs_qm_stats.o
+endif
+xfs-$(CONFIG_XFS_RT)           += xfs_rtalloc.o
+xfs-$(CONFIG_XFS_POSIX_ACL)    += xfs_acl.o
+xfs-$(CONFIG_PROC_FS)          += xfs_stats.o
+xfs-$(CONFIG_SYSCTL)           += xfs_sysctl.o
+xfs-$(CONFIG_COMPAT)           += xfs_ioctl32.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
new file mode 100644 (file)
index 0000000..a907de5
--- /dev/null
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/swap.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include "time.h"
+#include "kmem.h"
+#include "xfs_message.h"
+
+/*
+ * Greedy allocation.  May fail and may return vmalloced memory.
+ *
+ * Must be freed using kmem_free_large.
+ */
+void *
+kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
+{
+       void            *ptr;
+       size_t          kmsize = maxsize;
+
+       while (!(ptr = kmem_zalloc_large(kmsize))) {
+               if ((kmsize >>= 1) <= minsize)
+                       kmsize = minsize;
+       }
+       if (ptr)
+               *size = kmsize;
+       return ptr;
+}
+
+void *
+kmem_alloc(size_t size, unsigned int __nocast flags)
+{
+       int     retries = 0;
+       gfp_t   lflags = kmem_flags_convert(flags);
+       void    *ptr;
+
+       do {
+               ptr = kmalloc(size, lflags);
+               if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+                       return ptr;
+               if (!(++retries % 100))
+                       xfs_err(NULL,
+               "possible memory allocation deadlock in %s (mode:0x%x)",
+                                       __func__, lflags);
+               congestion_wait(BLK_RW_ASYNC, HZ/50);
+       } while (1);
+}
+
+void *
+kmem_zalloc(size_t size, unsigned int __nocast flags)
+{
+       void    *ptr;
+
+       ptr = kmem_alloc(size, flags);
+       if (ptr)
+               memset((char *)ptr, 0, (int)size);
+       return ptr;
+}
+
+void
+kmem_free(const void *ptr)
+{
+       if (!is_vmalloc_addr(ptr)) {
+               kfree(ptr);
+       } else {
+               vfree(ptr);
+       }
+}
+
+void *
+kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
+            unsigned int __nocast flags)
+{
+       void    *new;
+
+       new = kmem_alloc(newsize, flags);
+       if (ptr) {
+               if (new)
+                       memcpy(new, ptr,
+                               ((oldsize < newsize) ? oldsize : newsize));
+               kmem_free(ptr);
+       }
+       return new;
+}
+
+void *
+kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
+{
+       int     retries = 0;
+       gfp_t   lflags = kmem_flags_convert(flags);
+       void    *ptr;
+
+       do {
+               ptr = kmem_cache_alloc(zone, lflags);
+               if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
+                       return ptr;
+               if (!(++retries % 100))
+                       xfs_err(NULL,
+               "possible memory allocation deadlock in %s (mode:0x%x)",
+                                       __func__, lflags);
+               congestion_wait(BLK_RW_ASYNC, HZ/50);
+       } while (1);
+}
+
+void *
+kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
+{
+       void    *ptr;
+
+       ptr = kmem_zone_alloc(zone, flags);
+       if (ptr)
+               memset((char *)ptr, 0, kmem_cache_size(zone));
+       return ptr;
+}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
new file mode 100644 (file)
index 0000000..f7c8f7a
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPPORT_KMEM_H__
+#define __XFS_SUPPORT_KMEM_H__
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+
+/*
+ * General memory allocation interfaces
+ */
+
+#define KM_SLEEP       0x0001u
+#define KM_NOSLEEP     0x0002u
+#define KM_NOFS                0x0004u
+#define KM_MAYFAIL     0x0008u
+
+/*
+ * We use a special process flag to avoid recursive callbacks into
+ * the filesystem during transactions.  We will also issue our own
+ * warnings, so we explicitly skip any generic ones (silly of us).
+ */
+static inline gfp_t
+kmem_flags_convert(unsigned int __nocast flags)
+{
+       gfp_t   lflags;
+
+       BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
+
+       if (flags & KM_NOSLEEP) {
+               lflags = GFP_ATOMIC | __GFP_NOWARN;
+       } else {
+               lflags = GFP_KERNEL | __GFP_NOWARN;
+               if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+                       lflags &= ~__GFP_FS;
+       }
+       return lflags;
+}
+
+extern void *kmem_alloc(size_t, unsigned int __nocast);
+extern void *kmem_zalloc(size_t, unsigned int __nocast);
+extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
+extern void  kmem_free(const void *);
+
+static inline void *kmem_zalloc_large(size_t size)
+{
+       void *ptr;
+
+       ptr = vmalloc(size);
+       if (ptr)
+               memset(ptr, 0, size);
+       return ptr;
+}
+static inline void kmem_free_large(void *ptr)
+{
+       vfree(ptr);
+}
+
+extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
+
+/*
+ * Zone interfaces
+ */
+
+#define KM_ZONE_HWALIGN        SLAB_HWCACHE_ALIGN
+#define KM_ZONE_RECLAIM        SLAB_RECLAIM_ACCOUNT
+#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
+
+#define kmem_zone      kmem_cache
+#define kmem_zone_t    struct kmem_cache
+
+static inline kmem_zone_t *
+kmem_zone_init(int size, char *zone_name)
+{
+       return kmem_cache_create(zone_name, size, 0, 0, NULL);
+}
+
+static inline kmem_zone_t *
+kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
+                    void (*construct)(void *))
+{
+       return kmem_cache_create(zone_name, size, 0, flags, construct);
+}
+
+static inline void
+kmem_zone_free(kmem_zone_t *zone, void *ptr)
+{
+       kmem_cache_free(zone, ptr);
+}
+
+static inline void
+kmem_zone_destroy(kmem_zone_t *zone)
+{
+       if (zone)
+               kmem_cache_destroy(zone);
+}
+
+extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
+extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
+
+static inline int
+kmem_shake_allow(gfp_t gfp_mask)
+{
+       return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
+}
+
+#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
deleted file mode 100644 (file)
index a907de5..0000000
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include "time.h"
-#include "kmem.h"
-#include "xfs_message.h"
-
-/*
- * Greedy allocation.  May fail and may return vmalloced memory.
- *
- * Must be freed using kmem_free_large.
- */
-void *
-kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
-{
-       void            *ptr;
-       size_t          kmsize = maxsize;
-
-       while (!(ptr = kmem_zalloc_large(kmsize))) {
-               if ((kmsize >>= 1) <= minsize)
-                       kmsize = minsize;
-       }
-       if (ptr)
-               *size = kmsize;
-       return ptr;
-}
-
-void *
-kmem_alloc(size_t size, unsigned int __nocast flags)
-{
-       int     retries = 0;
-       gfp_t   lflags = kmem_flags_convert(flags);
-       void    *ptr;
-
-       do {
-               ptr = kmalloc(size, lflags);
-               if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
-                       return ptr;
-               if (!(++retries % 100))
-                       xfs_err(NULL,
-               "possible memory allocation deadlock in %s (mode:0x%x)",
-                                       __func__, lflags);
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
-       } while (1);
-}
-
-void *
-kmem_zalloc(size_t size, unsigned int __nocast flags)
-{
-       void    *ptr;
-
-       ptr = kmem_alloc(size, flags);
-       if (ptr)
-               memset((char *)ptr, 0, (int)size);
-       return ptr;
-}
-
-void
-kmem_free(const void *ptr)
-{
-       if (!is_vmalloc_addr(ptr)) {
-               kfree(ptr);
-       } else {
-               vfree(ptr);
-       }
-}
-
-void *
-kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
-            unsigned int __nocast flags)
-{
-       void    *new;
-
-       new = kmem_alloc(newsize, flags);
-       if (ptr) {
-               if (new)
-                       memcpy(new, ptr,
-                               ((oldsize < newsize) ? oldsize : newsize));
-               kmem_free(ptr);
-       }
-       return new;
-}
-
-void *
-kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
-{
-       int     retries = 0;
-       gfp_t   lflags = kmem_flags_convert(flags);
-       void    *ptr;
-
-       do {
-               ptr = kmem_cache_alloc(zone, lflags);
-               if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP)))
-                       return ptr;
-               if (!(++retries % 100))
-                       xfs_err(NULL,
-               "possible memory allocation deadlock in %s (mode:0x%x)",
-                                       __func__, lflags);
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
-       } while (1);
-}
-
-void *
-kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
-{
-       void    *ptr;
-
-       ptr = kmem_zone_alloc(zone, flags);
-       if (ptr)
-               memset((char *)ptr, 0, kmem_cache_size(zone));
-       return ptr;
-}
diff --git a/fs/xfs/linux-2.6/kmem.h b/fs/xfs/linux-2.6/kmem.h
deleted file mode 100644 (file)
index f7c8f7a..0000000
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_KMEM_H__
-#define __XFS_SUPPORT_KMEM_H__
-
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-
-/*
- * General memory allocation interfaces
- */
-
-#define KM_SLEEP       0x0001u
-#define KM_NOSLEEP     0x0002u
-#define KM_NOFS                0x0004u
-#define KM_MAYFAIL     0x0008u
-
-/*
- * We use a special process flag to avoid recursive callbacks into
- * the filesystem during transactions.  We will also issue our own
- * warnings, so we explicitly skip any generic ones (silly of us).
- */
-static inline gfp_t
-kmem_flags_convert(unsigned int __nocast flags)
-{
-       gfp_t   lflags;
-
-       BUG_ON(flags & ~(KM_SLEEP|KM_NOSLEEP|KM_NOFS|KM_MAYFAIL));
-
-       if (flags & KM_NOSLEEP) {
-               lflags = GFP_ATOMIC | __GFP_NOWARN;
-       } else {
-               lflags = GFP_KERNEL | __GFP_NOWARN;
-               if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
-                       lflags &= ~__GFP_FS;
-       }
-       return lflags;
-}
-
-extern void *kmem_alloc(size_t, unsigned int __nocast);
-extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
-extern void  kmem_free(const void *);
-
-static inline void *kmem_zalloc_large(size_t size)
-{
-       void *ptr;
-
-       ptr = vmalloc(size);
-       if (ptr)
-               memset(ptr, 0, size);
-       return ptr;
-}
-static inline void kmem_free_large(void *ptr)
-{
-       vfree(ptr);
-}
-
-extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
-
-/*
- * Zone interfaces
- */
-
-#define KM_ZONE_HWALIGN        SLAB_HWCACHE_ALIGN
-#define KM_ZONE_RECLAIM        SLAB_RECLAIM_ACCOUNT
-#define KM_ZONE_SPREAD SLAB_MEM_SPREAD
-
-#define kmem_zone      kmem_cache
-#define kmem_zone_t    struct kmem_cache
-
-static inline kmem_zone_t *
-kmem_zone_init(int size, char *zone_name)
-{
-       return kmem_cache_create(zone_name, size, 0, 0, NULL);
-}
-
-static inline kmem_zone_t *
-kmem_zone_init_flags(int size, char *zone_name, unsigned long flags,
-                    void (*construct)(void *))
-{
-       return kmem_cache_create(zone_name, size, 0, flags, construct);
-}
-
-static inline void
-kmem_zone_free(kmem_zone_t *zone, void *ptr)
-{
-       kmem_cache_free(zone, ptr);
-}
-
-static inline void
-kmem_zone_destroy(kmem_zone_t *zone)
-{
-       if (zone)
-               kmem_cache_destroy(zone);
-}
-
-extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
-extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
-
-static inline int
-kmem_shake_allow(gfp_t gfp_mask)
-{
-       return ((gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS));
-}
-
-#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h
deleted file mode 100644 (file)
index ff6a198..0000000
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_MRLOCK_H__
-#define __XFS_SUPPORT_MRLOCK_H__
-
-#include <linux/rwsem.h>
-
-typedef struct {
-       struct rw_semaphore     mr_lock;
-#ifdef DEBUG
-       int                     mr_writer;
-#endif
-} mrlock_t;
-
-#ifdef DEBUG
-#define mrinit(mrp, name)      \
-       do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
-#else
-#define mrinit(mrp, name)      \
-       do { init_rwsem(&(mrp)->mr_lock); } while (0)
-#endif
-
-#define mrlock_init(mrp, t,n,s)        mrinit(mrp, n)
-#define mrfree(mrp)            do { } while (0)
-
-static inline void mraccess_nested(mrlock_t *mrp, int subclass)
-{
-       down_read_nested(&mrp->mr_lock, subclass);
-}
-
-static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
-{
-       down_write_nested(&mrp->mr_lock, subclass);
-#ifdef DEBUG
-       mrp->mr_writer = 1;
-#endif
-}
-
-static inline int mrtryaccess(mrlock_t *mrp)
-{
-       return down_read_trylock(&mrp->mr_lock);
-}
-
-static inline int mrtryupdate(mrlock_t *mrp)
-{
-       if (!down_write_trylock(&mrp->mr_lock))
-               return 0;
-#ifdef DEBUG
-       mrp->mr_writer = 1;
-#endif
-       return 1;
-}
-
-static inline void mrunlock_excl(mrlock_t *mrp)
-{
-#ifdef DEBUG
-       mrp->mr_writer = 0;
-#endif
-       up_write(&mrp->mr_lock);
-}
-
-static inline void mrunlock_shared(mrlock_t *mrp)
-{
-       up_read(&mrp->mr_lock);
-}
-
-static inline void mrdemote(mrlock_t *mrp)
-{
-#ifdef DEBUG
-       mrp->mr_writer = 0;
-#endif
-       downgrade_write(&mrp->mr_lock);
-}
-
-#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/linux-2.6/time.h b/fs/xfs/linux-2.6/time.h
deleted file mode 100644 (file)
index 387e695..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_TIME_H__
-#define __XFS_SUPPORT_TIME_H__
-
-#include <linux/sched.h>
-#include <linux/time.h>
-
-typedef struct timespec timespec_t;
-
-static inline void delay(long ticks)
-{
-       schedule_timeout_uninterruptible(ticks);
-}
-
-static inline void nanotime(struct timespec *tvp)
-{
-       *tvp = CURRENT_TIME;
-}
-
-#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c
deleted file mode 100644 (file)
index b6c4b37..0000000
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * Copyright (c) 2008, Christoph Hellwig
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-#include <linux/slab.h>
-#include <linux/xattr.h>
-#include <linux/posix_acl_xattr.h>
-
-
-/*
- * Locking scheme:
- *  - all ACL updates are protected by inode->i_mutex, which is taken before
- *    calling into this file.
- */
-
-STATIC struct posix_acl *
-xfs_acl_from_disk(struct xfs_acl *aclp)
-{
-       struct posix_acl_entry *acl_e;
-       struct posix_acl *acl;
-       struct xfs_acl_entry *ace;
-       int count, i;
-
-       count = be32_to_cpu(aclp->acl_cnt);
-
-       acl = posix_acl_alloc(count, GFP_KERNEL);
-       if (!acl)
-               return ERR_PTR(-ENOMEM);
-
-       for (i = 0; i < count; i++) {
-               acl_e = &acl->a_entries[i];
-               ace = &aclp->acl_entry[i];
-
-               /*
-                * The tag is 32 bits on disk and 16 bits in core.
-                *
-                * Because every access to it goes through the core
-                * format first this is not a problem.
-                */
-               acl_e->e_tag = be32_to_cpu(ace->ae_tag);
-               acl_e->e_perm = be16_to_cpu(ace->ae_perm);
-
-               switch (acl_e->e_tag) {
-               case ACL_USER:
-               case ACL_GROUP:
-                       acl_e->e_id = be32_to_cpu(ace->ae_id);
-                       break;
-               case ACL_USER_OBJ:
-               case ACL_GROUP_OBJ:
-               case ACL_MASK:
-               case ACL_OTHER:
-                       acl_e->e_id = ACL_UNDEFINED_ID;
-                       break;
-               default:
-                       goto fail;
-               }
-       }
-       return acl;
-
-fail:
-       posix_acl_release(acl);
-       return ERR_PTR(-EINVAL);
-}
-
-STATIC void
-xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
-{
-       const struct posix_acl_entry *acl_e;
-       struct xfs_acl_entry *ace;
-       int i;
-
-       aclp->acl_cnt = cpu_to_be32(acl->a_count);
-       for (i = 0; i < acl->a_count; i++) {
-               ace = &aclp->acl_entry[i];
-               acl_e = &acl->a_entries[i];
-
-               ace->ae_tag = cpu_to_be32(acl_e->e_tag);
-               ace->ae_id = cpu_to_be32(acl_e->e_id);
-               ace->ae_perm = cpu_to_be16(acl_e->e_perm);
-       }
-}
-
-struct posix_acl *
-xfs_get_acl(struct inode *inode, int type)
-{
-       struct xfs_inode *ip = XFS_I(inode);
-       struct posix_acl *acl;
-       struct xfs_acl *xfs_acl;
-       int len = sizeof(struct xfs_acl);
-       unsigned char *ea_name;
-       int error;
-
-       acl = get_cached_acl(inode, type);
-       if (acl != ACL_NOT_CACHED)
-               return acl;
-
-       trace_xfs_get_acl(ip);
-
-       switch (type) {
-       case ACL_TYPE_ACCESS:
-               ea_name = SGI_ACL_FILE;
-               break;
-       case ACL_TYPE_DEFAULT:
-               ea_name = SGI_ACL_DEFAULT;
-               break;
-       default:
-               BUG();
-       }
-
-       /*
-        * If we have a cached ACLs value just return it, not need to
-        * go out to the disk.
-        */
-
-       xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
-       if (!xfs_acl)
-               return ERR_PTR(-ENOMEM);
-
-       error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
-                                                       &len, ATTR_ROOT);
-       if (error) {
-               /*
-                * If the attribute doesn't exist make sure we have a negative
-                * cache entry, for any other error assume it is transient and
-                * leave the cache entry as ACL_NOT_CACHED.
-                */
-               if (error == -ENOATTR) {
-                       acl = NULL;
-                       goto out_update_cache;
-               }
-               goto out;
-       }
-
-       acl = xfs_acl_from_disk(xfs_acl);
-       if (IS_ERR(acl))
-               goto out;
-
- out_update_cache:
-       set_cached_acl(inode, type, acl);
- out:
-       kfree(xfs_acl);
-       return acl;
-}
-
-STATIC int
-xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
-{
-       struct xfs_inode *ip = XFS_I(inode);
-       unsigned char *ea_name;
-       int error;
-
-       if (S_ISLNK(inode->i_mode))
-               return -EOPNOTSUPP;
-
-       switch (type) {
-       case ACL_TYPE_ACCESS:
-               ea_name = SGI_ACL_FILE;
-               break;
-       case ACL_TYPE_DEFAULT:
-               if (!S_ISDIR(inode->i_mode))
-                       return acl ? -EACCES : 0;
-               ea_name = SGI_ACL_DEFAULT;
-               break;
-       default:
-               return -EINVAL;
-       }
-
-       if (acl) {
-               struct xfs_acl *xfs_acl;
-               int len;
-
-               xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
-               if (!xfs_acl)
-                       return -ENOMEM;
-
-               xfs_acl_to_disk(xfs_acl, acl);
-               len = sizeof(struct xfs_acl) -
-                       (sizeof(struct xfs_acl_entry) *
-                        (XFS_ACL_MAX_ENTRIES - acl->a_count));
-
-               error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
-                               len, ATTR_ROOT);
-
-               kfree(xfs_acl);
-       } else {
-               /*
-                * A NULL ACL argument means we want to remove the ACL.
-                */
-               error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
-
-               /*
-                * If the attribute didn't exist to start with that's fine.
-                */
-               if (error == -ENOATTR)
-                       error = 0;
-       }
-
-       if (!error)
-               set_cached_acl(inode, type, acl);
-       return error;
-}
-
-static int
-xfs_set_mode(struct inode *inode, umode_t mode)
-{
-       int error = 0;
-
-       if (mode != inode->i_mode) {
-               struct iattr iattr;
-
-               iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
-               iattr.ia_mode = mode;
-               iattr.ia_ctime = current_fs_time(inode->i_sb);
-
-               error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
-       }
-
-       return error;
-}
-
-static int
-xfs_acl_exists(struct inode *inode, unsigned char *name)
-{
-       int len = sizeof(struct xfs_acl);
-
-       return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
-                           ATTR_ROOT|ATTR_KERNOVAL) == 0);
-}
-
-int
-posix_acl_access_exists(struct inode *inode)
-{
-       return xfs_acl_exists(inode, SGI_ACL_FILE);
-}
-
-int
-posix_acl_default_exists(struct inode *inode)
-{
-       if (!S_ISDIR(inode->i_mode))
-               return 0;
-       return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
-}
-
-/*
- * No need for i_mutex because the inode is not yet exposed to the VFS.
- */
-int
-xfs_inherit_acl(struct inode *inode, struct posix_acl *acl)
-{
-       umode_t mode = inode->i_mode;
-       int error = 0, inherit = 0;
-
-       if (S_ISDIR(inode->i_mode)) {
-               error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
-               if (error)
-                       goto out;
-       }
-
-       error = posix_acl_create(&acl, GFP_KERNEL, &mode);
-       if (error < 0)
-               return error;
-
-       /*
-        * If posix_acl_create returns a positive value we need to
-        * inherit a permission that can't be represented using the Unix
-        * mode bits and we actually need to set an ACL.
-        */
-       if (error > 0)
-               inherit = 1;
-
-       error = xfs_set_mode(inode, mode);
-       if (error)
-               goto out;
-
-       if (inherit)
-               error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
-
-out:
-       posix_acl_release(acl);
-       return error;
-}
-
-int
-xfs_acl_chmod(struct inode *inode)
-{
-       struct posix_acl *acl;
-       int error;
-
-       if (S_ISLNK(inode->i_mode))
-               return -EOPNOTSUPP;
-
-       acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
-       if (IS_ERR(acl) || !acl)
-               return PTR_ERR(acl);
-
-       error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
-       if (error)
-               return error;
-
-       error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
-       posix_acl_release(acl);
-       return error;
-}
-
-static int
-xfs_xattr_acl_get(struct dentry *dentry, const char *name,
-               void *value, size_t size, int type)
-{
-       struct posix_acl *acl;
-       int error;
-
-       acl = xfs_get_acl(dentry->d_inode, type);
-       if (IS_ERR(acl))
-               return PTR_ERR(acl);
-       if (acl == NULL)
-               return -ENODATA;
-
-       error = posix_acl_to_xattr(acl, value, size);
-       posix_acl_release(acl);
-
-       return error;
-}
-
-static int
-xfs_xattr_acl_set(struct dentry *dentry, const char *name,
-               const void *value, size_t size, int flags, int type)
-{
-       struct inode *inode = dentry->d_inode;
-       struct posix_acl *acl = NULL;
-       int error = 0;
-
-       if (flags & XATTR_CREATE)
-               return -EINVAL;
-       if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
-               return value ? -EACCES : 0;
-       if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
-               return -EPERM;
-
-       if (!value)
-               goto set_acl;
-
-       acl = posix_acl_from_xattr(value, size);
-       if (!acl) {
-               /*
-                * acl_set_file(3) may request that we set default ACLs with
-                * zero length -- defend (gracefully) against that here.
-                */
-               goto out;
-       }
-       if (IS_ERR(acl)) {
-               error = PTR_ERR(acl);
-               goto out;
-       }
-
-       error = posix_acl_valid(acl);
-       if (error)
-               goto out_release;
-
-       error = -EINVAL;
-       if (acl->a_count > XFS_ACL_MAX_ENTRIES)
-               goto out_release;
-
-       if (type == ACL_TYPE_ACCESS) {
-               umode_t mode = inode->i_mode;
-               error = posix_acl_equiv_mode(acl, &mode);
-
-               if (error <= 0) {
-                       posix_acl_release(acl);
-                       acl = NULL;
-
-                       if (error < 0)
-                               return error;
-               }
-
-               error = xfs_set_mode(inode, mode);
-               if (error)
-                       goto out_release;
-       }
-
- set_acl:
-       error = xfs_set_acl(inode, type, acl);
- out_release:
-       posix_acl_release(acl);
- out:
-       return error;
-}
-
-const struct xattr_handler xfs_xattr_acl_access_handler = {
-       .prefix = POSIX_ACL_XATTR_ACCESS,
-       .flags  = ACL_TYPE_ACCESS,
-       .get    = xfs_xattr_acl_get,
-       .set    = xfs_xattr_acl_set,
-};
-
-const struct xattr_handler xfs_xattr_acl_default_handler = {
-       .prefix = POSIX_ACL_XATTR_DEFAULT,
-       .flags  = ACL_TYPE_DEFAULT,
-       .get    = xfs_xattr_acl_get,
-       .set    = xfs_xattr_acl_set,
-};
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
deleted file mode 100644 (file)
index 63e971e..0000000
+++ /dev/null
@@ -1,1499 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_trans.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
-#include "xfs_rw.h"
-#include "xfs_iomap.h"
-#include "xfs_vnodeops.h"
-#include "xfs_trace.h"
-#include "xfs_bmap.h"
-#include <linux/gfp.h>
-#include <linux/mpage.h>
-#include <linux/pagevec.h>
-#include <linux/writeback.h>
-
-
-/*
- * Prime number of hash buckets since address is used as the key.
- */
-#define NVSYNC         37
-#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
-static wait_queue_head_t xfs_ioend_wq[NVSYNC];
-
-void __init
-xfs_ioend_init(void)
-{
-       int i;
-
-       for (i = 0; i < NVSYNC; i++)
-               init_waitqueue_head(&xfs_ioend_wq[i]);
-}
-
-void
-xfs_ioend_wait(
-       xfs_inode_t     *ip)
-{
-       wait_queue_head_t *wq = to_ioend_wq(ip);
-
-       wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
-}
-
-STATIC void
-xfs_ioend_wake(
-       xfs_inode_t     *ip)
-{
-       if (atomic_dec_and_test(&ip->i_iocount))
-               wake_up(to_ioend_wq(ip));
-}
-
-void
-xfs_count_page_state(
-       struct page             *page,
-       int                     *delalloc,
-       int                     *unwritten)
-{
-       struct buffer_head      *bh, *head;
-
-       *delalloc = *unwritten = 0;
-
-       bh = head = page_buffers(page);
-       do {
-               if (buffer_unwritten(bh))
-                       (*unwritten) = 1;
-               else if (buffer_delay(bh))
-                       (*delalloc) = 1;
-       } while ((bh = bh->b_this_page) != head);
-}
-
-STATIC struct block_device *
-xfs_find_bdev_for_inode(
-       struct inode            *inode)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-
-       if (XFS_IS_REALTIME_INODE(ip))
-               return mp->m_rtdev_targp->bt_bdev;
-       else
-               return mp->m_ddev_targp->bt_bdev;
-}
-
-/*
- * We're now finished for good with this ioend structure.
- * Update the page state via the associated buffer_heads,
- * release holds on the inode and bio, and finally free
- * up memory.  Do not use the ioend after this.
- */
-STATIC void
-xfs_destroy_ioend(
-       xfs_ioend_t             *ioend)
-{
-       struct buffer_head      *bh, *next;
-       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
-
-       for (bh = ioend->io_buffer_head; bh; bh = next) {
-               next = bh->b_private;
-               bh->b_end_io(bh, !ioend->io_error);
-       }
-
-       /*
-        * Volume managers supporting multiple paths can send back ENODEV
-        * when the final path disappears.  In this case continuing to fill
-        * the page cache with dirty data which cannot be written out is
-        * evil, so prevent that.
-        */
-       if (unlikely(ioend->io_error == -ENODEV)) {
-               xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
-                                     __FILE__, __LINE__);
-       }
-
-       xfs_ioend_wake(ip);
-       mempool_free(ioend, xfs_ioend_pool);
-}
-
-/*
- * If the end of the current ioend is beyond the current EOF,
- * return the new EOF value, otherwise zero.
- */
-STATIC xfs_fsize_t
-xfs_ioend_new_eof(
-       xfs_ioend_t             *ioend)
-{
-       xfs_inode_t             *ip = XFS_I(ioend->io_inode);
-       xfs_fsize_t             isize;
-       xfs_fsize_t             bsize;
-
-       bsize = ioend->io_offset + ioend->io_size;
-       isize = MAX(ip->i_size, ip->i_new_size);
-       isize = MIN(isize, bsize);
-       return isize > ip->i_d.di_size ? isize : 0;
-}
-
-/*
- * Update on-disk file size now that data has been written to disk.  The
- * current in-memory file size is i_size.  If a write is beyond eof i_new_size
- * will be the intended file size until i_size is updated.  If this write does
- * not extend all the way to the valid file size then restrict this update to
- * the end of the write.
- *
- * This function does not block as blocking on the inode lock in IO completion
- * can lead to IO completion order dependency deadlocks.. If it can't get the
- * inode ilock it will return EAGAIN. Callers must handle this.
- */
-STATIC int
-xfs_setfilesize(
-       xfs_ioend_t             *ioend)
-{
-       xfs_inode_t             *ip = XFS_I(ioend->io_inode);
-       xfs_fsize_t             isize;
-
-       if (unlikely(ioend->io_error))
-               return 0;
-
-       if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
-               return EAGAIN;
-
-       isize = xfs_ioend_new_eof(ioend);
-       if (isize) {
-               trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
-               ip->i_d.di_size = isize;
-               xfs_mark_inode_dirty(ip);
-       }
-
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return 0;
-}
-
-/*
- * Schedule IO completion handling on the final put of an ioend.
- */
-STATIC void
-xfs_finish_ioend(
-       struct xfs_ioend        *ioend)
-{
-       if (atomic_dec_and_test(&ioend->io_remaining)) {
-               if (ioend->io_type == IO_UNWRITTEN)
-                       queue_work(xfsconvertd_workqueue, &ioend->io_work);
-               else
-                       queue_work(xfsdatad_workqueue, &ioend->io_work);
-       }
-}
-
-/*
- * IO write completion.
- */
-STATIC void
-xfs_end_io(
-       struct work_struct *work)
-{
-       xfs_ioend_t     *ioend = container_of(work, xfs_ioend_t, io_work);
-       struct xfs_inode *ip = XFS_I(ioend->io_inode);
-       int             error = 0;
-
-       /*
-        * For unwritten extents we need to issue transactions to convert a
-        * range to normal written extens after the data I/O has finished.
-        */
-       if (ioend->io_type == IO_UNWRITTEN &&
-           likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
-
-               error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
-                                                ioend->io_size);
-               if (error)
-                       ioend->io_error = error;
-       }
-
-       /*
-        * We might have to update the on-disk file size after extending
-        * writes.
-        */
-       error = xfs_setfilesize(ioend);
-       ASSERT(!error || error == EAGAIN);
-
-       /*
-        * If we didn't complete processing of the ioend, requeue it to the
-        * tail of the workqueue for another attempt later. Otherwise destroy
-        * it.
-        */
-       if (error == EAGAIN) {
-               atomic_inc(&ioend->io_remaining);
-               xfs_finish_ioend(ioend);
-               /* ensure we don't spin on blocked ioends */
-               delay(1);
-       } else {
-               if (ioend->io_iocb)
-                       aio_complete(ioend->io_iocb, ioend->io_result, 0);
-               xfs_destroy_ioend(ioend);
-       }
-}
-
-/*
- * Call IO completion handling in caller context on the final put of an ioend.
- */
-STATIC void
-xfs_finish_ioend_sync(
-       struct xfs_ioend        *ioend)
-{
-       if (atomic_dec_and_test(&ioend->io_remaining))
-               xfs_end_io(&ioend->io_work);
-}
-
-/*
- * Allocate and initialise an IO completion structure.
- * We need to track unwritten extent write completion here initially.
- * We'll need to extend this for updating the ondisk inode size later
- * (vs. incore size).
- */
-STATIC xfs_ioend_t *
-xfs_alloc_ioend(
-       struct inode            *inode,
-       unsigned int            type)
-{
-       xfs_ioend_t             *ioend;
-
-       ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
-
-       /*
-        * Set the count to 1 initially, which will prevent an I/O
-        * completion callback from happening before we have started
-        * all the I/O from calling the completion routine too early.
-        */
-       atomic_set(&ioend->io_remaining, 1);
-       ioend->io_error = 0;
-       ioend->io_list = NULL;
-       ioend->io_type = type;
-       ioend->io_inode = inode;
-       ioend->io_buffer_head = NULL;
-       ioend->io_buffer_tail = NULL;
-       atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
-       ioend->io_offset = 0;
-       ioend->io_size = 0;
-       ioend->io_iocb = NULL;
-       ioend->io_result = 0;
-
-       INIT_WORK(&ioend->io_work, xfs_end_io);
-       return ioend;
-}
-
-STATIC int
-xfs_map_blocks(
-       struct inode            *inode,
-       loff_t                  offset,
-       struct xfs_bmbt_irec    *imap,
-       int                     type,
-       int                     nonblocking)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       ssize_t                 count = 1 << inode->i_blkbits;
-       xfs_fileoff_t           offset_fsb, end_fsb;
-       int                     error = 0;
-       int                     bmapi_flags = XFS_BMAPI_ENTIRE;
-       int                     nimaps = 1;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
-
-       if (type == IO_UNWRITTEN)
-               bmapi_flags |= XFS_BMAPI_IGSTATE;
-
-       if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
-               if (nonblocking)
-                       return -XFS_ERROR(EAGAIN);
-               xfs_ilock(ip, XFS_ILOCK_SHARED);
-       }
-
-       ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
-              (ip->i_df.if_flags & XFS_IFEXTENTS));
-       ASSERT(offset <= mp->m_maxioffset);
-
-       if (offset + count > mp->m_maxioffset)
-               count = mp->m_maxioffset - offset;
-       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-       error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
-                         bmapi_flags,  NULL, 0, imap, &nimaps, NULL);
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-       if (error)
-               return -XFS_ERROR(error);
-
-       if (type == IO_DELALLOC &&
-           (!nimaps || isnullstartblock(imap->br_startblock))) {
-               error = xfs_iomap_write_allocate(ip, offset, count, imap);
-               if (!error)
-                       trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
-               return -XFS_ERROR(error);
-       }
-
-#ifdef DEBUG
-       if (type == IO_UNWRITTEN) {
-               ASSERT(nimaps);
-               ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-               ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-       }
-#endif
-       if (nimaps)
-               trace_xfs_map_blocks_found(ip, offset, count, type, imap);
-       return 0;
-}
-
-STATIC int
-xfs_imap_valid(
-       struct inode            *inode,
-       struct xfs_bmbt_irec    *imap,
-       xfs_off_t               offset)
-{
-       offset >>= inode->i_blkbits;
-
-       return offset >= imap->br_startoff &&
-               offset < imap->br_startoff + imap->br_blockcount;
-}
-
-/*
- * BIO completion handler for buffered IO.
- */
-STATIC void
-xfs_end_bio(
-       struct bio              *bio,
-       int                     error)
-{
-       xfs_ioend_t             *ioend = bio->bi_private;
-
-       ASSERT(atomic_read(&bio->bi_cnt) >= 1);
-       ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
-
-       /* Toss bio and pass work off to an xfsdatad thread */
-       bio->bi_private = NULL;
-       bio->bi_end_io = NULL;
-       bio_put(bio);
-
-       xfs_finish_ioend(ioend);
-}
-
-STATIC void
-xfs_submit_ioend_bio(
-       struct writeback_control *wbc,
-       xfs_ioend_t             *ioend,
-       struct bio              *bio)
-{
-       atomic_inc(&ioend->io_remaining);
-       bio->bi_private = ioend;
-       bio->bi_end_io = xfs_end_bio;
-
-       /*
-        * If the I/O is beyond EOF we mark the inode dirty immediately
-        * but don't update the inode size until I/O completion.
-        */
-       if (xfs_ioend_new_eof(ioend))
-               xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
-
-       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
-}
-
-STATIC struct bio *
-xfs_alloc_ioend_bio(
-       struct buffer_head      *bh)
-{
-       int                     nvecs = bio_get_nr_vecs(bh->b_bdev);
-       struct bio              *bio = bio_alloc(GFP_NOIO, nvecs);
-
-       ASSERT(bio->bi_private == NULL);
-       bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
-       bio->bi_bdev = bh->b_bdev;
-       return bio;
-}
-
-STATIC void
-xfs_start_buffer_writeback(
-       struct buffer_head      *bh)
-{
-       ASSERT(buffer_mapped(bh));
-       ASSERT(buffer_locked(bh));
-       ASSERT(!buffer_delay(bh));
-       ASSERT(!buffer_unwritten(bh));
-
-       mark_buffer_async_write(bh);
-       set_buffer_uptodate(bh);
-       clear_buffer_dirty(bh);
-}
-
-STATIC void
-xfs_start_page_writeback(
-       struct page             *page,
-       int                     clear_dirty,
-       int                     buffers)
-{
-       ASSERT(PageLocked(page));
-       ASSERT(!PageWriteback(page));
-       if (clear_dirty)
-               clear_page_dirty_for_io(page);
-       set_page_writeback(page);
-       unlock_page(page);
-       /* If no buffers on the page are to be written, finish it here */
-       if (!buffers)
-               end_page_writeback(page);
-}
-
-static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
-{
-       return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
-}
-
-/*
- * Submit all of the bios for all of the ioends we have saved up, covering the
- * initial writepage page and also any probed pages.
- *
- * Because we may have multiple ioends spanning a page, we need to start
- * writeback on all the buffers before we submit them for I/O. If we mark the
- * buffers as we got, then we can end up with a page that only has buffers
- * marked async write and I/O complete on can occur before we mark the other
- * buffers async write.
- *
- * The end result of this is that we trip a bug in end_page_writeback() because
- * we call it twice for the one page as the code in end_buffer_async_write()
- * assumes that all buffers on the page are started at the same time.
- *
- * The fix is two passes across the ioend list - one to start writeback on the
- * buffer_heads, and then submit them for I/O on the second pass.
- */
-STATIC void
-xfs_submit_ioend(
-       struct writeback_control *wbc,
-       xfs_ioend_t             *ioend)
-{
-       xfs_ioend_t             *head = ioend;
-       xfs_ioend_t             *next;
-       struct buffer_head      *bh;
-       struct bio              *bio;
-       sector_t                lastblock = 0;
-
-       /* Pass 1 - start writeback */
-       do {
-               next = ioend->io_list;
-               for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
-                       xfs_start_buffer_writeback(bh);
-       } while ((ioend = next) != NULL);
-
-       /* Pass 2 - submit I/O */
-       ioend = head;
-       do {
-               next = ioend->io_list;
-               bio = NULL;
-
-               for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
-
-                       if (!bio) {
- retry:
-                               bio = xfs_alloc_ioend_bio(bh);
-                       } else if (bh->b_blocknr != lastblock + 1) {
-                               xfs_submit_ioend_bio(wbc, ioend, bio);
-                               goto retry;
-                       }
-
-                       if (bio_add_buffer(bio, bh) != bh->b_size) {
-                               xfs_submit_ioend_bio(wbc, ioend, bio);
-                               goto retry;
-                       }
-
-                       lastblock = bh->b_blocknr;
-               }
-               if (bio)
-                       xfs_submit_ioend_bio(wbc, ioend, bio);
-               xfs_finish_ioend(ioend);
-       } while ((ioend = next) != NULL);
-}
-
-/*
- * Cancel submission of all buffer_heads so far in this endio.
- * Toss the endio too.  Only ever called for the initial page
- * in a writepage request, so only ever one page.
- */
-STATIC void
-xfs_cancel_ioend(
-       xfs_ioend_t             *ioend)
-{
-       xfs_ioend_t             *next;
-       struct buffer_head      *bh, *next_bh;
-
-       do {
-               next = ioend->io_list;
-               bh = ioend->io_buffer_head;
-               do {
-                       next_bh = bh->b_private;
-                       clear_buffer_async_write(bh);
-                       unlock_buffer(bh);
-               } while ((bh = next_bh) != NULL);
-
-               xfs_ioend_wake(XFS_I(ioend->io_inode));
-               mempool_free(ioend, xfs_ioend_pool);
-       } while ((ioend = next) != NULL);
-}
-
-/*
- * Test to see if we've been building up a completion structure for
- * earlier buffers -- if so, we try to append to this ioend if we
- * can, otherwise we finish off any current ioend and start another.
- * Return true if we've finished the given ioend.
- */
-STATIC void
-xfs_add_to_ioend(
-       struct inode            *inode,
-       struct buffer_head      *bh,
-       xfs_off_t               offset,
-       unsigned int            type,
-       xfs_ioend_t             **result,
-       int                     need_ioend)
-{
-       xfs_ioend_t             *ioend = *result;
-
-       if (!ioend || need_ioend || type != ioend->io_type) {
-               xfs_ioend_t     *previous = *result;
-
-               ioend = xfs_alloc_ioend(inode, type);
-               ioend->io_offset = offset;
-               ioend->io_buffer_head = bh;
-               ioend->io_buffer_tail = bh;
-               if (previous)
-                       previous->io_list = ioend;
-               *result = ioend;
-       } else {
-               ioend->io_buffer_tail->b_private = bh;
-               ioend->io_buffer_tail = bh;
-       }
-
-       bh->b_private = NULL;
-       ioend->io_size += bh->b_size;
-}
-
-STATIC void
-xfs_map_buffer(
-       struct inode            *inode,
-       struct buffer_head      *bh,
-       struct xfs_bmbt_irec    *imap,
-       xfs_off_t               offset)
-{
-       sector_t                bn;
-       struct xfs_mount        *m = XFS_I(inode)->i_mount;
-       xfs_off_t               iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
-       xfs_daddr_t             iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
-
-       ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-       ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
-       bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
-             ((offset - iomap_offset) >> inode->i_blkbits);
-
-       ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
-
-       bh->b_blocknr = bn;
-       set_buffer_mapped(bh);
-}
-
-STATIC void
-xfs_map_at_offset(
-       struct inode            *inode,
-       struct buffer_head      *bh,
-       struct xfs_bmbt_irec    *imap,
-       xfs_off_t               offset)
-{
-       ASSERT(imap->br_startblock != HOLESTARTBLOCK);
-       ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
-
-       xfs_map_buffer(inode, bh, imap, offset);
-       set_buffer_mapped(bh);
-       clear_buffer_delay(bh);
-       clear_buffer_unwritten(bh);
-}
-
-/*
- * Test if a given page is suitable for writing as part of an unwritten
- * or delayed allocate extent.
- */
-STATIC int
-xfs_is_delayed_page(
-       struct page             *page,
-       unsigned int            type)
-{
-       if (PageWriteback(page))
-               return 0;
-
-       if (page->mapping && page_has_buffers(page)) {
-               struct buffer_head      *bh, *head;
-               int                     acceptable = 0;
-
-               bh = head = page_buffers(page);
-               do {
-                       if (buffer_unwritten(bh))
-                               acceptable = (type == IO_UNWRITTEN);
-                       else if (buffer_delay(bh))
-                               acceptable = (type == IO_DELALLOC);
-                       else if (buffer_dirty(bh) && buffer_mapped(bh))
-                               acceptable = (type == IO_OVERWRITE);
-                       else
-                               break;
-               } while ((bh = bh->b_this_page) != head);
-
-               if (acceptable)
-                       return 1;
-       }
-
-       return 0;
-}
-
-/*
- * Allocate & map buffers for page given the extent map. Write it out.
- * except for the original page of a writepage, this is called on
- * delalloc/unwritten pages only, for the original page it is possible
- * that the page has no mapping at all.
- */
-STATIC int
-xfs_convert_page(
-       struct inode            *inode,
-       struct page             *page,
-       loff_t                  tindex,
-       struct xfs_bmbt_irec    *imap,
-       xfs_ioend_t             **ioendp,
-       struct writeback_control *wbc)
-{
-       struct buffer_head      *bh, *head;
-       xfs_off_t               end_offset;
-       unsigned long           p_offset;
-       unsigned int            type;
-       int                     len, page_dirty;
-       int                     count = 0, done = 0, uptodate = 1;
-       xfs_off_t               offset = page_offset(page);
-
-       if (page->index != tindex)
-               goto fail;
-       if (!trylock_page(page))
-               goto fail;
-       if (PageWriteback(page))
-               goto fail_unlock_page;
-       if (page->mapping != inode->i_mapping)
-               goto fail_unlock_page;
-       if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
-               goto fail_unlock_page;
-
-       /*
-        * page_dirty is initially a count of buffers on the page before
-        * EOF and is decremented as we move each into a cleanable state.
-        *
-        * Derivation:
-        *
-        * End offset is the highest offset that this page should represent.
-        * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
-        * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
-        * hence give us the correct page_dirty count. On any other page,
-        * it will be zero and in that case we need page_dirty to be the
-        * count of buffers on the page.
-        */
-       end_offset = min_t(unsigned long long,
-                       (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
-                       i_size_read(inode));
-
-       len = 1 << inode->i_blkbits;
-       p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
-                                       PAGE_CACHE_SIZE);
-       p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
-       page_dirty = p_offset / len;
-
-       bh = head = page_buffers(page);
-       do {
-               if (offset >= end_offset)
-                       break;
-               if (!buffer_uptodate(bh))
-                       uptodate = 0;
-               if (!(PageUptodate(page) || buffer_uptodate(bh))) {
-                       done = 1;
-                       continue;
-               }
-
-               if (buffer_unwritten(bh) || buffer_delay(bh) ||
-                   buffer_mapped(bh)) {
-                       if (buffer_unwritten(bh))
-                               type = IO_UNWRITTEN;
-                       else if (buffer_delay(bh))
-                               type = IO_DELALLOC;
-                       else
-                               type = IO_OVERWRITE;
-
-                       if (!xfs_imap_valid(inode, imap, offset)) {
-                               done = 1;
-                               continue;
-                       }
-
-                       lock_buffer(bh);
-                       if (type != IO_OVERWRITE)
-                               xfs_map_at_offset(inode, bh, imap, offset);
-                       xfs_add_to_ioend(inode, bh, offset, type,
-                                        ioendp, done);
-
-                       page_dirty--;
-                       count++;
-               } else {
-                       done = 1;
-               }
-       } while (offset += len, (bh = bh->b_this_page) != head);
-
-       if (uptodate && bh == head)
-               SetPageUptodate(page);
-
-       if (count) {
-               if (--wbc->nr_to_write <= 0 &&
-                   wbc->sync_mode == WB_SYNC_NONE)
-                       done = 1;
-       }
-       xfs_start_page_writeback(page, !page_dirty, count);
-
-       return done;
- fail_unlock_page:
-       unlock_page(page);
- fail:
-       return 1;
-}
-
-/*
- * Convert & write out a cluster of pages in the same extent as defined
- * by mp and following the start page.
- */
-STATIC void
-xfs_cluster_write(
-       struct inode            *inode,
-       pgoff_t                 tindex,
-       struct xfs_bmbt_irec    *imap,
-       xfs_ioend_t             **ioendp,
-       struct writeback_control *wbc,
-       pgoff_t                 tlast)
-{
-       struct pagevec          pvec;
-       int                     done = 0, i;
-
-       pagevec_init(&pvec, 0);
-       while (!done && tindex <= tlast) {
-               unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
-
-               if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
-                       break;
-
-               for (i = 0; i < pagevec_count(&pvec); i++) {
-                       done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-                                       imap, ioendp, wbc);
-                       if (done)
-                               break;
-               }
-
-               pagevec_release(&pvec);
-               cond_resched();
-       }
-}
-
-STATIC void
-xfs_vm_invalidatepage(
-       struct page             *page,
-       unsigned long           offset)
-{
-       trace_xfs_invalidatepage(page->mapping->host, page, offset);
-       block_invalidatepage(page, offset);
-}
-
-/*
- * If the page has delalloc buffers on it, we need to punch them out before we
- * invalidate the page. If we don't, we leave a stale delalloc mapping on the
- * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
- * is done on that same region - the delalloc extent is returned when none is
- * supposed to be there.
- *
- * We prevent this by truncating away the delalloc regions on the page before
- * invalidating it. Because they are delalloc, we can do this without needing a
- * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
- * truncation without a transaction as there is no space left for block
- * reservation (typically why we see a ENOSPC in writeback).
- *
- * This is not a performance critical path, so for now just do the punching a
- * buffer head at a time.
- */
-STATIC void
-xfs_aops_discard_page(
-       struct page             *page)
-{
-       struct inode            *inode = page->mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct buffer_head      *bh, *head;
-       loff_t                  offset = page_offset(page);
-
-       if (!xfs_is_delayed_page(page, IO_DELALLOC))
-               goto out_invalidate;
-
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               goto out_invalidate;
-
-       xfs_alert(ip->i_mount,
-               "page discard on page %p, inode 0x%llx, offset %llu.",
-                       page, ip->i_ino, offset);
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       bh = head = page_buffers(page);
-       do {
-               int             error;
-               xfs_fileoff_t   start_fsb;
-
-               if (!buffer_delay(bh))
-                       goto next_buffer;
-
-               start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
-               error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
-               if (error) {
-                       /* something screwed, just bail */
-                       if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                               xfs_alert(ip->i_mount,
-                       "page discard unable to remove delalloc mapping.");
-                       }
-                       break;
-               }
-next_buffer:
-               offset += 1 << inode->i_blkbits;
-
-       } while ((bh = bh->b_this_page) != head);
-
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_invalidate:
-       xfs_vm_invalidatepage(page, 0);
-       return;
-}
-
-/*
- * Write out a dirty page.
- *
- * For delalloc space on the page we need to allocate space and flush it.
- * For unwritten space on the page we need to start the conversion to
- * regular allocated space.
- * For any other dirty buffer heads on the page we should flush them.
- */
-STATIC int
-xfs_vm_writepage(
-       struct page             *page,
-       struct writeback_control *wbc)
-{
-       struct inode            *inode = page->mapping->host;
-       struct buffer_head      *bh, *head;
-       struct xfs_bmbt_irec    imap;
-       xfs_ioend_t             *ioend = NULL, *iohead = NULL;
-       loff_t                  offset;
-       unsigned int            type;
-       __uint64_t              end_offset;
-       pgoff_t                 end_index, last_index;
-       ssize_t                 len;
-       int                     err, imap_valid = 0, uptodate = 1;
-       int                     count = 0;
-       int                     nonblocking = 0;
-
-       trace_xfs_writepage(inode, page, 0);
-
-       ASSERT(page_has_buffers(page));
-
-       /*
-        * Refuse to write the page out if we are called from reclaim context.
-        *
-        * This avoids stack overflows when called from deeply used stacks in
-        * random callers for direct reclaim or memcg reclaim.  We explicitly
-        * allow reclaim from kswapd as the stack usage there is relatively low.
-        *
-        * This should really be done by the core VM, but until that happens
-        * filesystems like XFS, btrfs and ext4 have to take care of this
-        * by themselves.
-        */
-       if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
-               goto redirty;
-
-       /*
-        * Given that we do not allow direct reclaim to call us, we should
-        * never be called while in a filesystem transaction.
-        */
-       if (WARN_ON(current->flags & PF_FSTRANS))
-               goto redirty;
-
-       /* Is this page beyond the end of the file? */
-       offset = i_size_read(inode);
-       end_index = offset >> PAGE_CACHE_SHIFT;
-       last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
-       if (page->index >= end_index) {
-               if ((page->index >= end_index + 1) ||
-                   !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
-                       unlock_page(page);
-                       return 0;
-               }
-       }
-
-       end_offset = min_t(unsigned long long,
-                       (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
-                       offset);
-       len = 1 << inode->i_blkbits;
-
-       bh = head = page_buffers(page);
-       offset = page_offset(page);
-       type = IO_OVERWRITE;
-
-       if (wbc->sync_mode == WB_SYNC_NONE)
-               nonblocking = 1;
-
-       do {
-               int new_ioend = 0;
-
-               if (offset >= end_offset)
-                       break;
-               if (!buffer_uptodate(bh))
-                       uptodate = 0;
-
-               /*
-                * set_page_dirty dirties all buffers in a page, independent
-                * of their state.  The dirty state however is entirely
-                * meaningless for holes (!mapped && uptodate), so skip
-                * buffers covering holes here.
-                */
-               if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-                       imap_valid = 0;
-                       continue;
-               }
-
-               if (buffer_unwritten(bh)) {
-                       if (type != IO_UNWRITTEN) {
-                               type = IO_UNWRITTEN;
-                               imap_valid = 0;
-                       }
-               } else if (buffer_delay(bh)) {
-                       if (type != IO_DELALLOC) {
-                               type = IO_DELALLOC;
-                               imap_valid = 0;
-                       }
-               } else if (buffer_uptodate(bh)) {
-                       if (type != IO_OVERWRITE) {
-                               type = IO_OVERWRITE;
-                               imap_valid = 0;
-                       }
-               } else {
-                       if (PageUptodate(page)) {
-                               ASSERT(buffer_mapped(bh));
-                               imap_valid = 0;
-                       }
-                       continue;
-               }
-
-               if (imap_valid)
-                       imap_valid = xfs_imap_valid(inode, &imap, offset);
-               if (!imap_valid) {
-                       /*
-                        * If we didn't have a valid mapping then we need to
-                        * put the new mapping into a separate ioend structure.
-                        * This ensures non-contiguous extents always have
-                        * separate ioends, which is particularly important
-                        * for unwritten extent conversion at I/O completion
-                        * time.
-                        */
-                       new_ioend = 1;
-                       err = xfs_map_blocks(inode, offset, &imap, type,
-                                            nonblocking);
-                       if (err)
-                               goto error;
-                       imap_valid = xfs_imap_valid(inode, &imap, offset);
-               }
-               if (imap_valid) {
-                       lock_buffer(bh);
-                       if (type != IO_OVERWRITE)
-                               xfs_map_at_offset(inode, bh, &imap, offset);
-                       xfs_add_to_ioend(inode, bh, offset, type, &ioend,
-                                        new_ioend);
-                       count++;
-               }
-
-               if (!iohead)
-                       iohead = ioend;
-
-       } while (offset += len, ((bh = bh->b_this_page) != head));
-
-       if (uptodate && bh == head)
-               SetPageUptodate(page);
-
-       xfs_start_page_writeback(page, 1, count);
-
-       if (ioend && imap_valid) {
-               xfs_off_t               end_index;
-
-               end_index = imap.br_startoff + imap.br_blockcount;
-
-               /* to bytes */
-               end_index <<= inode->i_blkbits;
-
-               /* to pages */
-               end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
-
-               /* check against file size */
-               if (end_index > last_index)
-                       end_index = last_index;
-
-               xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-                                 wbc, end_index);
-       }
-
-       if (iohead)
-               xfs_submit_ioend(wbc, iohead);
-
-       return 0;
-
-error:
-       if (iohead)
-               xfs_cancel_ioend(iohead);
-
-       if (err == -EAGAIN)
-               goto redirty;
-
-       xfs_aops_discard_page(page);
-       ClearPageUptodate(page);
-       unlock_page(page);
-       return err;
-
-redirty:
-       redirty_page_for_writepage(wbc, page);
-       unlock_page(page);
-       return 0;
-}
-
-STATIC int
-xfs_vm_writepages(
-       struct address_space    *mapping,
-       struct writeback_control *wbc)
-{
-       xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
-       return generic_writepages(mapping, wbc);
-}
-
-/*
- * Called to move a page into cleanable state - and from there
- * to be released. The page should already be clean. We always
- * have buffer heads in this call.
- *
- * Returns 1 if the page is ok to release, 0 otherwise.
- */
-STATIC int
-xfs_vm_releasepage(
-       struct page             *page,
-       gfp_t                   gfp_mask)
-{
-       int                     delalloc, unwritten;
-
-       trace_xfs_releasepage(page->mapping->host, page, 0);
-
-       xfs_count_page_state(page, &delalloc, &unwritten);
-
-       if (WARN_ON(delalloc))
-               return 0;
-       if (WARN_ON(unwritten))
-               return 0;
-
-       return try_to_free_buffers(page);
-}
-
-STATIC int
-__xfs_get_blocks(
-       struct inode            *inode,
-       sector_t                iblock,
-       struct buffer_head      *bh_result,
-       int                     create,
-       int                     direct)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       xfs_fileoff_t           offset_fsb, end_fsb;
-       int                     error = 0;
-       int                     lockmode = 0;
-       struct xfs_bmbt_irec    imap;
-       int                     nimaps = 1;
-       xfs_off_t               offset;
-       ssize_t                 size;
-       int                     new = 0;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
-
-       offset = (xfs_off_t)iblock << inode->i_blkbits;
-       ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
-       size = bh_result->b_size;
-
-       if (!create && direct && offset >= i_size_read(inode))
-               return 0;
-
-       if (create) {
-               lockmode = XFS_ILOCK_EXCL;
-               xfs_ilock(ip, lockmode);
-       } else {
-               lockmode = xfs_ilock_map_shared(ip);
-       }
-
-       ASSERT(offset <= mp->m_maxioffset);
-       if (offset + size > mp->m_maxioffset)
-               size = mp->m_maxioffset - offset;
-       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
-       offset_fsb = XFS_B_TO_FSBT(mp, offset);
-
-       error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
-                         XFS_BMAPI_ENTIRE,  NULL, 0, &imap, &nimaps, NULL);
-       if (error)
-               goto out_unlock;
-
-       if (create &&
-           (!nimaps ||
-            (imap.br_startblock == HOLESTARTBLOCK ||
-             imap.br_startblock == DELAYSTARTBLOCK))) {
-               if (direct) {
-                       error = xfs_iomap_write_direct(ip, offset, size,
-                                                      &imap, nimaps);
-               } else {
-                       error = xfs_iomap_write_delay(ip, offset, size, &imap);
-               }
-               if (error)
-                       goto out_unlock;
-
-               trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
-       } else if (nimaps) {
-               trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
-       } else {
-               trace_xfs_get_blocks_notfound(ip, offset, size);
-               goto out_unlock;
-       }
-       xfs_iunlock(ip, lockmode);
-
-       if (imap.br_startblock != HOLESTARTBLOCK &&
-           imap.br_startblock != DELAYSTARTBLOCK) {
-               /*
-                * For unwritten extents do not report a disk address on
-                * the read case (treat as if we're reading into a hole).
-                */
-               if (create || !ISUNWRITTEN(&imap))
-                       xfs_map_buffer(inode, bh_result, &imap, offset);
-               if (create && ISUNWRITTEN(&imap)) {
-                       if (direct)
-                               bh_result->b_private = inode;
-                       set_buffer_unwritten(bh_result);
-               }
-       }
-
-       /*
-        * If this is a realtime file, data may be on a different device.
-        * to that pointed to from the buffer_head b_bdev currently.
-        */
-       bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
-
-       /*
-        * If we previously allocated a block out beyond eof and we are now
-        * coming back to use it then we will need to flag it as new even if it
-        * has a disk address.
-        *
-        * With sub-block writes into unwritten extents we also need to mark
-        * the buffer as new so that the unwritten parts of the buffer gets
-        * correctly zeroed.
-        */
-       if (create &&
-           ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
-            (offset >= i_size_read(inode)) ||
-            (new || ISUNWRITTEN(&imap))))
-               set_buffer_new(bh_result);
-
-       if (imap.br_startblock == DELAYSTARTBLOCK) {
-               BUG_ON(direct);
-               if (create) {
-                       set_buffer_uptodate(bh_result);
-                       set_buffer_mapped(bh_result);
-                       set_buffer_delay(bh_result);
-               }
-       }
-
-       /*
-        * If this is O_DIRECT or the mpage code calling tell them how large
-        * the mapping is, so that we can avoid repeated get_blocks calls.
-        */
-       if (direct || size > (1 << inode->i_blkbits)) {
-               xfs_off_t               mapping_size;
-
-               mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
-               mapping_size <<= inode->i_blkbits;
-
-               ASSERT(mapping_size > 0);
-               if (mapping_size > size)
-                       mapping_size = size;
-               if (mapping_size > LONG_MAX)
-                       mapping_size = LONG_MAX;
-
-               bh_result->b_size = mapping_size;
-       }
-
-       return 0;
-
-out_unlock:
-       xfs_iunlock(ip, lockmode);
-       return -error;
-}
-
-int
-xfs_get_blocks(
-       struct inode            *inode,
-       sector_t                iblock,
-       struct buffer_head      *bh_result,
-       int                     create)
-{
-       return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
-}
-
-STATIC int
-xfs_get_blocks_direct(
-       struct inode            *inode,
-       sector_t                iblock,
-       struct buffer_head      *bh_result,
-       int                     create)
-{
-       return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
-}
-
-/*
- * Complete a direct I/O write request.
- *
- * If the private argument is non-NULL __xfs_get_blocks signals us that we
- * need to issue a transaction to convert the range from unwritten to written
- * extents.  In case this is regular synchronous I/O we just call xfs_end_io
- * to do this and we are done.  But in case this was a successful AIO
- * request this handler is called from interrupt context, from which we
- * can't start transactions.  In that case offload the I/O completion to
- * the workqueues we also use for buffered I/O completion.
- */
-STATIC void
-xfs_end_io_direct_write(
-       struct kiocb            *iocb,
-       loff_t                  offset,
-       ssize_t                 size,
-       void                    *private,
-       int                     ret,
-       bool                    is_async)
-{
-       struct xfs_ioend        *ioend = iocb->private;
-
-       /*
-        * blockdev_direct_IO can return an error even after the I/O
-        * completion handler was called.  Thus we need to protect
-        * against double-freeing.
-        */
-       iocb->private = NULL;
-
-       ioend->io_offset = offset;
-       ioend->io_size = size;
-       if (private && size > 0)
-               ioend->io_type = IO_UNWRITTEN;
-
-       if (is_async) {
-               /*
-                * If we are converting an unwritten extent we need to delay
-                * the AIO completion until after the unwrittent extent
-                * conversion has completed, otherwise do it ASAP.
-                */
-               if (ioend->io_type == IO_UNWRITTEN) {
-                       ioend->io_iocb = iocb;
-                       ioend->io_result = ret;
-               } else {
-                       aio_complete(iocb, ret, 0);
-               }
-               xfs_finish_ioend(ioend);
-       } else {
-               xfs_finish_ioend_sync(ioend);
-       }
-
-       /* XXX: probably should move into the real I/O completion handler */
-       inode_dio_done(ioend->io_inode);
-}
-
-STATIC ssize_t
-xfs_vm_direct_IO(
-       int                     rw,
-       struct kiocb            *iocb,
-       const struct iovec      *iov,
-       loff_t                  offset,
-       unsigned long           nr_segs)
-{
-       struct inode            *inode = iocb->ki_filp->f_mapping->host;
-       struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
-       ssize_t                 ret;
-
-       if (rw & WRITE) {
-               iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
-
-               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
-                                           offset, nr_segs,
-                                           xfs_get_blocks_direct,
-                                           xfs_end_io_direct_write, NULL, 0);
-               if (ret != -EIOCBQUEUED && iocb->private)
-                       xfs_destroy_ioend(iocb->private);
-       } else {
-               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
-                                           offset, nr_segs,
-                                           xfs_get_blocks_direct,
-                                           NULL, NULL, 0);
-       }
-
-       return ret;
-}
-
-STATIC void
-xfs_vm_write_failed(
-       struct address_space    *mapping,
-       loff_t                  to)
-{
-       struct inode            *inode = mapping->host;
-
-       if (to > inode->i_size) {
-               /*
-                * punch out the delalloc blocks we have already allocated. We
-                * don't call xfs_setattr() to do this as we may be in the
-                * middle of a multi-iovec write and so the vfs inode->i_size
-                * will not match the xfs ip->i_size and so it will zero too
-                * much. Hence we jus truncate the page cache to zero what is
-                * necessary and punch the delalloc blocks directly.
-                */
-               struct xfs_inode        *ip = XFS_I(inode);
-               xfs_fileoff_t           start_fsb;
-               xfs_fileoff_t           end_fsb;
-               int                     error;
-
-               truncate_pagecache(inode, to, inode->i_size);
-
-               /*
-                * Check if there are any blocks that are outside of i_size
-                * that need to be trimmed back.
-                */
-               start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
-               end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
-               if (end_fsb <= start_fsb)
-                       return;
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-               error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
-                                                       end_fsb - start_fsb);
-               if (error) {
-                       /* something screwed, just bail */
-                       if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-                               xfs_alert(ip->i_mount,
-                       "xfs_vm_write_failed: unable to clean up ino %lld",
-                                               ip->i_ino);
-                       }
-               }
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-}
-
-STATIC int
-xfs_vm_write_begin(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                flags,
-       struct page             **pagep,
-       void                    **fsdata)
-{
-       int                     ret;
-
-       ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
-                               pagep, xfs_get_blocks);
-       if (unlikely(ret))
-               xfs_vm_write_failed(mapping, pos + len);
-       return ret;
-}
-
-STATIC int
-xfs_vm_write_end(
-       struct file             *file,
-       struct address_space    *mapping,
-       loff_t                  pos,
-       unsigned                len,
-       unsigned                copied,
-       struct page             *page,
-       void                    *fsdata)
-{
-       int                     ret;
-
-       ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-       if (unlikely(ret < len))
-               xfs_vm_write_failed(mapping, pos + len);
-       return ret;
-}
-
-STATIC sector_t
-xfs_vm_bmap(
-       struct address_space    *mapping,
-       sector_t                block)
-{
-       struct inode            *inode = (struct inode *)mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-
-       trace_xfs_vm_bmap(XFS_I(inode));
-       xfs_ilock(ip, XFS_IOLOCK_SHARED);
-       xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-       return generic_block_bmap(mapping, block, xfs_get_blocks);
-}
-
-STATIC int
-xfs_vm_readpage(
-       struct file             *unused,
-       struct page             *page)
-{
-       return mpage_readpage(page, xfs_get_blocks);
-}
-
-STATIC int
-xfs_vm_readpages(
-       struct file             *unused,
-       struct address_space    *mapping,
-       struct list_head        *pages,
-       unsigned                nr_pages)
-{
-       return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
-}
-
-const struct address_space_operations xfs_address_space_operations = {
-       .readpage               = xfs_vm_readpage,
-       .readpages              = xfs_vm_readpages,
-       .writepage              = xfs_vm_writepage,
-       .writepages             = xfs_vm_writepages,
-       .releasepage            = xfs_vm_releasepage,
-       .invalidatepage         = xfs_vm_invalidatepage,
-       .write_begin            = xfs_vm_write_begin,
-       .write_end              = xfs_vm_write_end,
-       .bmap                   = xfs_vm_bmap,
-       .direct_IO              = xfs_vm_direct_IO,
-       .migratepage            = buffer_migrate_page,
-       .is_partially_uptodate  = block_is_partially_uptodate,
-       .error_remove_page      = generic_error_remove_page,
-};
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
deleted file mode 100644 (file)
index 71f721e..0000000
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2005-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_AOPS_H__
-#define __XFS_AOPS_H__
-
-extern struct workqueue_struct *xfsdatad_workqueue;
-extern struct workqueue_struct *xfsconvertd_workqueue;
-extern mempool_t *xfs_ioend_pool;
-
-/*
- * Types of I/O for bmap clustering and I/O completion tracking.
- */
-enum {
-       IO_DIRECT = 0,  /* special case for direct I/O ioends */
-       IO_DELALLOC,    /* mapping covers delalloc region */
-       IO_UNWRITTEN,   /* mapping covers allocated but uninitialized data */
-       IO_OVERWRITE,   /* mapping covers already allocated extent */
-};
-
-#define XFS_IO_TYPES \
-       { 0,                    "" }, \
-       { IO_DELALLOC,          "delalloc" }, \
-       { IO_UNWRITTEN,         "unwritten" }, \
-       { IO_OVERWRITE,         "overwrite" }
-
-/*
- * xfs_ioend struct manages large extent writes for XFS.
- * It can manage several multi-page bio's at once.
- */
-typedef struct xfs_ioend {
-       struct xfs_ioend        *io_list;       /* next ioend in chain */
-       unsigned int            io_type;        /* delalloc / unwritten */
-       int                     io_error;       /* I/O error code */
-       atomic_t                io_remaining;   /* hold count */
-       struct inode            *io_inode;      /* file being written to */
-       struct buffer_head      *io_buffer_head;/* buffer linked list head */
-       struct buffer_head      *io_buffer_tail;/* buffer linked list tail */
-       size_t                  io_size;        /* size of the extent */
-       xfs_off_t               io_offset;      /* offset in the file */
-       struct work_struct      io_work;        /* xfsdatad work queue */
-       struct kiocb            *io_iocb;
-       int                     io_result;
-} xfs_ioend_t;
-
-extern const struct address_space_operations xfs_address_space_operations;
-extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
-
-extern void xfs_ioend_init(void);
-extern void xfs_ioend_wait(struct xfs_inode *);
-
-extern void xfs_count_page_state(struct page *, int *, int *);
-
-#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
deleted file mode 100644 (file)
index c57836d..0000000
+++ /dev/null
@@ -1,1876 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include <linux/stddef.h>
-#include <linux/errno.h>
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/init.h>
-#include <linux/vmalloc.h>
-#include <linux/bio.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include <linux/workqueue.h>
-#include <linux/percpu.h>
-#include <linux/blkdev.h>
-#include <linux/hash.h>
-#include <linux/kthread.h>
-#include <linux/migrate.h>
-#include <linux/backing-dev.h>
-#include <linux/freezer.h>
-
-#include "xfs_sb.h"
-#include "xfs_inum.h"
-#include "xfs_log.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_trace.h"
-
-static kmem_zone_t *xfs_buf_zone;
-STATIC int xfsbufd(void *);
-STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
-
-static struct workqueue_struct *xfslogd_workqueue;
-struct workqueue_struct *xfsdatad_workqueue;
-struct workqueue_struct *xfsconvertd_workqueue;
-
-#ifdef XFS_BUF_LOCK_TRACKING
-# define XB_SET_OWNER(bp)      ((bp)->b_last_holder = current->pid)
-# define XB_CLEAR_OWNER(bp)    ((bp)->b_last_holder = -1)
-# define XB_GET_OWNER(bp)      ((bp)->b_last_holder)
-#else
-# define XB_SET_OWNER(bp)      do { } while (0)
-# define XB_CLEAR_OWNER(bp)    do { } while (0)
-# define XB_GET_OWNER(bp)      do { } while (0)
-#endif
-
-#define xb_to_gfp(flags) \
-       ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
-         ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
-
-#define xb_to_km(flags) \
-        (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
-
-#define xfs_buf_allocate(flags) \
-       kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
-#define xfs_buf_deallocate(bp) \
-       kmem_zone_free(xfs_buf_zone, (bp));
-
-static inline int
-xfs_buf_is_vmapped(
-       struct xfs_buf  *bp)
-{
-       /*
-        * Return true if the buffer is vmapped.
-        *
-        * The XBF_MAPPED flag is set if the buffer should be mapped, but the
-        * code is clever enough to know it doesn't have to map a single page,
-        * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
-        */
-       return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
-}
-
-static inline int
-xfs_buf_vmap_len(
-       struct xfs_buf  *bp)
-{
-       return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
-}
-
-/*
- * xfs_buf_lru_add - add a buffer to the LRU.
- *
- * The LRU takes a new reference to the buffer so that it will only be freed
- * once the shrinker takes the buffer off the LRU.
- */
-STATIC void
-xfs_buf_lru_add(
-       struct xfs_buf  *bp)
-{
-       struct xfs_buftarg *btp = bp->b_target;
-
-       spin_lock(&btp->bt_lru_lock);
-       if (list_empty(&bp->b_lru)) {
-               atomic_inc(&bp->b_hold);
-               list_add_tail(&bp->b_lru, &btp->bt_lru);
-               btp->bt_lru_nr++;
-       }
-       spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
- * xfs_buf_lru_del - remove a buffer from the LRU
- *
- * The unlocked check is safe here because it only occurs when there are not
- * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
- * to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
- * bt_lru_lock.
- */
-STATIC void
-xfs_buf_lru_del(
-       struct xfs_buf  *bp)
-{
-       struct xfs_buftarg *btp = bp->b_target;
-
-       if (list_empty(&bp->b_lru))
-               return;
-
-       spin_lock(&btp->bt_lru_lock);
-       if (!list_empty(&bp->b_lru)) {
-               list_del_init(&bp->b_lru);
-               btp->bt_lru_nr--;
-       }
-       spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
- * When we mark a buffer stale, we remove the buffer from the LRU and clear the
- * b_lru_ref count so that the buffer is freed immediately when the buffer
- * reference count falls to zero. If the buffer is already on the LRU, we need
- * to remove the reference that LRU holds on the buffer.
- *
- * This prevents build-up of stale buffers on the LRU.
- */
-void
-xfs_buf_stale(
-       struct xfs_buf  *bp)
-{
-       bp->b_flags |= XBF_STALE;
-       atomic_set(&(bp)->b_lru_ref, 0);
-       if (!list_empty(&bp->b_lru)) {
-               struct xfs_buftarg *btp = bp->b_target;
-
-               spin_lock(&btp->bt_lru_lock);
-               if (!list_empty(&bp->b_lru)) {
-                       list_del_init(&bp->b_lru);
-                       btp->bt_lru_nr--;
-                       atomic_dec(&bp->b_hold);
-               }
-               spin_unlock(&btp->bt_lru_lock);
-       }
-       ASSERT(atomic_read(&bp->b_hold) >= 1);
-}
-
-STATIC void
-_xfs_buf_initialize(
-       xfs_buf_t               *bp,
-       xfs_buftarg_t           *target,
-       xfs_off_t               range_base,
-       size_t                  range_length,
-       xfs_buf_flags_t         flags)
-{
-       /*
-        * We don't want certain flags to appear in b_flags.
-        */
-       flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
-
-       memset(bp, 0, sizeof(xfs_buf_t));
-       atomic_set(&bp->b_hold, 1);
-       atomic_set(&bp->b_lru_ref, 1);
-       init_completion(&bp->b_iowait);
-       INIT_LIST_HEAD(&bp->b_lru);
-       INIT_LIST_HEAD(&bp->b_list);
-       RB_CLEAR_NODE(&bp->b_rbnode);
-       sema_init(&bp->b_sema, 0); /* held, no waiters */
-       XB_SET_OWNER(bp);
-       bp->b_target = target;
-       bp->b_file_offset = range_base;
-       /*
-        * Set buffer_length and count_desired to the same value initially.
-        * I/O routines should use count_desired, which will be the same in
-        * most cases but may be reset (e.g. XFS recovery).
-        */
-       bp->b_buffer_length = bp->b_count_desired = range_length;
-       bp->b_flags = flags;
-       bp->b_bn = XFS_BUF_DADDR_NULL;
-       atomic_set(&bp->b_pin_count, 0);
-       init_waitqueue_head(&bp->b_waiters);
-
-       XFS_STATS_INC(xb_create);
-
-       trace_xfs_buf_init(bp, _RET_IP_);
-}
-
-/*
- *     Allocate a page array capable of holding a specified number
- *     of pages, and point the page buf at it.
- */
-STATIC int
-_xfs_buf_get_pages(
-       xfs_buf_t               *bp,
-       int                     page_count,
-       xfs_buf_flags_t         flags)
-{
-       /* Make sure that we have a page list */
-       if (bp->b_pages == NULL) {
-               bp->b_offset = xfs_buf_poff(bp->b_file_offset);
-               bp->b_page_count = page_count;
-               if (page_count <= XB_PAGES) {
-                       bp->b_pages = bp->b_page_array;
-               } else {
-                       bp->b_pages = kmem_alloc(sizeof(struct page *) *
-                                       page_count, xb_to_km(flags));
-                       if (bp->b_pages == NULL)
-                               return -ENOMEM;
-               }
-               memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
-       }
-       return 0;
-}
-
-/*
- *     Frees b_pages if it was allocated.
- */
-STATIC void
-_xfs_buf_free_pages(
-       xfs_buf_t       *bp)
-{
-       if (bp->b_pages != bp->b_page_array) {
-               kmem_free(bp->b_pages);
-               bp->b_pages = NULL;
-       }
-}
-
-/*
- *     Releases the specified buffer.
- *
- *     The modification state of any associated pages is left unchanged.
- *     The buffer most not be on any hash - use xfs_buf_rele instead for
- *     hashed and refcounted buffers
- */
-void
-xfs_buf_free(
-       xfs_buf_t               *bp)
-{
-       trace_xfs_buf_free(bp, _RET_IP_);
-
-       ASSERT(list_empty(&bp->b_lru));
-
-       if (bp->b_flags & _XBF_PAGES) {
-               uint            i;
-
-               if (xfs_buf_is_vmapped(bp))
-                       vm_unmap_ram(bp->b_addr - bp->b_offset,
-                                       bp->b_page_count);
-
-               for (i = 0; i < bp->b_page_count; i++) {
-                       struct page     *page = bp->b_pages[i];
-
-                       __free_page(page);
-               }
-       } else if (bp->b_flags & _XBF_KMEM)
-               kmem_free(bp->b_addr);
-       _xfs_buf_free_pages(bp);
-       xfs_buf_deallocate(bp);
-}
-
-/*
- * Allocates all the pages for buffer in question and builds it's page list.
- */
-STATIC int
-xfs_buf_allocate_memory(
-       xfs_buf_t               *bp,
-       uint                    flags)
-{
-       size_t                  size = bp->b_count_desired;
-       size_t                  nbytes, offset;
-       gfp_t                   gfp_mask = xb_to_gfp(flags);
-       unsigned short          page_count, i;
-       xfs_off_t               end;
-       int                     error;
-
-       /*
-        * for buffers that are contained within a single page, just allocate
-        * the memory from the heap - there's no need for the complexity of
-        * page arrays to keep allocation down to order 0.
-        */
-       if (bp->b_buffer_length < PAGE_SIZE) {
-               bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
-               if (!bp->b_addr) {
-                       /* low memory - use alloc_page loop instead */
-                       goto use_alloc_page;
-               }
-
-               if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
-                                                               PAGE_MASK) !=
-                   ((unsigned long)bp->b_addr & PAGE_MASK)) {
-                       /* b_addr spans two pages - use alloc_page instead */
-                       kmem_free(bp->b_addr);
-                       bp->b_addr = NULL;
-                       goto use_alloc_page;
-               }
-               bp->b_offset = offset_in_page(bp->b_addr);
-               bp->b_pages = bp->b_page_array;
-               bp->b_pages[0] = virt_to_page(bp->b_addr);
-               bp->b_page_count = 1;
-               bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
-               return 0;
-       }
-
-use_alloc_page:
-       end = bp->b_file_offset + bp->b_buffer_length;
-       page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
-       error = _xfs_buf_get_pages(bp, page_count, flags);
-       if (unlikely(error))
-               return error;
-
-       offset = bp->b_offset;
-       bp->b_flags |= _XBF_PAGES;
-
-       for (i = 0; i < bp->b_page_count; i++) {
-               struct page     *page;
-               uint            retries = 0;
-retry:
-               page = alloc_page(gfp_mask);
-               if (unlikely(page == NULL)) {
-                       if (flags & XBF_READ_AHEAD) {
-                               bp->b_page_count = i;
-                               error = ENOMEM;
-                               goto out_free_pages;
-                       }
-
-                       /*
-                        * This could deadlock.
-                        *
-                        * But until all the XFS lowlevel code is revamped to
-                        * handle buffer allocation failures we can't do much.
-                        */
-                       if (!(++retries % 100))
-                               xfs_err(NULL,
-               "possible memory allocation deadlock in %s (mode:0x%x)",
-                                       __func__, gfp_mask);
-
-                       XFS_STATS_INC(xb_page_retries);
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
-                       goto retry;
-               }
-
-               XFS_STATS_INC(xb_page_found);
-
-               nbytes = min_t(size_t, size, PAGE_SIZE - offset);
-               size -= nbytes;
-               bp->b_pages[i] = page;
-               offset = 0;
-       }
-       return 0;
-
-out_free_pages:
-       for (i = 0; i < bp->b_page_count; i++)
-               __free_page(bp->b_pages[i]);
-       return error;
-}
-
-/*
- *     Map buffer into kernel address-space if necessary.
- */
-STATIC int
-_xfs_buf_map_pages(
-       xfs_buf_t               *bp,
-       uint                    flags)
-{
-       ASSERT(bp->b_flags & _XBF_PAGES);
-       if (bp->b_page_count == 1) {
-               /* A single page buffer is always mappable */
-               bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
-               bp->b_flags |= XBF_MAPPED;
-       } else if (flags & XBF_MAPPED) {
-               int retried = 0;
-
-               do {
-                       bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
-                                               -1, PAGE_KERNEL);
-                       if (bp->b_addr)
-                               break;
-                       vm_unmap_aliases();
-               } while (retried++ <= 1);
-
-               if (!bp->b_addr)
-                       return -ENOMEM;
-               bp->b_addr += bp->b_offset;
-               bp->b_flags |= XBF_MAPPED;
-       }
-
-       return 0;
-}
-
-/*
- *     Finding and Reading Buffers
- */
-
-/*
- *     Look up, and creates if absent, a lockable buffer for
- *     a given range of an inode.  The buffer is returned
- *     locked.  If other overlapping buffers exist, they are
- *     released before the new buffer is created and locked,
- *     which may imply that this call will block until those buffers
- *     are unlocked.  No I/O is implied by this call.
- */
-xfs_buf_t *
-_xfs_buf_find(
-       xfs_buftarg_t           *btp,   /* block device target          */
-       xfs_off_t               ioff,   /* starting offset of range     */
-       size_t                  isize,  /* length of range              */
-       xfs_buf_flags_t         flags,
-       xfs_buf_t               *new_bp)
-{
-       xfs_off_t               range_base;
-       size_t                  range_length;
-       struct xfs_perag        *pag;
-       struct rb_node          **rbp;
-       struct rb_node          *parent;
-       xfs_buf_t               *bp;
-
-       range_base = (ioff << BBSHIFT);
-       range_length = (isize << BBSHIFT);
-
-       /* Check for IOs smaller than the sector size / not sector aligned */
-       ASSERT(!(range_length < (1 << btp->bt_sshift)));
-       ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
-
-       /* get tree root */
-       pag = xfs_perag_get(btp->bt_mount,
-                               xfs_daddr_to_agno(btp->bt_mount, ioff));
-
-       /* walk tree */
-       spin_lock(&pag->pag_buf_lock);
-       rbp = &pag->pag_buf_tree.rb_node;
-       parent = NULL;
-       bp = NULL;
-       while (*rbp) {
-               parent = *rbp;
-               bp = rb_entry(parent, struct xfs_buf, b_rbnode);
-
-               if (range_base < bp->b_file_offset)
-                       rbp = &(*rbp)->rb_left;
-               else if (range_base > bp->b_file_offset)
-                       rbp = &(*rbp)->rb_right;
-               else {
-                       /*
-                        * found a block offset match. If the range doesn't
-                        * match, the only way this is allowed is if the buffer
-                        * in the cache is stale and the transaction that made
-                        * it stale has not yet committed. i.e. we are
-                        * reallocating a busy extent. Skip this buffer and
-                        * continue searching to the right for an exact match.
-                        */
-                       if (bp->b_buffer_length != range_length) {
-                               ASSERT(bp->b_flags & XBF_STALE);
-                               rbp = &(*rbp)->rb_right;
-                               continue;
-                       }
-                       atomic_inc(&bp->b_hold);
-                       goto found;
-               }
-       }
-
-       /* No match found */
-       if (new_bp) {
-               _xfs_buf_initialize(new_bp, btp, range_base,
-                               range_length, flags);
-               rb_link_node(&new_bp->b_rbnode, parent, rbp);
-               rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
-               /* the buffer keeps the perag reference until it is freed */
-               new_bp->b_pag = pag;
-               spin_unlock(&pag->pag_buf_lock);
-       } else {
-               XFS_STATS_INC(xb_miss_locked);
-               spin_unlock(&pag->pag_buf_lock);
-               xfs_perag_put(pag);
-       }
-       return new_bp;
-
-found:
-       spin_unlock(&pag->pag_buf_lock);
-       xfs_perag_put(pag);
-
-       if (!xfs_buf_trylock(bp)) {
-               if (flags & XBF_TRYLOCK) {
-                       xfs_buf_rele(bp);
-                       XFS_STATS_INC(xb_busy_locked);
-                       return NULL;
-               }
-               xfs_buf_lock(bp);
-               XFS_STATS_INC(xb_get_locked_waited);
-       }
-
-       /*
-        * if the buffer is stale, clear all the external state associated with
-        * it. We need to keep flags such as how we allocated the buffer memory
-        * intact here.
-        */
-       if (bp->b_flags & XBF_STALE) {
-               ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
-               bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
-       }
-
-       trace_xfs_buf_find(bp, flags, _RET_IP_);
-       XFS_STATS_INC(xb_get_locked);
-       return bp;
-}
-
-/*
- *     Assembles a buffer covering the specified range.
- *     Storage in memory for all portions of the buffer will be allocated,
- *     although backing storage may not be.
- */
-xfs_buf_t *
-xfs_buf_get(
-       xfs_buftarg_t           *target,/* target for buffer            */
-       xfs_off_t               ioff,   /* starting offset of range     */
-       size_t                  isize,  /* length of range              */
-       xfs_buf_flags_t         flags)
-{
-       xfs_buf_t               *bp, *new_bp;
-       int                     error = 0;
-
-       new_bp = xfs_buf_allocate(flags);
-       if (unlikely(!new_bp))
-               return NULL;
-
-       bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
-       if (bp == new_bp) {
-               error = xfs_buf_allocate_memory(bp, flags);
-               if (error)
-                       goto no_buffer;
-       } else {
-               xfs_buf_deallocate(new_bp);
-               if (unlikely(bp == NULL))
-                       return NULL;
-       }
-
-       if (!(bp->b_flags & XBF_MAPPED)) {
-               error = _xfs_buf_map_pages(bp, flags);
-               if (unlikely(error)) {
-                       xfs_warn(target->bt_mount,
-                               "%s: failed to map pages\n", __func__);
-                       goto no_buffer;
-               }
-       }
-
-       XFS_STATS_INC(xb_get);
-
-       /*
-        * Always fill in the block number now, the mapped cases can do
-        * their own overlay of this later.
-        */
-       bp->b_bn = ioff;
-       bp->b_count_desired = bp->b_buffer_length;
-
-       trace_xfs_buf_get(bp, flags, _RET_IP_);
-       return bp;
-
- no_buffer:
-       if (flags & (XBF_LOCK | XBF_TRYLOCK))
-               xfs_buf_unlock(bp);
-       xfs_buf_rele(bp);
-       return NULL;
-}
-
-STATIC int
-_xfs_buf_read(
-       xfs_buf_t               *bp,
-       xfs_buf_flags_t         flags)
-{
-       int                     status;
-
-       ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
-       ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
-
-       bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD);
-       bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
-
-       status = xfs_buf_iorequest(bp);
-       if (status || bp->b_error || (flags & XBF_ASYNC))
-               return status;
-       return xfs_buf_iowait(bp);
-}
-
-xfs_buf_t *
-xfs_buf_read(
-       xfs_buftarg_t           *target,
-       xfs_off_t               ioff,
-       size_t                  isize,
-       xfs_buf_flags_t         flags)
-{
-       xfs_buf_t               *bp;
-
-       flags |= XBF_READ;
-
-       bp = xfs_buf_get(target, ioff, isize, flags);
-       if (bp) {
-               trace_xfs_buf_read(bp, flags, _RET_IP_);
-
-               if (!XFS_BUF_ISDONE(bp)) {
-                       XFS_STATS_INC(xb_get_read);
-                       _xfs_buf_read(bp, flags);
-               } else if (flags & XBF_ASYNC) {
-                       /*
-                        * Read ahead call which is already satisfied,
-                        * drop the buffer
-                        */
-                       goto no_buffer;
-               } else {
-                       /* We do not want read in the flags */
-                       bp->b_flags &= ~XBF_READ;
-               }
-       }
-
-       return bp;
-
- no_buffer:
-       if (flags & (XBF_LOCK | XBF_TRYLOCK))
-               xfs_buf_unlock(bp);
-       xfs_buf_rele(bp);
-       return NULL;
-}
-
-/*
- *     If we are not low on memory then do the readahead in a deadlock
- *     safe manner.
- */
-void
-xfs_buf_readahead(
-       xfs_buftarg_t           *target,
-       xfs_off_t               ioff,
-       size_t                  isize)
-{
-       if (bdi_read_congested(target->bt_bdi))
-               return;
-
-       xfs_buf_read(target, ioff, isize,
-                    XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
-}
-
-/*
- * Read an uncached buffer from disk. Allocates and returns a locked
- * buffer containing the disk contents or nothing.
- */
-struct xfs_buf *
-xfs_buf_read_uncached(
-       struct xfs_mount        *mp,
-       struct xfs_buftarg      *target,
-       xfs_daddr_t             daddr,
-       size_t                  length,
-       int                     flags)
-{
-       xfs_buf_t               *bp;
-       int                     error;
-
-       bp = xfs_buf_get_uncached(target, length, flags);
-       if (!bp)
-               return NULL;
-
-       /* set up the buffer for a read IO */
-       XFS_BUF_SET_ADDR(bp, daddr);
-       XFS_BUF_READ(bp);
-
-       xfsbdstrat(mp, bp);
-       error = xfs_buf_iowait(bp);
-       if (error || bp->b_error) {
-               xfs_buf_relse(bp);
-               return NULL;
-       }
-       return bp;
-}
-
-xfs_buf_t *
-xfs_buf_get_empty(
-       size_t                  len,
-       xfs_buftarg_t           *target)
-{
-       xfs_buf_t               *bp;
-
-       bp = xfs_buf_allocate(0);
-       if (bp)
-               _xfs_buf_initialize(bp, target, 0, len, 0);
-       return bp;
-}
-
-/*
- * Return a buffer allocated as an empty buffer and associated to external
- * memory via xfs_buf_associate_memory() back to it's empty state.
- */
-void
-xfs_buf_set_empty(
-       struct xfs_buf          *bp,
-       size_t                  len)
-{
-       if (bp->b_pages)
-               _xfs_buf_free_pages(bp);
-
-       bp->b_pages = NULL;
-       bp->b_page_count = 0;
-       bp->b_addr = NULL;
-       bp->b_file_offset = 0;
-       bp->b_buffer_length = bp->b_count_desired = len;
-       bp->b_bn = XFS_BUF_DADDR_NULL;
-       bp->b_flags &= ~XBF_MAPPED;
-}
-
-static inline struct page *
-mem_to_page(
-       void                    *addr)
-{
-       if ((!is_vmalloc_addr(addr))) {
-               return virt_to_page(addr);
-       } else {
-               return vmalloc_to_page(addr);
-       }
-}
-
-int
-xfs_buf_associate_memory(
-       xfs_buf_t               *bp,
-       void                    *mem,
-       size_t                  len)
-{
-       int                     rval;
-       int                     i = 0;
-       unsigned long           pageaddr;
-       unsigned long           offset;
-       size_t                  buflen;
-       int                     page_count;
-
-       pageaddr = (unsigned long)mem & PAGE_MASK;
-       offset = (unsigned long)mem - pageaddr;
-       buflen = PAGE_ALIGN(len + offset);
-       page_count = buflen >> PAGE_SHIFT;
-
-       /* Free any previous set of page pointers */
-       if (bp->b_pages)
-               _xfs_buf_free_pages(bp);
-
-       bp->b_pages = NULL;
-       bp->b_addr = mem;
-
-       rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
-       if (rval)
-               return rval;
-
-       bp->b_offset = offset;
-
-       for (i = 0; i < bp->b_page_count; i++) {
-               bp->b_pages[i] = mem_to_page((void *)pageaddr);
-               pageaddr += PAGE_SIZE;
-       }
-
-       bp->b_count_desired = len;
-       bp->b_buffer_length = buflen;
-       bp->b_flags |= XBF_MAPPED;
-
-       return 0;
-}
-
-xfs_buf_t *
-xfs_buf_get_uncached(
-       struct xfs_buftarg      *target,
-       size_t                  len,
-       int                     flags)
-{
-       unsigned long           page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
-       int                     error, i;
-       xfs_buf_t               *bp;
-
-       bp = xfs_buf_allocate(0);
-       if (unlikely(bp == NULL))
-               goto fail;
-       _xfs_buf_initialize(bp, target, 0, len, 0);
-
-       error = _xfs_buf_get_pages(bp, page_count, 0);
-       if (error)
-               goto fail_free_buf;
-
-       for (i = 0; i < page_count; i++) {
-               bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
-               if (!bp->b_pages[i])
-                       goto fail_free_mem;
-       }
-       bp->b_flags |= _XBF_PAGES;
-
-       error = _xfs_buf_map_pages(bp, XBF_MAPPED);
-       if (unlikely(error)) {
-               xfs_warn(target->bt_mount,
-                       "%s: failed to map pages\n", __func__);
-               goto fail_free_mem;
-       }
-
-       trace_xfs_buf_get_uncached(bp, _RET_IP_);
-       return bp;
-
- fail_free_mem:
-       while (--i >= 0)
-               __free_page(bp->b_pages[i]);
-       _xfs_buf_free_pages(bp);
- fail_free_buf:
-       xfs_buf_deallocate(bp);
- fail:
-       return NULL;
-}
-
-/*
- *     Increment reference count on buffer, to hold the buffer concurrently
- *     with another thread which may release (free) the buffer asynchronously.
- *     Must hold the buffer already to call this function.
- */
-void
-xfs_buf_hold(
-       xfs_buf_t               *bp)
-{
-       trace_xfs_buf_hold(bp, _RET_IP_);
-       atomic_inc(&bp->b_hold);
-}
-
-/*
- *     Releases a hold on the specified buffer.  If the
- *     the hold count is 1, calls xfs_buf_free.
- */
-void
-xfs_buf_rele(
-       xfs_buf_t               *bp)
-{
-       struct xfs_perag        *pag = bp->b_pag;
-
-       trace_xfs_buf_rele(bp, _RET_IP_);
-
-       if (!pag) {
-               ASSERT(list_empty(&bp->b_lru));
-               ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
-               if (atomic_dec_and_test(&bp->b_hold))
-                       xfs_buf_free(bp);
-               return;
-       }
-
-       ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
-
-       ASSERT(atomic_read(&bp->b_hold) > 0);
-       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-               if (!(bp->b_flags & XBF_STALE) &&
-                          atomic_read(&bp->b_lru_ref)) {
-                       xfs_buf_lru_add(bp);
-                       spin_unlock(&pag->pag_buf_lock);
-               } else {
-                       xfs_buf_lru_del(bp);
-                       ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
-                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
-                       spin_unlock(&pag->pag_buf_lock);
-                       xfs_perag_put(pag);
-                       xfs_buf_free(bp);
-               }
-       }
-}
-
-
-/*
- *     Lock a buffer object, if it is not already locked.
- *
- *     If we come across a stale, pinned, locked buffer, we know that we are
- *     being asked to lock a buffer that has been reallocated. Because it is
- *     pinned, we know that the log has not been pushed to disk and hence it
- *     will still be locked.  Rather than continuing to have trylock attempts
- *     fail until someone else pushes the log, push it ourselves before
- *     returning.  This means that the xfsaild will not get stuck trying
- *     to push on stale inode buffers.
- */
-int
-xfs_buf_trylock(
-       struct xfs_buf          *bp)
-{
-       int                     locked;
-
-       locked = down_trylock(&bp->b_sema) == 0;
-       if (locked)
-               XB_SET_OWNER(bp);
-       else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
-               xfs_log_force(bp->b_target->bt_mount, 0);
-
-       trace_xfs_buf_trylock(bp, _RET_IP_);
-       return locked;
-}
-
-/*
- *     Lock a buffer object.
- *
- *     If we come across a stale, pinned, locked buffer, we know that we
- *     are being asked to lock a buffer that has been reallocated. Because
- *     it is pinned, we know that the log has not been pushed to disk and
- *     hence it will still be locked. Rather than sleeping until someone
- *     else pushes the log, push it ourselves before trying to get the lock.
- */
-void
-xfs_buf_lock(
-       struct xfs_buf          *bp)
-{
-       trace_xfs_buf_lock(bp, _RET_IP_);
-
-       if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
-               xfs_log_force(bp->b_target->bt_mount, 0);
-       down(&bp->b_sema);
-       XB_SET_OWNER(bp);
-
-       trace_xfs_buf_lock_done(bp, _RET_IP_);
-}
-
-/*
- *     Releases the lock on the buffer object.
- *     If the buffer is marked delwri but is not queued, do so before we
- *     unlock the buffer as we need to set flags correctly.  We also need to
- *     take a reference for the delwri queue because the unlocker is going to
- *     drop their's and they don't know we just queued it.
- */
-void
-xfs_buf_unlock(
-       struct xfs_buf          *bp)
-{
-       if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
-               atomic_inc(&bp->b_hold);
-               bp->b_flags |= XBF_ASYNC;
-               xfs_buf_delwri_queue(bp, 0);
-       }
-
-       XB_CLEAR_OWNER(bp);
-       up(&bp->b_sema);
-
-       trace_xfs_buf_unlock(bp, _RET_IP_);
-}
-
-STATIC void
-xfs_buf_wait_unpin(
-       xfs_buf_t               *bp)
-{
-       DECLARE_WAITQUEUE       (wait, current);
-
-       if (atomic_read(&bp->b_pin_count) == 0)
-               return;
-
-       add_wait_queue(&bp->b_waiters, &wait);
-       for (;;) {
-               set_current_state(TASK_UNINTERRUPTIBLE);
-               if (atomic_read(&bp->b_pin_count) == 0)
-                       break;
-               io_schedule();
-       }
-       remove_wait_queue(&bp->b_waiters, &wait);
-       set_current_state(TASK_RUNNING);
-}
-
-/*
- *     Buffer Utility Routines
- */
-
-STATIC void
-xfs_buf_iodone_work(
-       struct work_struct      *work)
-{
-       xfs_buf_t               *bp =
-               container_of(work, xfs_buf_t, b_iodone_work);
-
-       if (bp->b_iodone)
-               (*(bp->b_iodone))(bp);
-       else if (bp->b_flags & XBF_ASYNC)
-               xfs_buf_relse(bp);
-}
-
-void
-xfs_buf_ioend(
-       xfs_buf_t               *bp,
-       int                     schedule)
-{
-       trace_xfs_buf_iodone(bp, _RET_IP_);
-
-       bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
-       if (bp->b_error == 0)
-               bp->b_flags |= XBF_DONE;
-
-       if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
-               if (schedule) {
-                       INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
-                       queue_work(xfslogd_workqueue, &bp->b_iodone_work);
-               } else {
-                       xfs_buf_iodone_work(&bp->b_iodone_work);
-               }
-       } else {
-               complete(&bp->b_iowait);
-       }
-}
-
-void
-xfs_buf_ioerror(
-       xfs_buf_t               *bp,
-       int                     error)
-{
-       ASSERT(error >= 0 && error <= 0xffff);
-       bp->b_error = (unsigned short)error;
-       trace_xfs_buf_ioerror(bp, error, _RET_IP_);
-}
-
-int
-xfs_bwrite(
-       struct xfs_mount        *mp,
-       struct xfs_buf          *bp)
-{
-       int                     error;
-
-       bp->b_flags |= XBF_WRITE;
-       bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
-
-       xfs_buf_delwri_dequeue(bp);
-       xfs_bdstrat_cb(bp);
-
-       error = xfs_buf_iowait(bp);
-       if (error)
-               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
-       xfs_buf_relse(bp);
-       return error;
-}
-
-void
-xfs_bdwrite(
-       void                    *mp,
-       struct xfs_buf          *bp)
-{
-       trace_xfs_buf_bdwrite(bp, _RET_IP_);
-
-       bp->b_flags &= ~XBF_READ;
-       bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
-
-       xfs_buf_delwri_queue(bp, 1);
-}
-
-/*
- * Called when we want to stop a buffer from getting written or read.
- * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
- * so that the proper iodone callbacks get called.
- */
-STATIC int
-xfs_bioerror(
-       xfs_buf_t *bp)
-{
-#ifdef XFSERRORDEBUG
-       ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
-#endif
-
-       /*
-        * No need to wait until the buffer is unpinned, we aren't flushing it.
-        */
-       xfs_buf_ioerror(bp, EIO);
-
-       /*
-        * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
-        */
-       XFS_BUF_UNREAD(bp);
-       XFS_BUF_UNDELAYWRITE(bp);
-       XFS_BUF_UNDONE(bp);
-       XFS_BUF_STALE(bp);
-
-       xfs_buf_ioend(bp, 0);
-
-       return EIO;
-}
-
-/*
- * Same as xfs_bioerror, except that we are releasing the buffer
- * here ourselves, and avoiding the xfs_buf_ioend call.
- * This is meant for userdata errors; metadata bufs come with
- * iodone functions attached, so that we can track down errors.
- */
-STATIC int
-xfs_bioerror_relse(
-       struct xfs_buf  *bp)
-{
-       int64_t         fl = bp->b_flags;
-       /*
-        * No need to wait until the buffer is unpinned.
-        * We aren't flushing it.
-        *
-        * chunkhold expects B_DONE to be set, whether
-        * we actually finish the I/O or not. We don't want to
-        * change that interface.
-        */
-       XFS_BUF_UNREAD(bp);
-       XFS_BUF_UNDELAYWRITE(bp);
-       XFS_BUF_DONE(bp);
-       XFS_BUF_STALE(bp);
-       bp->b_iodone = NULL;
-       if (!(fl & XBF_ASYNC)) {
-               /*
-                * Mark b_error and B_ERROR _both_.
-                * Lot's of chunkcache code assumes that.
-                * There's no reason to mark error for
-                * ASYNC buffers.
-                */
-               xfs_buf_ioerror(bp, EIO);
-               XFS_BUF_FINISH_IOWAIT(bp);
-       } else {
-               xfs_buf_relse(bp);
-       }
-
-       return EIO;
-}
-
-
-/*
- * All xfs metadata buffers except log state machine buffers
- * get this attached as their b_bdstrat callback function.
- * This is so that we can catch a buffer
- * after prematurely unpinning it to forcibly shutdown the filesystem.
- */
-int
-xfs_bdstrat_cb(
-       struct xfs_buf  *bp)
-{
-       if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
-               trace_xfs_bdstrat_shut(bp, _RET_IP_);
-               /*
-                * Metadata write that didn't get logged but
-                * written delayed anyway. These aren't associated
-                * with a transaction, and can be ignored.
-                */
-               if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
-                       return xfs_bioerror_relse(bp);
-               else
-                       return xfs_bioerror(bp);
-       }
-
-       xfs_buf_iorequest(bp);
-       return 0;
-}
-
-/*
- * Wrapper around bdstrat so that we can stop data from going to disk in case
- * we are shutting down the filesystem.  Typically user data goes thru this
- * path; one of the exceptions is the superblock.
- */
-void
-xfsbdstrat(
-       struct xfs_mount        *mp,
-       struct xfs_buf          *bp)
-{
-       if (XFS_FORCED_SHUTDOWN(mp)) {
-               trace_xfs_bdstrat_shut(bp, _RET_IP_);
-               xfs_bioerror_relse(bp);
-               return;
-       }
-
-       xfs_buf_iorequest(bp);
-}
-
-STATIC void
-_xfs_buf_ioend(
-       xfs_buf_t               *bp,
-       int                     schedule)
-{
-       if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
-               xfs_buf_ioend(bp, schedule);
-}
-
-STATIC void
-xfs_buf_bio_end_io(
-       struct bio              *bio,
-       int                     error)
-{
-       xfs_buf_t               *bp = (xfs_buf_t *)bio->bi_private;
-
-       xfs_buf_ioerror(bp, -error);
-
-       if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
-               invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
-
-       _xfs_buf_ioend(bp, 1);
-       bio_put(bio);
-}
-
-STATIC void
-_xfs_buf_ioapply(
-       xfs_buf_t               *bp)
-{
-       int                     rw, map_i, total_nr_pages, nr_pages;
-       struct bio              *bio;
-       int                     offset = bp->b_offset;
-       int                     size = bp->b_count_desired;
-       sector_t                sector = bp->b_bn;
-
-       total_nr_pages = bp->b_page_count;
-       map_i = 0;
-
-       if (bp->b_flags & XBF_WRITE) {
-               if (bp->b_flags & XBF_SYNCIO)
-                       rw = WRITE_SYNC;
-               else
-                       rw = WRITE;
-               if (bp->b_flags & XBF_FUA)
-                       rw |= REQ_FUA;
-               if (bp->b_flags & XBF_FLUSH)
-                       rw |= REQ_FLUSH;
-       } else if (bp->b_flags & XBF_READ_AHEAD) {
-               rw = READA;
-       } else {
-               rw = READ;
-       }
-
-       /* we only use the buffer cache for meta-data */
-       rw |= REQ_META;
-
-next_chunk:
-       atomic_inc(&bp->b_io_remaining);
-       nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
-       if (nr_pages > total_nr_pages)
-               nr_pages = total_nr_pages;
-
-       bio = bio_alloc(GFP_NOIO, nr_pages);
-       bio->bi_bdev = bp->b_target->bt_bdev;
-       bio->bi_sector = sector;
-       bio->bi_end_io = xfs_buf_bio_end_io;
-       bio->bi_private = bp;
-
-
-       for (; size && nr_pages; nr_pages--, map_i++) {
-               int     rbytes, nbytes = PAGE_SIZE - offset;
-
-               if (nbytes > size)
-                       nbytes = size;
-
-               rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
-               if (rbytes < nbytes)
-                       break;
-
-               offset = 0;
-               sector += nbytes >> BBSHIFT;
-               size -= nbytes;
-               total_nr_pages--;
-       }
-
-       if (likely(bio->bi_size)) {
-               if (xfs_buf_is_vmapped(bp)) {
-                       flush_kernel_vmap_range(bp->b_addr,
-                                               xfs_buf_vmap_len(bp));
-               }
-               submit_bio(rw, bio);
-               if (size)
-                       goto next_chunk;
-       } else {
-               xfs_buf_ioerror(bp, EIO);
-               bio_put(bio);
-       }
-}
-
-int
-xfs_buf_iorequest(
-       xfs_buf_t               *bp)
-{
-       trace_xfs_buf_iorequest(bp, _RET_IP_);
-
-       if (bp->b_flags & XBF_DELWRI) {
-               xfs_buf_delwri_queue(bp, 1);
-               return 0;
-       }
-
-       if (bp->b_flags & XBF_WRITE) {
-               xfs_buf_wait_unpin(bp);
-       }
-
-       xfs_buf_hold(bp);
-
-       /* Set the count to 1 initially, this will stop an I/O
-        * completion callout which happens before we have started
-        * all the I/O from calling xfs_buf_ioend too early.
-        */
-       atomic_set(&bp->b_io_remaining, 1);
-       _xfs_buf_ioapply(bp);
-       _xfs_buf_ioend(bp, 0);
-
-       xfs_buf_rele(bp);
-       return 0;
-}
-
-/*
- *     Waits for I/O to complete on the buffer supplied.
- *     It returns immediately if no I/O is pending.
- *     It returns the I/O error code, if any, or 0 if there was no error.
- */
-int
-xfs_buf_iowait(
-       xfs_buf_t               *bp)
-{
-       trace_xfs_buf_iowait(bp, _RET_IP_);
-
-       wait_for_completion(&bp->b_iowait);
-
-       trace_xfs_buf_iowait_done(bp, _RET_IP_);
-       return bp->b_error;
-}
-
-xfs_caddr_t
-xfs_buf_offset(
-       xfs_buf_t               *bp,
-       size_t                  offset)
-{
-       struct page             *page;
-
-       if (bp->b_flags & XBF_MAPPED)
-               return bp->b_addr + offset;
-
-       offset += bp->b_offset;
-       page = bp->b_pages[offset >> PAGE_SHIFT];
-       return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
-}
-
-/*
- *     Move data into or out of a buffer.
- */
-void
-xfs_buf_iomove(
-       xfs_buf_t               *bp,    /* buffer to process            */
-       size_t                  boff,   /* starting buffer offset       */
-       size_t                  bsize,  /* length to copy               */
-       void                    *data,  /* data address                 */
-       xfs_buf_rw_t            mode)   /* read/write/zero flag         */
-{
-       size_t                  bend, cpoff, csize;
-       struct page             *page;
-
-       bend = boff + bsize;
-       while (boff < bend) {
-               page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
-               cpoff = xfs_buf_poff(boff + bp->b_offset);
-               csize = min_t(size_t,
-                             PAGE_SIZE-cpoff, bp->b_count_desired-boff);
-
-               ASSERT(((csize + cpoff) <= PAGE_SIZE));
-
-               switch (mode) {
-               case XBRW_ZERO:
-                       memset(page_address(page) + cpoff, 0, csize);
-                       break;
-               case XBRW_READ:
-                       memcpy(data, page_address(page) + cpoff, csize);
-                       break;
-               case XBRW_WRITE:
-                       memcpy(page_address(page) + cpoff, data, csize);
-               }
-
-               boff += csize;
-               data += csize;
-       }
-}
-
-/*
- *     Handling of buffer targets (buftargs).
- */
-
-/*
- * Wait for any bufs with callbacks that have been submitted but have not yet
- * returned. These buffers will have an elevated hold count, so wait on those
- * while freeing all the buffers only held by the LRU.
- */
-void
-xfs_wait_buftarg(
-       struct xfs_buftarg      *btp)
-{
-       struct xfs_buf          *bp;
-
-restart:
-       spin_lock(&btp->bt_lru_lock);
-       while (!list_empty(&btp->bt_lru)) {
-               bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-               if (atomic_read(&bp->b_hold) > 1) {
-                       spin_unlock(&btp->bt_lru_lock);
-                       delay(100);
-                       goto restart;
-               }
-               /*
-                * clear the LRU reference count so the bufer doesn't get
-                * ignored in xfs_buf_rele().
-                */
-               atomic_set(&bp->b_lru_ref, 0);
-               spin_unlock(&btp->bt_lru_lock);
-               xfs_buf_rele(bp);
-               spin_lock(&btp->bt_lru_lock);
-       }
-       spin_unlock(&btp->bt_lru_lock);
-}
-
-int
-xfs_buftarg_shrink(
-       struct shrinker         *shrink,
-       struct shrink_control   *sc)
-{
-       struct xfs_buftarg      *btp = container_of(shrink,
-                                       struct xfs_buftarg, bt_shrinker);
-       struct xfs_buf          *bp;
-       int nr_to_scan = sc->nr_to_scan;
-       LIST_HEAD(dispose);
-
-       if (!nr_to_scan)
-               return btp->bt_lru_nr;
-
-       spin_lock(&btp->bt_lru_lock);
-       while (!list_empty(&btp->bt_lru)) {
-               if (nr_to_scan-- <= 0)
-                       break;
-
-               bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-
-               /*
-                * Decrement the b_lru_ref count unless the value is already
-                * zero. If the value is already zero, we need to reclaim the
-                * buffer, otherwise it gets another trip through the LRU.
-                */
-               if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
-                       list_move_tail(&bp->b_lru, &btp->bt_lru);
-                       continue;
-               }
-
-               /*
-                * remove the buffer from the LRU now to avoid needing another
-                * lock round trip inside xfs_buf_rele().
-                */
-               list_move(&bp->b_lru, &dispose);
-               btp->bt_lru_nr--;
-       }
-       spin_unlock(&btp->bt_lru_lock);
-
-       while (!list_empty(&dispose)) {
-               bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
-               list_del_init(&bp->b_lru);
-               xfs_buf_rele(bp);
-       }
-
-       return btp->bt_lru_nr;
-}
-
-void
-xfs_free_buftarg(
-       struct xfs_mount        *mp,
-       struct xfs_buftarg      *btp)
-{
-       unregister_shrinker(&btp->bt_shrinker);
-
-       xfs_flush_buftarg(btp, 1);
-       if (mp->m_flags & XFS_MOUNT_BARRIER)
-               xfs_blkdev_issue_flush(btp);
-
-       kthread_stop(btp->bt_task);
-       kmem_free(btp);
-}
-
-STATIC int
-xfs_setsize_buftarg_flags(
-       xfs_buftarg_t           *btp,
-       unsigned int            blocksize,
-       unsigned int            sectorsize,
-       int                     verbose)
-{
-       btp->bt_bsize = blocksize;
-       btp->bt_sshift = ffs(sectorsize) - 1;
-       btp->bt_smask = sectorsize - 1;
-
-       if (set_blocksize(btp->bt_bdev, sectorsize)) {
-               xfs_warn(btp->bt_mount,
-                       "Cannot set_blocksize to %u on device %s\n",
-                       sectorsize, xfs_buf_target_name(btp));
-               return EINVAL;
-       }
-
-       return 0;
-}
-
-/*
- *     When allocating the initial buffer target we have not yet
- *     read in the superblock, so don't know what sized sectors
- *     are being used is at this early stage.  Play safe.
- */
-STATIC int
-xfs_setsize_buftarg_early(
-       xfs_buftarg_t           *btp,
-       struct block_device     *bdev)
-{
-       return xfs_setsize_buftarg_flags(btp,
-                       PAGE_SIZE, bdev_logical_block_size(bdev), 0);
-}
-
-int
-xfs_setsize_buftarg(
-       xfs_buftarg_t           *btp,
-       unsigned int            blocksize,
-       unsigned int            sectorsize)
-{
-       return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
-}
-
-STATIC int
-xfs_alloc_delwrite_queue(
-       xfs_buftarg_t           *btp,
-       const char              *fsname)
-{
-       INIT_LIST_HEAD(&btp->bt_delwrite_queue);
-       spin_lock_init(&btp->bt_delwrite_lock);
-       btp->bt_flags = 0;
-       btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
-       if (IS_ERR(btp->bt_task))
-               return PTR_ERR(btp->bt_task);
-       return 0;
-}
-
-xfs_buftarg_t *
-xfs_alloc_buftarg(
-       struct xfs_mount        *mp,
-       struct block_device     *bdev,
-       int                     external,
-       const char              *fsname)
-{
-       xfs_buftarg_t           *btp;
-
-       btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
-
-       btp->bt_mount = mp;
-       btp->bt_dev =  bdev->bd_dev;
-       btp->bt_bdev = bdev;
-       btp->bt_bdi = blk_get_backing_dev_info(bdev);
-       if (!btp->bt_bdi)
-               goto error;
-
-       INIT_LIST_HEAD(&btp->bt_lru);
-       spin_lock_init(&btp->bt_lru_lock);
-       if (xfs_setsize_buftarg_early(btp, bdev))
-               goto error;
-       if (xfs_alloc_delwrite_queue(btp, fsname))
-               goto error;
-       btp->bt_shrinker.shrink = xfs_buftarg_shrink;
-       btp->bt_shrinker.seeks = DEFAULT_SEEKS;
-       register_shrinker(&btp->bt_shrinker);
-       return btp;
-
-error:
-       kmem_free(btp);
-       return NULL;
-}
-
-
-/*
- *     Delayed write buffer handling
- */
-STATIC void
-xfs_buf_delwri_queue(
-       xfs_buf_t               *bp,
-       int                     unlock)
-{
-       struct list_head        *dwq = &bp->b_target->bt_delwrite_queue;
-       spinlock_t              *dwlk = &bp->b_target->bt_delwrite_lock;
-
-       trace_xfs_buf_delwri_queue(bp, _RET_IP_);
-
-       ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
-
-       spin_lock(dwlk);
-       /* If already in the queue, dequeue and place at tail */
-       if (!list_empty(&bp->b_list)) {
-               ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-               if (unlock)
-                       atomic_dec(&bp->b_hold);
-               list_del(&bp->b_list);
-       }
-
-       if (list_empty(dwq)) {
-               /* start xfsbufd as it is about to have something to do */
-               wake_up_process(bp->b_target->bt_task);
-       }
-
-       bp->b_flags |= _XBF_DELWRI_Q;
-       list_add_tail(&bp->b_list, dwq);
-       bp->b_queuetime = jiffies;
-       spin_unlock(dwlk);
-
-       if (unlock)
-               xfs_buf_unlock(bp);
-}
-
-void
-xfs_buf_delwri_dequeue(
-       xfs_buf_t               *bp)
-{
-       spinlock_t              *dwlk = &bp->b_target->bt_delwrite_lock;
-       int                     dequeued = 0;
-
-       spin_lock(dwlk);
-       if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
-               ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-               list_del_init(&bp->b_list);
-               dequeued = 1;
-       }
-       bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-       spin_unlock(dwlk);
-
-       if (dequeued)
-               xfs_buf_rele(bp);
-
-       trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
-}
-
-/*
- * If a delwri buffer needs to be pushed before it has aged out, then promote
- * it to the head of the delwri queue so that it will be flushed on the next
- * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
- * than the age currently needed to flush the buffer. Hence the next time the
- * xfsbufd sees it is guaranteed to be considered old enough to flush.
- */
-void
-xfs_buf_delwri_promote(
-       struct xfs_buf  *bp)
-{
-       struct xfs_buftarg *btp = bp->b_target;
-       long            age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
-
-       ASSERT(bp->b_flags & XBF_DELWRI);
-       ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-
-       /*
-        * Check the buffer age before locking the delayed write queue as we
-        * don't need to promote buffers that are already past the flush age.
-        */
-       if (bp->b_queuetime < jiffies - age)
-               return;
-       bp->b_queuetime = jiffies - age;
-       spin_lock(&btp->bt_delwrite_lock);
-       list_move(&bp->b_list, &btp->bt_delwrite_queue);
-       spin_unlock(&btp->bt_delwrite_lock);
-}
-
-STATIC void
-xfs_buf_runall_queues(
-       struct workqueue_struct *queue)
-{
-       flush_workqueue(queue);
-}
-
-/*
- * Move as many buffers as specified to the supplied list
- * idicating if we skipped any buffers to prevent deadlocks.
- */
-STATIC int
-xfs_buf_delwri_split(
-       xfs_buftarg_t   *target,
-       struct list_head *list,
-       unsigned long   age)
-{
-       xfs_buf_t       *bp, *n;
-       struct list_head *dwq = &target->bt_delwrite_queue;
-       spinlock_t      *dwlk = &target->bt_delwrite_lock;
-       int             skipped = 0;
-       int             force;
-
-       force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-       INIT_LIST_HEAD(list);
-       spin_lock(dwlk);
-       list_for_each_entry_safe(bp, n, dwq, b_list) {
-               ASSERT(bp->b_flags & XBF_DELWRI);
-
-               if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
-                       if (!force &&
-                           time_before(jiffies, bp->b_queuetime + age)) {
-                               xfs_buf_unlock(bp);
-                               break;
-                       }
-
-                       bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
-                       bp->b_flags |= XBF_WRITE;
-                       list_move_tail(&bp->b_list, list);
-                       trace_xfs_buf_delwri_split(bp, _RET_IP_);
-               } else
-                       skipped++;
-       }
-       spin_unlock(dwlk);
-
-       return skipped;
-
-}
-
-/*
- * Compare function is more complex than it needs to be because
- * the return value is only 32 bits and we are doing comparisons
- * on 64 bit values
- */
-static int
-xfs_buf_cmp(
-       void            *priv,
-       struct list_head *a,
-       struct list_head *b)
-{
-       struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
-       struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
-       xfs_daddr_t             diff;
-
-       diff = ap->b_bn - bp->b_bn;
-       if (diff < 0)
-               return -1;
-       if (diff > 0)
-               return 1;
-       return 0;
-}
-
-STATIC int
-xfsbufd(
-       void            *data)
-{
-       xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
-
-       current->flags |= PF_MEMALLOC;
-
-       set_freezable();
-
-       do {
-               long    age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
-               long    tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
-               struct list_head tmp;
-               struct blk_plug plug;
-
-               if (unlikely(freezing(current))) {
-                       set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
-                       refrigerator();
-               } else {
-                       clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
-               }
-
-               /* sleep for a long time if there is nothing to do. */
-               if (list_empty(&target->bt_delwrite_queue))
-                       tout = MAX_SCHEDULE_TIMEOUT;
-               schedule_timeout_interruptible(tout);
-
-               xfs_buf_delwri_split(target, &tmp, age);
-               list_sort(NULL, &tmp, xfs_buf_cmp);
-
-               blk_start_plug(&plug);
-               while (!list_empty(&tmp)) {
-                       struct xfs_buf *bp;
-                       bp = list_first_entry(&tmp, struct xfs_buf, b_list);
-                       list_del_init(&bp->b_list);
-                       xfs_bdstrat_cb(bp);
-               }
-               blk_finish_plug(&plug);
-       } while (!kthread_should_stop());
-
-       return 0;
-}
-
-/*
- *     Go through all incore buffers, and release buffers if they belong to
- *     the given device. This is used in filesystem error handling to
- *     preserve the consistency of its metadata.
- */
-int
-xfs_flush_buftarg(
-       xfs_buftarg_t   *target,
-       int             wait)
-{
-       xfs_buf_t       *bp;
-       int             pincount = 0;
-       LIST_HEAD(tmp_list);
-       LIST_HEAD(wait_list);
-       struct blk_plug plug;
-
-       xfs_buf_runall_queues(xfsconvertd_workqueue);
-       xfs_buf_runall_queues(xfsdatad_workqueue);
-       xfs_buf_runall_queues(xfslogd_workqueue);
-
-       set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-       pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
-
-       /*
-        * Dropped the delayed write list lock, now walk the temporary list.
-        * All I/O is issued async and then if we need to wait for completion
-        * we do that after issuing all the IO.
-        */
-       list_sort(NULL, &tmp_list, xfs_buf_cmp);
-
-       blk_start_plug(&plug);
-       while (!list_empty(&tmp_list)) {
-               bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
-               ASSERT(target == bp->b_target);
-               list_del_init(&bp->b_list);
-               if (wait) {
-                       bp->b_flags &= ~XBF_ASYNC;
-                       list_add(&bp->b_list, &wait_list);
-               }
-               xfs_bdstrat_cb(bp);
-       }
-       blk_finish_plug(&plug);
-
-       if (wait) {
-               /* Wait for IO to complete. */
-               while (!list_empty(&wait_list)) {
-                       bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
-
-                       list_del_init(&bp->b_list);
-                       xfs_buf_iowait(bp);
-                       xfs_buf_relse(bp);
-               }
-       }
-
-       return pincount;
-}
-
-int __init
-xfs_buf_init(void)
-{
-       xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
-                                               KM_ZONE_HWALIGN, NULL);
-       if (!xfs_buf_zone)
-               goto out;
-
-       xfslogd_workqueue = alloc_workqueue("xfslogd",
-                                       WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
-       if (!xfslogd_workqueue)
-               goto out_free_buf_zone;
-
-       xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
-       if (!xfsdatad_workqueue)
-               goto out_destroy_xfslogd_workqueue;
-
-       xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
-                                               WQ_MEM_RECLAIM, 1);
-       if (!xfsconvertd_workqueue)
-               goto out_destroy_xfsdatad_workqueue;
-
-       return 0;
-
- out_destroy_xfsdatad_workqueue:
-       destroy_workqueue(xfsdatad_workqueue);
- out_destroy_xfslogd_workqueue:
-       destroy_workqueue(xfslogd_workqueue);
- out_free_buf_zone:
-       kmem_zone_destroy(xfs_buf_zone);
- out:
-       return -ENOMEM;
-}
-
-void
-xfs_buf_terminate(void)
-{
-       destroy_workqueue(xfsconvertd_workqueue);
-       destroy_workqueue(xfsdatad_workqueue);
-       destroy_workqueue(xfslogd_workqueue);
-       kmem_zone_destroy(xfs_buf_zone);
-}
-
-#ifdef CONFIG_KDB_MODULES
-struct list_head *
-xfs_get_buftarg_list(void)
-{
-       return &xfs_buftarg_list;
-}
-#endif
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
deleted file mode 100644 (file)
index 620972b..0000000
+++ /dev/null
@@ -1,326 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_BUF_H__
-#define __XFS_BUF_H__
-
-#include <linux/list.h>
-#include <linux/types.h>
-#include <linux/spinlock.h>
-#include <asm/system.h>
-#include <linux/mm.h>
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/uio.h>
-
-/*
- *     Base types
- */
-
-#define XFS_BUF_DADDR_NULL     ((xfs_daddr_t) (-1LL))
-
-#define xfs_buf_ctob(pp)       ((pp) * PAGE_CACHE_SIZE)
-#define xfs_buf_btoc(dd)       (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_btoct(dd)      ((dd) >> PAGE_CACHE_SHIFT)
-#define xfs_buf_poff(aa)       ((aa) & ~PAGE_CACHE_MASK)
-
-typedef enum {
-       XBRW_READ = 1,                  /* transfer into target memory */
-       XBRW_WRITE = 2,                 /* transfer from target memory */
-       XBRW_ZERO = 3,                  /* Zero target memory */
-} xfs_buf_rw_t;
-
-#define XBF_READ       (1 << 0) /* buffer intended for reading from device */
-#define XBF_WRITE      (1 << 1) /* buffer intended for writing to device */
-#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
-#define XBF_MAPPED     (1 << 3) /* buffer mapped (b_addr valid) */
-#define XBF_ASYNC      (1 << 4) /* initiator will not wait for completion */
-#define XBF_DONE       (1 << 5) /* all pages in the buffer uptodate */
-#define XBF_DELWRI     (1 << 6) /* buffer has dirty pages */
-#define XBF_STALE      (1 << 7) /* buffer has been staled, do not find it */
-
-/* I/O hints for the BIO layer */
-#define XBF_SYNCIO     (1 << 10)/* treat this buffer as synchronous I/O */
-#define XBF_FUA                (1 << 11)/* force cache write through mode */
-#define XBF_FLUSH      (1 << 12)/* flush the disk cache before a write */
-
-/* flags used only as arguments to access routines */
-#define XBF_LOCK       (1 << 15)/* lock requested */
-#define XBF_TRYLOCK    (1 << 16)/* lock requested, but do not wait */
-#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */
-
-/* flags used only internally */
-#define _XBF_PAGES     (1 << 20)/* backed by refcounted pages */
-#define _XBF_KMEM      (1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q  (1 << 22)/* buffer on delwri queue */
-
-typedef unsigned int xfs_buf_flags_t;
-
-#define XFS_BUF_FLAGS \
-       { XBF_READ,             "READ" }, \
-       { XBF_WRITE,            "WRITE" }, \
-       { XBF_READ_AHEAD,       "READ_AHEAD" }, \
-       { XBF_MAPPED,           "MAPPED" }, \
-       { XBF_ASYNC,            "ASYNC" }, \
-       { XBF_DONE,             "DONE" }, \
-       { XBF_DELWRI,           "DELWRI" }, \
-       { XBF_STALE,            "STALE" }, \
-       { XBF_SYNCIO,           "SYNCIO" }, \
-       { XBF_FUA,              "FUA" }, \
-       { XBF_FLUSH,            "FLUSH" }, \
-       { XBF_LOCK,             "LOCK" },       /* should never be set */\
-       { XBF_TRYLOCK,          "TRYLOCK" },    /* ditto */\
-       { XBF_DONT_BLOCK,       "DONT_BLOCK" }, /* ditto */\
-       { _XBF_PAGES,           "PAGES" }, \
-       { _XBF_KMEM,            "KMEM" }, \
-       { _XBF_DELWRI_Q,        "DELWRI_Q" }
-
-typedef enum {
-       XBT_FORCE_SLEEP = 0,
-       XBT_FORCE_FLUSH = 1,
-} xfs_buftarg_flags_t;
-
-typedef struct xfs_buftarg {
-       dev_t                   bt_dev;
-       struct block_device     *bt_bdev;
-       struct backing_dev_info *bt_bdi;
-       struct xfs_mount        *bt_mount;
-       unsigned int            bt_bsize;
-       unsigned int            bt_sshift;
-       size_t                  bt_smask;
-
-       /* per device delwri queue */
-       struct task_struct      *bt_task;
-       struct list_head        bt_delwrite_queue;
-       spinlock_t              bt_delwrite_lock;
-       unsigned long           bt_flags;
-
-       /* LRU control structures */
-       struct shrinker         bt_shrinker;
-       struct list_head        bt_lru;
-       spinlock_t              bt_lru_lock;
-       unsigned int            bt_lru_nr;
-} xfs_buftarg_t;
-
-struct xfs_buf;
-typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
-
-#define XB_PAGES       2
-
-typedef struct xfs_buf {
-       /*
-        * first cacheline holds all the fields needed for an uncontended cache
-        * hit to be fully processed. The semaphore straddles the cacheline
-        * boundary, but the counter and lock sits on the first cacheline,
-        * which is the only bit that is touched if we hit the semaphore
-        * fast-path on locking.
-        */
-       struct rb_node          b_rbnode;       /* rbtree node */
-       xfs_off_t               b_file_offset;  /* offset in file */
-       size_t                  b_buffer_length;/* size of buffer in bytes */
-       atomic_t                b_hold;         /* reference count */
-       atomic_t                b_lru_ref;      /* lru reclaim ref count */
-       xfs_buf_flags_t         b_flags;        /* status flags */
-       struct semaphore        b_sema;         /* semaphore for lockables */
-
-       struct list_head        b_lru;          /* lru list */
-       wait_queue_head_t       b_waiters;      /* unpin waiters */
-       struct list_head        b_list;
-       struct xfs_perag        *b_pag;         /* contains rbtree root */
-       xfs_buftarg_t           *b_target;      /* buffer target (device) */
-       xfs_daddr_t             b_bn;           /* block number for I/O */
-       size_t                  b_count_desired;/* desired transfer size */
-       void                    *b_addr;        /* virtual address of buffer */
-       struct work_struct      b_iodone_work;
-       xfs_buf_iodone_t        b_iodone;       /* I/O completion function */
-       struct completion       b_iowait;       /* queue for I/O waiters */
-       void                    *b_fspriv;
-       struct xfs_trans        *b_transp;
-       struct page             **b_pages;      /* array of page pointers */
-       struct page             *b_page_array[XB_PAGES]; /* inline pages */
-       unsigned long           b_queuetime;    /* time buffer was queued */
-       atomic_t                b_pin_count;    /* pin count */
-       atomic_t                b_io_remaining; /* #outstanding I/O requests */
-       unsigned int            b_page_count;   /* size of page array */
-       unsigned int            b_offset;       /* page offset in first page */
-       unsigned short          b_error;        /* error code on I/O */
-#ifdef XFS_BUF_LOCK_TRACKING
-       int                     b_last_holder;
-#endif
-} xfs_buf_t;
-
-
-/* Finding and Reading Buffers */
-extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
-                               xfs_buf_flags_t, xfs_buf_t *);
-#define xfs_incore(buftarg,blkno,len,lockit) \
-       _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
-
-extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
-                               xfs_buf_flags_t);
-extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
-                               xfs_buf_flags_t);
-
-extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
-extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
-extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
-extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
-extern void xfs_buf_hold(xfs_buf_t *);
-extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
-struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
-                               struct xfs_buftarg *target,
-                               xfs_daddr_t daddr, size_t length, int flags);
-
-/* Releasing Buffers */
-extern void xfs_buf_free(xfs_buf_t *);
-extern void xfs_buf_rele(xfs_buf_t *);
-
-/* Locking and Unlocking Buffers */
-extern int xfs_buf_trylock(xfs_buf_t *);
-extern void xfs_buf_lock(xfs_buf_t *);
-extern void xfs_buf_unlock(xfs_buf_t *);
-#define xfs_buf_islocked(bp) \
-       ((bp)->b_sema.count <= 0)
-
-/* Buffer Read and Write Routines */
-extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
-extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
-
-extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
-extern int xfs_bdstrat_cb(struct xfs_buf *);
-
-extern void xfs_buf_ioend(xfs_buf_t *, int);
-extern void xfs_buf_ioerror(xfs_buf_t *, int);
-extern int xfs_buf_iorequest(xfs_buf_t *);
-extern int xfs_buf_iowait(xfs_buf_t *);
-extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
-                               xfs_buf_rw_t);
-#define xfs_buf_zero(bp, off, len) \
-           xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
-
-static inline int xfs_buf_geterror(xfs_buf_t *bp)
-{
-       return bp ? bp->b_error : ENOMEM;
-}
-
-/* Buffer Utility Routines */
-extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
-
-/* Delayed Write Buffer Routines */
-extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
-extern void xfs_buf_delwri_promote(xfs_buf_t *);
-
-/* Buffer Daemon Setup Routines */
-extern int xfs_buf_init(void);
-extern void xfs_buf_terminate(void);
-
-static inline const char *
-xfs_buf_target_name(struct xfs_buftarg *target)
-{
-       static char __b[BDEVNAME_SIZE];
-
-       return bdevname(target->bt_bdev, __b);
-}
-
-
-#define XFS_BUF_ZEROFLAGS(bp) \
-       ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
-                           XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
-
-void xfs_buf_stale(struct xfs_buf *bp);
-#define XFS_BUF_STALE(bp)      xfs_buf_stale(bp);
-#define XFS_BUF_UNSTALE(bp)    ((bp)->b_flags &= ~XBF_STALE)
-#define XFS_BUF_ISSTALE(bp)    ((bp)->b_flags & XBF_STALE)
-#define XFS_BUF_SUPER_STALE(bp)        do {                            \
-                                       XFS_BUF_STALE(bp);      \
-                                       xfs_buf_delwri_dequeue(bp);     \
-                                       XFS_BUF_DONE(bp);       \
-                               } while (0)
-
-#define XFS_BUF_DELAYWRITE(bp)         ((bp)->b_flags |= XBF_DELWRI)
-#define XFS_BUF_UNDELAYWRITE(bp)       xfs_buf_delwri_dequeue(bp)
-#define XFS_BUF_ISDELAYWRITE(bp)       ((bp)->b_flags & XBF_DELWRI)
-
-#define XFS_BUF_DONE(bp)       ((bp)->b_flags |= XBF_DONE)
-#define XFS_BUF_UNDONE(bp)     ((bp)->b_flags &= ~XBF_DONE)
-#define XFS_BUF_ISDONE(bp)     ((bp)->b_flags & XBF_DONE)
-
-#define XFS_BUF_ASYNC(bp)      ((bp)->b_flags |= XBF_ASYNC)
-#define XFS_BUF_UNASYNC(bp)    ((bp)->b_flags &= ~XBF_ASYNC)
-#define XFS_BUF_ISASYNC(bp)    ((bp)->b_flags & XBF_ASYNC)
-
-#define XFS_BUF_READ(bp)       ((bp)->b_flags |= XBF_READ)
-#define XFS_BUF_UNREAD(bp)     ((bp)->b_flags &= ~XBF_READ)
-#define XFS_BUF_ISREAD(bp)     ((bp)->b_flags & XBF_READ)
-
-#define XFS_BUF_WRITE(bp)      ((bp)->b_flags |= XBF_WRITE)
-#define XFS_BUF_UNWRITE(bp)    ((bp)->b_flags &= ~XBF_WRITE)
-#define XFS_BUF_ISWRITE(bp)    ((bp)->b_flags & XBF_WRITE)
-
-#define XFS_BUF_ADDR(bp)               ((bp)->b_bn)
-#define XFS_BUF_SET_ADDR(bp, bno)      ((bp)->b_bn = (xfs_daddr_t)(bno))
-#define XFS_BUF_OFFSET(bp)             ((bp)->b_file_offset)
-#define XFS_BUF_SET_OFFSET(bp, off)    ((bp)->b_file_offset = (off))
-#define XFS_BUF_COUNT(bp)              ((bp)->b_count_desired)
-#define XFS_BUF_SET_COUNT(bp, cnt)     ((bp)->b_count_desired = (cnt))
-#define XFS_BUF_SIZE(bp)               ((bp)->b_buffer_length)
-#define XFS_BUF_SET_SIZE(bp, cnt)      ((bp)->b_buffer_length = (cnt))
-
-static inline void
-xfs_buf_set_ref(
-       struct xfs_buf  *bp,
-       int             lru_ref)
-{
-       atomic_set(&bp->b_lru_ref, lru_ref);
-}
-#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)   xfs_buf_set_ref(bp, ref)
-#define XFS_BUF_SET_VTYPE(bp, type)            do { } while (0)
-
-static inline int xfs_buf_ispinned(struct xfs_buf *bp)
-{
-       return atomic_read(&bp->b_pin_count);
-}
-
-#define XFS_BUF_FINISH_IOWAIT(bp)      complete(&bp->b_iowait);
-
-static inline void xfs_buf_relse(xfs_buf_t *bp)
-{
-       xfs_buf_unlock(bp);
-       xfs_buf_rele(bp);
-}
-
-/*
- *     Handling of buftargs.
- */
-extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
-                       struct block_device *, int, const char *);
-extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
-extern void xfs_wait_buftarg(xfs_buftarg_t *);
-extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
-extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
-
-#ifdef CONFIG_KDB_MODULES
-extern struct list_head *xfs_get_buftarg_list(void);
-#endif
-
-#define xfs_getsize_buftarg(buftarg)   block_size((buftarg)->bt_bdev)
-#define xfs_readonly_buftarg(buftarg)  bdev_read_only((buftarg)->bt_bdev)
-
-#define xfs_binval(buftarg)            xfs_flush_buftarg(buftarg, 1)
-#define XFS_bflush(buftarg)            xfs_flush_buftarg(buftarg, 1)
-
-#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
deleted file mode 100644 (file)
index 244e797..0000000
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (C) 2010 Red Hat, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_sb.h"
-#include "xfs_inum.h"
-#include "xfs_log.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_quota.h"
-#include "xfs_trans.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_btree.h"
-#include "xfs_inode.h"
-#include "xfs_alloc.h"
-#include "xfs_error.h"
-#include "xfs_discard.h"
-#include "xfs_trace.h"
-
-STATIC int
-xfs_trim_extents(
-       struct xfs_mount        *mp,
-       xfs_agnumber_t          agno,
-       xfs_fsblock_t           start,
-       xfs_fsblock_t           len,
-       xfs_fsblock_t           minlen,
-       __uint64_t              *blocks_trimmed)
-{
-       struct block_device     *bdev = mp->m_ddev_targp->bt_bdev;
-       struct xfs_btree_cur    *cur;
-       struct xfs_buf          *agbp;
-       struct xfs_perag        *pag;
-       int                     error;
-       int                     i;
-
-       pag = xfs_perag_get(mp, agno);
-
-       error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
-       if (error || !agbp)
-               goto out_put_perag;
-
-       cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
-
-       /*
-        * Force out the log.  This means any transactions that might have freed
-        * space before we took the AGF buffer lock are now on disk, and the
-        * volatile disk cache is flushed.
-        */
-       xfs_log_force(mp, XFS_LOG_SYNC);
-
-       /*
-        * Look up the longest btree in the AGF and start with it.
-        */
-       error = xfs_alloc_lookup_le(cur, 0,
-                                   XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
-       if (error)
-               goto out_del_cursor;
-
-       /*
-        * Loop until we are done with all extents that are large
-        * enough to be worth discarding.
-        */
-       while (i) {
-               xfs_agblock_t fbno;
-               xfs_extlen_t flen;
-
-               error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
-               if (error)
-                       goto out_del_cursor;
-               XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
-               ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
-
-               /*
-                * Too small?  Give up.
-                */
-               if (flen < minlen) {
-                       trace_xfs_discard_toosmall(mp, agno, fbno, flen);
-                       goto out_del_cursor;
-               }
-
-               /*
-                * If the extent is entirely outside of the range we are
-                * supposed to discard skip it.  Do not bother to trim
-                * down partially overlapping ranges for now.
-                */
-               if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
-                   XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
-                       trace_xfs_discard_exclude(mp, agno, fbno, flen);
-                       goto next_extent;
-               }
-
-               /*
-                * If any blocks in the range are still busy, skip the
-                * discard and try again the next time.
-                */
-               if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
-                       trace_xfs_discard_busy(mp, agno, fbno, flen);
-                       goto next_extent;
-               }
-
-               trace_xfs_discard_extent(mp, agno, fbno, flen);
-               error = -blkdev_issue_discard(bdev,
-                               XFS_AGB_TO_DADDR(mp, agno, fbno),
-                               XFS_FSB_TO_BB(mp, flen),
-                               GFP_NOFS, 0);
-               if (error)
-                       goto out_del_cursor;
-               *blocks_trimmed += flen;
-
-next_extent:
-               error = xfs_btree_decrement(cur, 0, &i);
-               if (error)
-                       goto out_del_cursor;
-       }
-
-out_del_cursor:
-       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
-       xfs_buf_relse(agbp);
-out_put_perag:
-       xfs_perag_put(pag);
-       return error;
-}
-
-int
-xfs_ioc_trim(
-       struct xfs_mount                *mp,
-       struct fstrim_range __user      *urange)
-{
-       struct request_queue    *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
-       unsigned int            granularity = q->limits.discard_granularity;
-       struct fstrim_range     range;
-       xfs_fsblock_t           start, len, minlen;
-       xfs_agnumber_t          start_agno, end_agno, agno;
-       __uint64_t              blocks_trimmed = 0;
-       int                     error, last_error = 0;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
-       if (!blk_queue_discard(q))
-               return -XFS_ERROR(EOPNOTSUPP);
-       if (copy_from_user(&range, urange, sizeof(range)))
-               return -XFS_ERROR(EFAULT);
-
-       /*
-        * Truncating down the len isn't actually quite correct, but using
-        * XFS_B_TO_FSB would mean we trivially get overflows for values
-        * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
-        * used by the fstrim application.  In the end it really doesn't
-        * matter as trimming blocks is an advisory interface.
-        */
-       start = XFS_B_TO_FSBT(mp, range.start);
-       len = XFS_B_TO_FSBT(mp, range.len);
-       minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
-
-       start_agno = XFS_FSB_TO_AGNO(mp, start);
-       if (start_agno >= mp->m_sb.sb_agcount)
-               return -XFS_ERROR(EINVAL);
-
-       end_agno = XFS_FSB_TO_AGNO(mp, start + len);
-       if (end_agno >= mp->m_sb.sb_agcount)
-               end_agno = mp->m_sb.sb_agcount - 1;
-
-       for (agno = start_agno; agno <= end_agno; agno++) {
-               error = -xfs_trim_extents(mp, agno, start, len, minlen,
-                                         &blocks_trimmed);
-               if (error)
-                       last_error = error;
-       }
-
-       if (last_error)
-               return last_error;
-
-       range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
-       if (copy_to_user(urange, &range, sizeof(range)))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-int
-xfs_discard_extents(
-       struct xfs_mount        *mp,
-       struct list_head        *list)
-{
-       struct xfs_busy_extent  *busyp;
-       int                     error = 0;
-
-       list_for_each_entry(busyp, list, list) {
-               trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
-                                        busyp->length);
-
-               error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
-                               XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
-                               XFS_FSB_TO_BB(mp, busyp->length),
-                               GFP_NOFS, 0);
-               if (error && error != EOPNOTSUPP) {
-                       xfs_info(mp,
-        "discard failed for extent [0x%llu,%u], error %d",
-                                (unsigned long long)busyp->bno,
-                                busyp->length,
-                                error);
-                       return error;
-               }
-       }
-
-       return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_discard.h b/fs/xfs/linux-2.6/xfs_discard.h
deleted file mode 100644 (file)
index 344879a..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef XFS_DISCARD_H
-#define XFS_DISCARD_H 1
-
-struct fstrim_range;
-struct list_head;
-
-extern int     xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
-extern int     xfs_discard_extents(struct xfs_mount *, struct list_head *);
-
-#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
deleted file mode 100644 (file)
index 75e5d32..0000000
+++ /dev/null
@@ -1,250 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_types.h"
-#include "xfs_inum.h"
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_mount.h"
-#include "xfs_export.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_trace.h"
-
-/*
- * Note that we only accept fileids which are long enough rather than allow
- * the parent generation number to default to zero.  XFS considers zero a
- * valid generation number not an invalid/wildcard value.
- */
-static int xfs_fileid_length(int fileid_type)
-{
-       switch (fileid_type) {
-       case FILEID_INO32_GEN:
-               return 2;
-       case FILEID_INO32_GEN_PARENT:
-               return 4;
-       case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
-               return 3;
-       case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
-               return 6;
-       }
-       return 255; /* invalid */
-}
-
-STATIC int
-xfs_fs_encode_fh(
-       struct dentry           *dentry,
-       __u32                   *fh,
-       int                     *max_len,
-       int                     connectable)
-{
-       struct fid              *fid = (struct fid *)fh;
-       struct xfs_fid64        *fid64 = (struct xfs_fid64 *)fh;
-       struct inode            *inode = dentry->d_inode;
-       int                     fileid_type;
-       int                     len;
-
-       /* Directories don't need their parent encoded, they have ".." */
-       if (S_ISDIR(inode->i_mode) || !connectable)
-               fileid_type = FILEID_INO32_GEN;
-       else
-               fileid_type = FILEID_INO32_GEN_PARENT;
-
-       /*
-        * If the the filesystem may contain 64bit inode numbers, we need
-        * to use larger file handles that can represent them.
-        *
-        * While we only allocate inodes that do not fit into 32 bits any
-        * large enough filesystem may contain them, thus the slightly
-        * confusing looking conditional below.
-        */
-       if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
-           (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
-               fileid_type |= XFS_FILEID_TYPE_64FLAG;
-
-       /*
-        * Only encode if there is enough space given.  In practice
-        * this means we can't export a filesystem with 64bit inodes
-        * over NFSv2 with the subtree_check export option; the other
-        * seven combinations work.  The real answer is "don't use v2".
-        */
-       len = xfs_fileid_length(fileid_type);
-       if (*max_len < len) {
-               *max_len = len;
-               return 255;
-       }
-       *max_len = len;
-
-       switch (fileid_type) {
-       case FILEID_INO32_GEN_PARENT:
-               spin_lock(&dentry->d_lock);
-               fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino;
-               fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
-               spin_unlock(&dentry->d_lock);
-               /*FALLTHRU*/
-       case FILEID_INO32_GEN:
-               fid->i32.ino = inode->i_ino;
-               fid->i32.gen = inode->i_generation;
-               break;
-       case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
-               spin_lock(&dentry->d_lock);
-               fid64->parent_ino = dentry->d_parent->d_inode->i_ino;
-               fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
-               spin_unlock(&dentry->d_lock);
-               /*FALLTHRU*/
-       case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
-               fid64->ino = inode->i_ino;
-               fid64->gen = inode->i_generation;
-               break;
-       }
-
-       return fileid_type;
-}
-
-STATIC struct inode *
-xfs_nfs_get_inode(
-       struct super_block      *sb,
-       u64                     ino,
-       u32                     generation)
- {
-       xfs_mount_t             *mp = XFS_M(sb);
-       xfs_inode_t             *ip;
-       int                     error;
-
-       /*
-        * NFS can sometimes send requests for ino 0.  Fail them gracefully.
-        */
-       if (ino == 0)
-               return ERR_PTR(-ESTALE);
-
-       /*
-        * The XFS_IGET_UNTRUSTED means that an invalid inode number is just
-        * fine and not an indication of a corrupted filesystem as clients can
-        * send invalid file handles and we have to handle it gracefully..
-        */
-       error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
-       if (error) {
-               /*
-                * EINVAL means the inode cluster doesn't exist anymore.
-                * This implies the filehandle is stale, so we should
-                * translate it here.
-                * We don't use ESTALE directly down the chain to not
-                * confuse applications using bulkstat that expect EINVAL.
-                */
-               if (error == EINVAL || error == ENOENT)
-                       error = ESTALE;
-               return ERR_PTR(-error);
-       }
-
-       if (ip->i_d.di_gen != generation) {
-               IRELE(ip);
-               return ERR_PTR(-ESTALE);
-       }
-
-       return VFS_I(ip);
-}
-
-STATIC struct dentry *
-xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
-                int fh_len, int fileid_type)
-{
-       struct xfs_fid64        *fid64 = (struct xfs_fid64 *)fid;
-       struct inode            *inode = NULL;
-
-       if (fh_len < xfs_fileid_length(fileid_type))
-               return NULL;
-
-       switch (fileid_type) {
-       case FILEID_INO32_GEN_PARENT:
-       case FILEID_INO32_GEN:
-               inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);
-               break;
-       case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
-       case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
-               inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen);
-               break;
-       }
-
-       return d_obtain_alias(inode);
-}
-
-STATIC struct dentry *
-xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
-                int fh_len, int fileid_type)
-{
-       struct xfs_fid64        *fid64 = (struct xfs_fid64 *)fid;
-       struct inode            *inode = NULL;
-
-       switch (fileid_type) {
-       case FILEID_INO32_GEN_PARENT:
-               inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
-                                             fid->i32.parent_gen);
-               break;
-       case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
-               inode = xfs_nfs_get_inode(sb, fid64->parent_ino,
-                                             fid64->parent_gen);
-               break;
-       }
-
-       return d_obtain_alias(inode);
-}
-
-STATIC struct dentry *
-xfs_fs_get_parent(
-       struct dentry           *child)
-{
-       int                     error;
-       struct xfs_inode        *cip;
-
-       error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
-       if (unlikely(error))
-               return ERR_PTR(-error);
-
-       return d_obtain_alias(VFS_I(cip));
-}
-
-STATIC int
-xfs_fs_nfs_commit_metadata(
-       struct inode            *inode)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       int                     error = 0;
-
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       if (xfs_ipincount(ip)) {
-               error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
-                               XFS_LOG_SYNC, NULL);
-       }
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-       return error;
-}
-
-const struct export_operations xfs_export_operations = {
-       .encode_fh              = xfs_fs_encode_fh,
-       .fh_to_dentry           = xfs_fs_fh_to_dentry,
-       .fh_to_parent           = xfs_fs_fh_to_parent,
-       .get_parent             = xfs_fs_get_parent,
-       .commit_metadata        = xfs_fs_nfs_commit_metadata,
-};
diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h
deleted file mode 100644 (file)
index 3272b6a..0000000
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_EXPORT_H__
-#define __XFS_EXPORT_H__
-
-/*
- * Common defines for code related to exporting XFS filesystems over NFS.
- *
- * The NFS fileid goes out on the wire as an array of
- * 32bit unsigned ints in host order.  There are 5 possible
- * formats.
- *
- * (1) fileid_type=0x00
- *     (no fileid data; handled by the generic code)
- *
- * (2) fileid_type=0x01
- *     inode-num
- *     generation
- *
- * (3) fileid_type=0x02
- *     inode-num
- *     generation
- *     parent-inode-num
- *     parent-generation
- *
- * (4) fileid_type=0x81
- *     inode-num-lo32
- *     inode-num-hi32
- *     generation
- *
- * (5) fileid_type=0x82
- *     inode-num-lo32
- *     inode-num-hi32
- *     generation
- *     parent-inode-num-lo32
- *     parent-inode-num-hi32
- *     parent-generation
- *
- * Note, the NFS filehandle also includes an fsid portion which
- * may have an inode number in it.  That number is hardcoded to
- * 32bits and there is no way for XFS to intercept it.  In
- * practice this means when exporting an XFS filesystem with 64bit
- * inodes you should either export the mountpoint (rather than
- * a subdirectory) or use the "fsid" export option.
- */
-
-struct xfs_fid64 {
-       u64 ino;
-       u32 gen;
-       u64 parent_ino;
-       u32 parent_gen;
-} __attribute__((packed));
-
-/* This flag goes on the wire.  Don't play with it. */
-#define XFS_FILEID_TYPE_64FLAG 0x80    /* NFS fileid has 64bit inodes */
-
-#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
deleted file mode 100644 (file)
index 7f7b424..0000000
+++ /dev/null
@@ -1,1096 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_trans.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_error.h"
-#include "xfs_vnodeops.h"
-#include "xfs_da_btree.h"
-#include "xfs_ioctl.h"
-#include "xfs_trace.h"
-
-#include <linux/dcache.h>
-#include <linux/falloc.h>
-
-static const struct vm_operations_struct xfs_file_vm_ops;
-
-/*
- * Locking primitives for read and write IO paths to ensure we consistently use
- * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
- */
-static inline void
-xfs_rw_ilock(
-       struct xfs_inode        *ip,
-       int                     type)
-{
-       if (type & XFS_IOLOCK_EXCL)
-               mutex_lock(&VFS_I(ip)->i_mutex);
-       xfs_ilock(ip, type);
-}
-
-static inline void
-xfs_rw_iunlock(
-       struct xfs_inode        *ip,
-       int                     type)
-{
-       xfs_iunlock(ip, type);
-       if (type & XFS_IOLOCK_EXCL)
-               mutex_unlock(&VFS_I(ip)->i_mutex);
-}
-
-static inline void
-xfs_rw_ilock_demote(
-       struct xfs_inode        *ip,
-       int                     type)
-{
-       xfs_ilock_demote(ip, type);
-       if (type & XFS_IOLOCK_EXCL)
-               mutex_unlock(&VFS_I(ip)->i_mutex);
-}
-
-/*
- *     xfs_iozero
- *
- *     xfs_iozero clears the specified range of buffer supplied,
- *     and marks all the affected blocks as valid and modified.  If
- *     an affected block is not allocated, it will be allocated.  If
- *     an affected block is not completely overwritten, and is not
- *     valid before the operation, it will be read from disk before
- *     being partially zeroed.
- */
-STATIC int
-xfs_iozero(
-       struct xfs_inode        *ip,    /* inode                        */
-       loff_t                  pos,    /* offset in file               */
-       size_t                  count)  /* size of data to zero         */
-{
-       struct page             *page;
-       struct address_space    *mapping;
-       int                     status;
-
-       mapping = VFS_I(ip)->i_mapping;
-       do {
-               unsigned offset, bytes;
-               void *fsdata;
-
-               offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
-               bytes = PAGE_CACHE_SIZE - offset;
-               if (bytes > count)
-                       bytes = count;
-
-               status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                       AOP_FLAG_UNINTERRUPTIBLE,
-                                       &page, &fsdata);
-               if (status)
-                       break;
-
-               zero_user(page, offset, bytes);
-
-               status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-                                       page, fsdata);
-               WARN_ON(status <= 0); /* can't return less than zero! */
-               pos += bytes;
-               count -= bytes;
-               status = 0;
-       } while (count);
-
-       return (-status);
-}
-
-STATIC int
-xfs_file_fsync(
-       struct file             *file,
-       loff_t                  start,
-       loff_t                  end,
-       int                     datasync)
-{
-       struct inode            *inode = file->f_mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_trans        *tp;
-       int                     error = 0;
-       int                     log_flushed = 0;
-
-       trace_xfs_file_fsync(ip);
-
-       error = filemap_write_and_wait_range(inode->i_mapping, start, end);
-       if (error)
-               return error;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
-
-       xfs_iflags_clear(ip, XFS_ITRUNCATED);
-
-       xfs_ilock(ip, XFS_IOLOCK_SHARED);
-       xfs_ioend_wait(ip);
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
-       if (mp->m_flags & XFS_MOUNT_BARRIER) {
-               /*
-                * If we have an RT and/or log subvolume we need to make sure
-                * to flush the write cache the device used for file data
-                * first.  This is to ensure newly written file data make
-                * it to disk before logging the new inode size in case of
-                * an extending write.
-                */
-               if (XFS_IS_REALTIME_INODE(ip))
-                       xfs_blkdev_issue_flush(mp->m_rtdev_targp);
-               else if (mp->m_logdev_targp != mp->m_ddev_targp)
-                       xfs_blkdev_issue_flush(mp->m_ddev_targp);
-       }
-
-       /*
-        * We always need to make sure that the required inode state is safe on
-        * disk.  The inode might be clean but we still might need to force the
-        * log because of committed transactions that haven't hit the disk yet.
-        * Likewise, there could be unflushed non-transactional changes to the
-        * inode core that have to go to disk and this requires us to issue
-        * a synchronous transaction to capture these changes correctly.
-        *
-        * This code relies on the assumption that if the i_update_core field
-        * of the inode is clear and the inode is unpinned then it is clean
-        * and no action is required.
-        */
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-
-       /*
-        * First check if the VFS inode is marked dirty.  All the dirtying
-        * of non-transactional updates no goes through mark_inode_dirty*,
-        * which allows us to distinguish beteeen pure timestamp updates
-        * and i_size updates which need to be caught for fdatasync.
-        * After that also theck for the dirty state in the XFS inode, which
-        * might gets cleared when the inode gets written out via the AIL
-        * or xfs_iflush_cluster.
-        */
-       if (((inode->i_state & I_DIRTY_DATASYNC) ||
-           ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
-           ip->i_update_core) {
-               /*
-                * Kick off a transaction to log the inode core to get the
-                * updates.  The sync transaction will also force the log.
-                */
-               xfs_iunlock(ip, XFS_ILOCK_SHARED);
-               tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-               error = xfs_trans_reserve(tp, 0,
-                               XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
-               if (error) {
-                       xfs_trans_cancel(tp, 0);
-                       return -error;
-               }
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-               /*
-                * Note - it's possible that we might have pushed ourselves out
-                * of the way during trans_reserve which would flush the inode.
-                * But there's no guarantee that the inode buffer has actually
-                * gone out yet (it's delwri).  Plus the buffer could be pinned
-                * anyway if it's part of an inode in another recent
-                * transaction.  So we play it safe and fire off the
-                * transaction anyway.
-                */
-               xfs_trans_ijoin(tp, ip);
-               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-               xfs_trans_set_sync(tp);
-               error = _xfs_trans_commit(tp, 0, &log_flushed);
-
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       } else {
-               /*
-                * Timestamps/size haven't changed since last inode flush or
-                * inode transaction commit.  That means either nothing got
-                * written or a transaction committed which caught the updates.
-                * If the latter happened and the transaction hasn't hit the
-                * disk yet, the inode will be still be pinned.  If it is,
-                * force the log.
-                */
-               if (xfs_ipincount(ip)) {
-                       error = _xfs_log_force_lsn(mp,
-                                       ip->i_itemp->ili_last_lsn,
-                                       XFS_LOG_SYNC, &log_flushed);
-               }
-               xfs_iunlock(ip, XFS_ILOCK_SHARED);
-       }
-
-       /*
-        * If we only have a single device, and the log force about was
-        * a no-op we might have to flush the data device cache here.
-        * This can only happen for fdatasync/O_DSYNC if we were overwriting
-        * an already allocated file and thus do not have any metadata to
-        * commit.
-        */
-       if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
-           mp->m_logdev_targp == mp->m_ddev_targp &&
-           !XFS_IS_REALTIME_INODE(ip) &&
-           !log_flushed)
-               xfs_blkdev_issue_flush(mp->m_ddev_targp);
-
-       return -error;
-}
-
-STATIC ssize_t
-xfs_file_aio_read(
-       struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos)
-{
-       struct file             *file = iocb->ki_filp;
-       struct inode            *inode = file->f_mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       size_t                  size = 0;
-       ssize_t                 ret = 0;
-       int                     ioflags = 0;
-       xfs_fsize_t             n;
-       unsigned long           seg;
-
-       XFS_STATS_INC(xs_read_calls);
-
-       BUG_ON(iocb->ki_pos != pos);
-
-       if (unlikely(file->f_flags & O_DIRECT))
-               ioflags |= IO_ISDIRECT;
-       if (file->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
-
-       /* START copy & waste from filemap.c */
-       for (seg = 0; seg < nr_segs; seg++) {
-               const struct iovec *iv = &iovp[seg];
-
-               /*
-                * If any segment has a negative length, or the cumulative
-                * length ever wraps negative then return -EINVAL.
-                */
-               size += iv->iov_len;
-               if (unlikely((ssize_t)(size|iv->iov_len) < 0))
-                       return XFS_ERROR(-EINVAL);
-       }
-       /* END copy & waste from filemap.c */
-
-       if (unlikely(ioflags & IO_ISDIRECT)) {
-               xfs_buftarg_t   *target =
-                       XFS_IS_REALTIME_INODE(ip) ?
-                               mp->m_rtdev_targp : mp->m_ddev_targp;
-               if ((iocb->ki_pos & target->bt_smask) ||
-                   (size & target->bt_smask)) {
-                       if (iocb->ki_pos == ip->i_size)
-                               return 0;
-                       return -XFS_ERROR(EINVAL);
-               }
-       }
-
-       n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
-       if (n <= 0 || size == 0)
-               return 0;
-
-       if (n < size)
-               size = n;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -EIO;
-
-       if (unlikely(ioflags & IO_ISDIRECT)) {
-               xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
-
-               if (inode->i_mapping->nrpages) {
-                       ret = -xfs_flushinval_pages(ip,
-                                       (iocb->ki_pos & PAGE_CACHE_MASK),
-                                       -1, FI_REMAPF_LOCKED);
-                       if (ret) {
-                               xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
-                               return ret;
-                       }
-               }
-               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-       } else
-               xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-
-       trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
-
-       ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
-       if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
-
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
-       return ret;
-}
-
-STATIC ssize_t
-xfs_file_splice_read(
-       struct file             *infilp,
-       loff_t                  *ppos,
-       struct pipe_inode_info  *pipe,
-       size_t                  count,
-       unsigned int            flags)
-{
-       struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
-       int                     ioflags = 0;
-       ssize_t                 ret;
-
-       XFS_STATS_INC(xs_read_calls);
-
-       if (infilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
-
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return -EIO;
-
-       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-
-       trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
-
-       ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
-       if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
-
-       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
-       return ret;
-}
-
-STATIC void
-xfs_aio_write_isize_update(
-       struct inode    *inode,
-       loff_t          *ppos,
-       ssize_t         bytes_written)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             isize = i_size_read(inode);
-
-       if (bytes_written > 0)
-               XFS_STATS_ADD(xs_write_bytes, bytes_written);
-
-       if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
-                                       *ppos > isize))
-               *ppos = isize;
-
-       if (*ppos > ip->i_size) {
-               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-               if (*ppos > ip->i_size)
-                       ip->i_size = *ppos;
-               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-}
-
-/*
- * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
- * part of the I/O may have been written to disk before the error occurred.  In
- * this case the on-disk file size may have been adjusted beyond the in-memory
- * file size and now needs to be truncated back.
- */
-STATIC void
-xfs_aio_write_newsize_update(
-       struct xfs_inode        *ip)
-{
-       if (ip->i_new_size) {
-               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
-               ip->i_new_size = 0;
-               if (ip->i_d.di_size > ip->i_size)
-                       ip->i_d.di_size = ip->i_size;
-               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-       }
-}
-
-/*
- * xfs_file_splice_write() does not use xfs_rw_ilock() because
- * generic_file_splice_write() takes the i_mutex itself. This, in theory,
- * couuld cause lock inversions between the aio_write path and the splice path
- * if someone is doing concurrent splice(2) based writes and write(2) based
- * writes to the same inode. The only real way to fix this is to re-implement
- * the generic code here with correct locking orders.
- */
-STATIC ssize_t
-xfs_file_splice_write(
-       struct pipe_inode_info  *pipe,
-       struct file             *outfilp,
-       loff_t                  *ppos,
-       size_t                  count,
-       unsigned int            flags)
-{
-       struct inode            *inode = outfilp->f_mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             new_size;
-       int                     ioflags = 0;
-       ssize_t                 ret;
-
-       XFS_STATS_INC(xs_write_calls);
-
-       if (outfilp->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
-
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return -EIO;
-
-       xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-       new_size = *ppos + count;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       if (new_size > ip->i_size)
-               ip->i_new_size = new_size;
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-       trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-
-       ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
-
-       xfs_aio_write_isize_update(inode, ppos, ret);
-       xfs_aio_write_newsize_update(ip);
-       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       return ret;
-}
-
-/*
- * This routine is called to handle zeroing any space in the last
- * block of the file that is beyond the EOF.  We do this since the
- * size is being increased without writing anything to that block
- * and we don't want anyone to read the garbage on the disk.
- */
-STATIC int                             /* error (positive) */
-xfs_zero_last_block(
-       xfs_inode_t     *ip,
-       xfs_fsize_t     offset,
-       xfs_fsize_t     isize)
-{
-       xfs_fileoff_t   last_fsb;
-       xfs_mount_t     *mp = ip->i_mount;
-       int             nimaps;
-       int             zero_offset;
-       int             zero_len;
-       int             error = 0;
-       xfs_bmbt_irec_t imap;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-       zero_offset = XFS_B_FSB_OFFSET(mp, isize);
-       if (zero_offset == 0) {
-               /*
-                * There are no extra bytes in the last block on disk to
-                * zero, so return.
-                */
-               return 0;
-       }
-
-       last_fsb = XFS_B_TO_FSBT(mp, isize);
-       nimaps = 1;
-       error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
-                         &nimaps, NULL);
-       if (error) {
-               return error;
-       }
-       ASSERT(nimaps > 0);
-       /*
-        * If the block underlying isize is just a hole, then there
-        * is nothing to zero.
-        */
-       if (imap.br_startblock == HOLESTARTBLOCK) {
-               return 0;
-       }
-       /*
-        * Zero the part of the last block beyond the EOF, and write it
-        * out sync.  We need to drop the ilock while we do this so we
-        * don't deadlock when the buffer cache calls back to us.
-        */
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-       zero_len = mp->m_sb.sb_blocksize - zero_offset;
-       if (isize + zero_len > offset)
-               zero_len = offset - isize;
-       error = xfs_iozero(ip, isize, zero_len);
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       ASSERT(error >= 0);
-       return error;
-}
-
-/*
- * Zero any on disk space between the current EOF and the new,
- * larger EOF.  This handles the normal case of zeroing the remainder
- * of the last block in the file and the unusual case of zeroing blocks
- * out beyond the size of the file.  This second case only happens
- * with fixed size extents and when the system crashes before the inode
- * size was updated but after blocks were allocated.  If fill is set,
- * then any holes in the range are filled and zeroed.  If not, the holes
- * are left alone as holes.
- */
-
-int                                    /* error (positive) */
-xfs_zero_eof(
-       xfs_inode_t     *ip,
-       xfs_off_t       offset,         /* starting I/O offset */
-       xfs_fsize_t     isize)          /* current inode size */
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       xfs_fileoff_t   start_zero_fsb;
-       xfs_fileoff_t   end_zero_fsb;
-       xfs_fileoff_t   zero_count_fsb;
-       xfs_fileoff_t   last_fsb;
-       xfs_fileoff_t   zero_off;
-       xfs_fsize_t     zero_len;
-       int             nimaps;
-       int             error = 0;
-       xfs_bmbt_irec_t imap;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-       ASSERT(offset > isize);
-
-       /*
-        * First handle zeroing the block on which isize resides.
-        * We only zero a part of that block so it is handled specially.
-        */
-       error = xfs_zero_last_block(ip, offset, isize);
-       if (error) {
-               ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-               return error;
-       }
-
-       /*
-        * Calculate the range between the new size and the old
-        * where blocks needing to be zeroed may exist.  To get the
-        * block where the last byte in the file currently resides,
-        * we need to subtract one from the size and truncate back
-        * to a block boundary.  We subtract 1 in case the size is
-        * exactly on a block boundary.
-        */
-       last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
-       start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
-       end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
-       ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
-       if (last_fsb == end_zero_fsb) {
-               /*
-                * The size was only incremented on its last block.
-                * We took care of that above, so just return.
-                */
-               return 0;
-       }
-
-       ASSERT(start_zero_fsb <= end_zero_fsb);
-       while (start_zero_fsb <= end_zero_fsb) {
-               nimaps = 1;
-               zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
-               error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
-                                 0, NULL, 0, &imap, &nimaps, NULL);
-               if (error) {
-                       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
-                       return error;
-               }
-               ASSERT(nimaps > 0);
-
-               if (imap.br_state == XFS_EXT_UNWRITTEN ||
-                   imap.br_startblock == HOLESTARTBLOCK) {
-                       /*
-                        * This loop handles initializing pages that were
-                        * partially initialized by the code below this
-                        * loop. It basically zeroes the part of the page
-                        * that sits on a hole and sets the page as P_HOLE
-                        * and calls remapf if it is a mapped file.
-                        */
-                       start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-                       ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-                       continue;
-               }
-
-               /*
-                * There are blocks we need to zero.
-                * Drop the inode lock while we're doing the I/O.
-                * We'll still have the iolock to protect us.
-                */
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-               zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
-               zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
-
-               if ((zero_off + zero_len) > offset)
-                       zero_len = offset - zero_off;
-
-               error = xfs_iozero(ip, zero_off, zero_len);
-               if (error) {
-                       goto out_lock;
-               }
-
-               start_zero_fsb = imap.br_startoff + imap.br_blockcount;
-               ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
-
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-       }
-
-       return 0;
-
-out_lock:
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       ASSERT(error >= 0);
-       return error;
-}
-
-/*
- * Common pre-write limit and setup checks.
- *
- * Returns with iolock held according to @iolock.
- */
-STATIC ssize_t
-xfs_file_aio_write_checks(
-       struct file             *file,
-       loff_t                  *pos,
-       size_t                  *count,
-       int                     *iolock)
-{
-       struct inode            *inode = file->f_mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       xfs_fsize_t             new_size;
-       int                     error = 0;
-
-       error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
-       if (error) {
-               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
-               *iolock = 0;
-               return error;
-       }
-
-       new_size = *pos + *count;
-       if (new_size > ip->i_size)
-               ip->i_new_size = new_size;
-
-       if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-               file_update_time(file);
-
-       /*
-        * If the offset is beyond the size of the file, we need to zero any
-        * blocks that fall between the existing EOF and the start of this
-        * write.
-        */
-       if (*pos > ip->i_size)
-               error = -xfs_zero_eof(ip, *pos, ip->i_size);
-
-       xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
-       if (error)
-               return error;
-
-       /*
-        * If we're writing the file then make sure to clear the setuid and
-        * setgid bits if the process is not being run by root.  This keeps
-        * people from modifying setuid and setgid binaries.
-        */
-       return file_remove_suid(file);
-
-}
-
-/*
- * xfs_file_dio_aio_write - handle direct IO writes
- *
- * Lock the inode appropriately to prepare for and issue a direct IO write.
- * By separating it from the buffered write path we remove all the tricky to
- * follow locking changes and looping.
- *
- * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
- * until we're sure the bytes at the new EOF have been zeroed and/or the cached
- * pages are flushed out.
- *
- * In most cases the direct IO writes will be done holding IOLOCK_SHARED
- * allowing them to be done in parallel with reads and other direct IO writes.
- * However, if the IO is not aligned to filesystem blocks, the direct IO layer
- * needs to do sub-block zeroing and that requires serialisation against other
- * direct IOs to the same block. In this case we need to serialise the
- * submission of the unaligned IOs so that we don't get racing block zeroing in
- * the dio layer.  To avoid the problem with aio, we also need to wait for
- * outstanding IOs to complete so that unwritten extent conversion is completed
- * before we try to map the overlapping block. This is currently implemented by
- * hitting it with a big hammer (i.e. xfs_ioend_wait()).
- *
- * Returns with locks held indicated by @iolock and errors indicated by
- * negative return values.
- */
-STATIC ssize_t
-xfs_file_dio_aio_write(
-       struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos,
-       size_t                  ocount,
-       int                     *iolock)
-{
-       struct file             *file = iocb->ki_filp;
-       struct address_space    *mapping = file->f_mapping;
-       struct inode            *inode = mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       ssize_t                 ret = 0;
-       size_t                  count = ocount;
-       int                     unaligned_io = 0;
-       struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
-                                       mp->m_rtdev_targp : mp->m_ddev_targp;
-
-       *iolock = 0;
-       if ((pos & target->bt_smask) || (count & target->bt_smask))
-               return -XFS_ERROR(EINVAL);
-
-       if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
-               unaligned_io = 1;
-
-       if (unaligned_io || mapping->nrpages || pos > ip->i_size)
-               *iolock = XFS_IOLOCK_EXCL;
-       else
-               *iolock = XFS_IOLOCK_SHARED;
-       xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
-
-       ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
-       if (ret)
-               return ret;
-
-       if (mapping->nrpages) {
-               WARN_ON(*iolock != XFS_IOLOCK_EXCL);
-               ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
-                                                       FI_REMAPF_LOCKED);
-               if (ret)
-                       return ret;
-       }
-
-       /*
-        * If we are doing unaligned IO, wait for all other IO to drain,
-        * otherwise demote the lock if we had to flush cached pages
-        */
-       if (unaligned_io)
-               xfs_ioend_wait(ip);
-       else if (*iolock == XFS_IOLOCK_EXCL) {
-               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
-               *iolock = XFS_IOLOCK_SHARED;
-       }
-
-       trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
-       ret = generic_file_direct_write(iocb, iovp,
-                       &nr_segs, pos, &iocb->ki_pos, count, ocount);
-
-       /* No fallback to buffered IO on errors for XFS. */
-       ASSERT(ret < 0 || ret == count);
-       return ret;
-}
-
-STATIC ssize_t
-xfs_file_buffered_aio_write(
-       struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos,
-       size_t                  ocount,
-       int                     *iolock)
-{
-       struct file             *file = iocb->ki_filp;
-       struct address_space    *mapping = file->f_mapping;
-       struct inode            *inode = mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       ssize_t                 ret;
-       int                     enospc = 0;
-       size_t                  count = ocount;
-
-       *iolock = XFS_IOLOCK_EXCL;
-       xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
-
-       ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
-       if (ret)
-               return ret;
-
-       /* We can write back this queue in page reclaim */
-       current->backing_dev_info = mapping->backing_dev_info;
-
-write_retry:
-       trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
-       ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-                       pos, &iocb->ki_pos, count, ret);
-       /*
-        * if we just got an ENOSPC, flush the inode now we aren't holding any
-        * page locks and retry *once*
-        */
-       if (ret == -ENOSPC && !enospc) {
-               ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-               if (ret)
-                       return ret;
-               enospc = 1;
-               goto write_retry;
-       }
-       current->backing_dev_info = NULL;
-       return ret;
-}
-
-STATIC ssize_t
-xfs_file_aio_write(
-       struct kiocb            *iocb,
-       const struct iovec      *iovp,
-       unsigned long           nr_segs,
-       loff_t                  pos)
-{
-       struct file             *file = iocb->ki_filp;
-       struct address_space    *mapping = file->f_mapping;
-       struct inode            *inode = mapping->host;
-       struct xfs_inode        *ip = XFS_I(inode);
-       ssize_t                 ret;
-       int                     iolock;
-       size_t                  ocount = 0;
-
-       XFS_STATS_INC(xs_write_calls);
-
-       BUG_ON(iocb->ki_pos != pos);
-
-       ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
-       if (ret)
-               return ret;
-
-       if (ocount == 0)
-               return 0;
-
-       xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
-
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return -EIO;
-
-       if (unlikely(file->f_flags & O_DIRECT))
-               ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
-                                               ocount, &iolock);
-       else
-               ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
-                                               ocount, &iolock);
-
-       xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
-
-       if (ret <= 0)
-               goto out_unlock;
-
-       /* Handle various SYNC-type writes */
-       if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-               loff_t end = pos + ret - 1;
-               int error;
-
-               xfs_rw_iunlock(ip, iolock);
-               error = xfs_file_fsync(file, pos, end,
-                                     (file->f_flags & __O_SYNC) ? 0 : 1);
-               xfs_rw_ilock(ip, iolock);
-               if (error)
-                       ret = error;
-       }
-
-out_unlock:
-       xfs_aio_write_newsize_update(ip);
-       xfs_rw_iunlock(ip, iolock);
-       return ret;
-}
-
-STATIC long
-xfs_file_fallocate(
-       struct file     *file,
-       int             mode,
-       loff_t          offset,
-       loff_t          len)
-{
-       struct inode    *inode = file->f_path.dentry->d_inode;
-       long            error;
-       loff_t          new_size = 0;
-       xfs_flock64_t   bf;
-       xfs_inode_t     *ip = XFS_I(inode);
-       int             cmd = XFS_IOC_RESVSP;
-       int             attr_flags = XFS_ATTR_NOLOCK;
-
-       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
-               return -EOPNOTSUPP;
-
-       bf.l_whence = 0;
-       bf.l_start = offset;
-       bf.l_len = len;
-
-       xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-       if (mode & FALLOC_FL_PUNCH_HOLE)
-               cmd = XFS_IOC_UNRESVSP;
-
-       /* check the new inode size is valid before allocating */
-       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-           offset + len > i_size_read(inode)) {
-               new_size = offset + len;
-               error = inode_newsize_ok(inode, new_size);
-               if (error)
-                       goto out_unlock;
-       }
-
-       if (file->f_flags & O_DSYNC)
-               attr_flags |= XFS_ATTR_SYNC;
-
-       error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
-       if (error)
-               goto out_unlock;
-
-       /* Change file size if needed */
-       if (new_size) {
-               struct iattr iattr;
-
-               iattr.ia_valid = ATTR_SIZE;
-               iattr.ia_size = new_size;
-               error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK);
-       }
-
-out_unlock:
-       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-       return error;
-}
-
-
-STATIC int
-xfs_file_open(
-       struct inode    *inode,
-       struct file     *file)
-{
-       if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
-               return -EFBIG;
-       if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
-               return -EIO;
-       return 0;
-}
-
-STATIC int
-xfs_dir_open(
-       struct inode    *inode,
-       struct file     *file)
-{
-       struct xfs_inode *ip = XFS_I(inode);
-       int             mode;
-       int             error;
-
-       error = xfs_file_open(inode, file);
-       if (error)
-               return error;
-
-       /*
-        * If there are any blocks, read-ahead block 0 as we're almost
-        * certain to have the next operation be a read there.
-        */
-       mode = xfs_ilock_map_shared(ip);
-       if (ip->i_d.di_nextents > 0)
-               xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
-       xfs_iunlock(ip, mode);
-       return 0;
-}
-
-STATIC int
-xfs_file_release(
-       struct inode    *inode,
-       struct file     *filp)
-{
-       return -xfs_release(XFS_I(inode));
-}
-
-STATIC int
-xfs_file_readdir(
-       struct file     *filp,
-       void            *dirent,
-       filldir_t       filldir)
-{
-       struct inode    *inode = filp->f_path.dentry->d_inode;
-       xfs_inode_t     *ip = XFS_I(inode);
-       int             error;
-       size_t          bufsize;
-
-       /*
-        * The Linux API doesn't pass down the total size of the buffer
-        * we read into down to the filesystem.  With the filldir concept
-        * it's not needed for correct information, but the XFS dir2 leaf
-        * code wants an estimate of the buffer size to calculate it's
-        * readahead window and size the buffers used for mapping to
-        * physical blocks.
-        *
-        * Try to give it an estimate that's good enough, maybe at some
-        * point we can change the ->readdir prototype to include the
-        * buffer size.  For now we use the current glibc buffer size.
-        */
-       bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
-
-       error = xfs_readdir(ip, dirent, bufsize,
-                               (xfs_off_t *)&filp->f_pos, filldir);
-       if (error)
-               return -error;
-       return 0;
-}
-
-STATIC int
-xfs_file_mmap(
-       struct file     *filp,
-       struct vm_area_struct *vma)
-{
-       vma->vm_ops = &xfs_file_vm_ops;
-       vma->vm_flags |= VM_CAN_NONLINEAR;
-
-       file_accessed(filp);
-       return 0;
-}
-
-/*
- * mmap()d file has taken write protection fault and is being made
- * writable. We can set the page state up correctly for a writable
- * page, which means we can do correct delalloc accounting (ENOSPC
- * checking!) and unwritten extent mapping.
- */
-STATIC int
-xfs_vm_page_mkwrite(
-       struct vm_area_struct   *vma,
-       struct vm_fault         *vmf)
-{
-       return block_page_mkwrite(vma, vmf, xfs_get_blocks);
-}
-
-const struct file_operations xfs_file_operations = {
-       .llseek         = generic_file_llseek,
-       .read           = do_sync_read,
-       .write          = do_sync_write,
-       .aio_read       = xfs_file_aio_read,
-       .aio_write      = xfs_file_aio_write,
-       .splice_read    = xfs_file_splice_read,
-       .splice_write   = xfs_file_splice_write,
-       .unlocked_ioctl = xfs_file_ioctl,
-#ifdef CONFIG_COMPAT
-       .compat_ioctl   = xfs_file_compat_ioctl,
-#endif
-       .mmap           = xfs_file_mmap,
-       .open           = xfs_file_open,
-       .release        = xfs_file_release,
-       .fsync          = xfs_file_fsync,
-       .fallocate      = xfs_file_fallocate,
-};
-
-const struct file_operations xfs_dir_file_operations = {
-       .open           = xfs_dir_open,
-       .read           = generic_read_dir,
-       .readdir        = xfs_file_readdir,
-       .llseek         = generic_file_llseek,
-       .unlocked_ioctl = xfs_file_ioctl,
-#ifdef CONFIG_COMPAT
-       .compat_ioctl   = xfs_file_compat_ioctl,
-#endif
-       .fsync          = xfs_file_fsync,
-};
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-       .fault          = filemap_fault,
-       .page_mkwrite   = xfs_vm_page_mkwrite,
-};
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
deleted file mode 100644 (file)
index ed88ed1..0000000
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trace.h"
-
-/*
- * note: all filemap functions return negative error codes. These
- * need to be inverted before returning to the xfs core functions.
- */
-void
-xfs_tosspages(
-       xfs_inode_t     *ip,
-       xfs_off_t       first,
-       xfs_off_t       last,
-       int             fiopt)
-{
-       /* can't toss partial tail pages, so mask them out */
-       last &= ~(PAGE_SIZE - 1);
-       truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
-}
-
-int
-xfs_flushinval_pages(
-       xfs_inode_t     *ip,
-       xfs_off_t       first,
-       xfs_off_t       last,
-       int             fiopt)
-{
-       struct address_space *mapping = VFS_I(ip)->i_mapping;
-       int             ret = 0;
-
-       trace_xfs_pagecache_inval(ip, first, last);
-
-       xfs_iflags_clear(ip, XFS_ITRUNCATED);
-       ret = filemap_write_and_wait_range(mapping, first,
-                               last == -1 ? LLONG_MAX : last);
-       if (!ret)
-               truncate_inode_pages_range(mapping, first, last);
-       return -ret;
-}
-
-int
-xfs_flush_pages(
-       xfs_inode_t     *ip,
-       xfs_off_t       first,
-       xfs_off_t       last,
-       uint64_t        flags,
-       int             fiopt)
-{
-       struct address_space *mapping = VFS_I(ip)->i_mapping;
-       int             ret = 0;
-       int             ret2;
-
-       xfs_iflags_clear(ip, XFS_ITRUNCATED);
-       ret = -filemap_fdatawrite_range(mapping, first,
-                               last == -1 ? LLONG_MAX : last);
-       if (flags & XBF_ASYNC)
-               return ret;
-       ret2 = xfs_wait_on_pages(ip, first, last);
-       if (!ret)
-               ret = ret2;
-       return ret;
-}
-
-int
-xfs_wait_on_pages(
-       xfs_inode_t     *ip,
-       xfs_off_t       first,
-       xfs_off_t       last)
-{
-       struct address_space *mapping = VFS_I(ip)->i_mapping;
-
-       if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
-               return -filemap_fdatawait_range(mapping, first,
-                                       last == -1 ? ip->i_size - 1 : last);
-       }
-       return 0;
-}
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
deleted file mode 100644 (file)
index 76e81cf..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_sysctl.h"
-
-/*
- * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
- * other XFS code uses these values.  Times are measured in centisecs (i.e.
- * 100ths of a second).
- */
-xfs_param_t xfs_params = {
-                         /*    MIN             DFLT            MAX     */
-       .sgid_inherit   = {     0,              0,              1       },
-       .symlink_mode   = {     0,              0,              1       },
-       .panic_mask     = {     0,              0,              255     },
-       .error_level    = {     0,              3,              11      },
-       .syncd_timer    = {     1*100,          30*100,         7200*100},
-       .stats_clear    = {     0,              0,              1       },
-       .inherit_sync   = {     0,              1,              1       },
-       .inherit_nodump = {     0,              1,              1       },
-       .inherit_noatim = {     0,              1,              1       },
-       .xfs_buf_timer  = {     100/2,          1*100,          30*100  },
-       .xfs_buf_age    = {     1*100,          15*100,         7200*100},
-       .inherit_nosym  = {     0,              0,              1       },
-       .rotorstep      = {     1,              1,              255     },
-       .inherit_nodfrg = {     0,              1,              1       },
-       .fstrm_timer    = {     1,              30*100,         3600*100},
-};
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
deleted file mode 100644 (file)
index f7ce7de..0000000
+++ /dev/null
@@ -1,1556 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_ioctl.h"
-#include "xfs_rtalloc.h"
-#include "xfs_itable.h"
-#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_bmap.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_dfrag.h"
-#include "xfs_fsops.h"
-#include "xfs_vnodeops.h"
-#include "xfs_discard.h"
-#include "xfs_quota.h"
-#include "xfs_inode_item.h"
-#include "xfs_export.h"
-#include "xfs_trace.h"
-
-#include <linux/capability.h>
-#include <linux/dcache.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/pagemap.h>
-#include <linux/slab.h>
-#include <linux/exportfs.h>
-
-/*
- * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
- * a file or fs handle.
- *
- * XFS_IOC_PATH_TO_FSHANDLE
- *    returns fs handle for a mount point or path within that mount point
- * XFS_IOC_FD_TO_HANDLE
- *    returns full handle for a FD opened in user space
- * XFS_IOC_PATH_TO_HANDLE
- *    returns full handle for a path
- */
-int
-xfs_find_handle(
-       unsigned int            cmd,
-       xfs_fsop_handlereq_t    *hreq)
-{
-       int                     hsize;
-       xfs_handle_t            handle;
-       struct inode            *inode;
-       struct file             *file = NULL;
-       struct path             path;
-       int                     error;
-       struct xfs_inode        *ip;
-
-       if (cmd == XFS_IOC_FD_TO_HANDLE) {
-               file = fget(hreq->fd);
-               if (!file)
-                       return -EBADF;
-               inode = file->f_path.dentry->d_inode;
-       } else {
-               error = user_lpath((const char __user *)hreq->path, &path);
-               if (error)
-                       return error;
-               inode = path.dentry->d_inode;
-       }
-       ip = XFS_I(inode);
-
-       /*
-        * We can only generate handles for inodes residing on a XFS filesystem,
-        * and only for regular files, directories or symbolic links.
-        */
-       error = -EINVAL;
-       if (inode->i_sb->s_magic != XFS_SB_MAGIC)
-               goto out_put;
-
-       error = -EBADF;
-       if (!S_ISREG(inode->i_mode) &&
-           !S_ISDIR(inode->i_mode) &&
-           !S_ISLNK(inode->i_mode))
-               goto out_put;
-
-
-       memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
-
-       if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
-               /*
-                * This handle only contains an fsid, zero the rest.
-                */
-               memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
-               hsize = sizeof(xfs_fsid_t);
-       } else {
-               int             lock_mode;
-
-               lock_mode = xfs_ilock_map_shared(ip);
-               handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
-                                       sizeof(handle.ha_fid.fid_len);
-               handle.ha_fid.fid_pad = 0;
-               handle.ha_fid.fid_gen = ip->i_d.di_gen;
-               handle.ha_fid.fid_ino = ip->i_ino;
-               xfs_iunlock_map_shared(ip, lock_mode);
-
-               hsize = XFS_HSIZE(handle);
-       }
-
-       error = -EFAULT;
-       if (copy_to_user(hreq->ohandle, &handle, hsize) ||
-           copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
-               goto out_put;
-
-       error = 0;
-
- out_put:
-       if (cmd == XFS_IOC_FD_TO_HANDLE)
-               fput(file);
-       else
-               path_put(&path);
-       return error;
-}
-
-/*
- * No need to do permission checks on the various pathname components
- * as the handle operations are privileged.
- */
-STATIC int
-xfs_handle_acceptable(
-       void                    *context,
-       struct dentry           *dentry)
-{
-       return 1;
-}
-
-/*
- * Convert userspace handle data into a dentry.
- */
-struct dentry *
-xfs_handle_to_dentry(
-       struct file             *parfilp,
-       void __user             *uhandle,
-       u32                     hlen)
-{
-       xfs_handle_t            handle;
-       struct xfs_fid64        fid;
-
-       /*
-        * Only allow handle opens under a directory.
-        */
-       if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
-               return ERR_PTR(-ENOTDIR);
-
-       if (hlen != sizeof(xfs_handle_t))
-               return ERR_PTR(-EINVAL);
-       if (copy_from_user(&handle, uhandle, hlen))
-               return ERR_PTR(-EFAULT);
-       if (handle.ha_fid.fid_len !=
-           sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
-               return ERR_PTR(-EINVAL);
-
-       memset(&fid, 0, sizeof(struct fid));
-       fid.ino = handle.ha_fid.fid_ino;
-       fid.gen = handle.ha_fid.fid_gen;
-
-       return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
-                       FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
-                       xfs_handle_acceptable, NULL);
-}
-
-STATIC struct dentry *
-xfs_handlereq_to_dentry(
-       struct file             *parfilp,
-       xfs_fsop_handlereq_t    *hreq)
-{
-       return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
-}
-
-int
-xfs_open_by_handle(
-       struct file             *parfilp,
-       xfs_fsop_handlereq_t    *hreq)
-{
-       const struct cred       *cred = current_cred();
-       int                     error;
-       int                     fd;
-       int                     permflag;
-       struct file             *filp;
-       struct inode            *inode;
-       struct dentry           *dentry;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
-
-       dentry = xfs_handlereq_to_dentry(parfilp, hreq);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
-       inode = dentry->d_inode;
-
-       /* Restrict xfs_open_by_handle to directories & regular files. */
-       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
-               error = -XFS_ERROR(EPERM);
-               goto out_dput;
-       }
-
-#if BITS_PER_LONG != 32
-       hreq->oflags |= O_LARGEFILE;
-#endif
-
-       /* Put open permission in namei format. */
-       permflag = hreq->oflags;
-       if ((permflag+1) & O_ACCMODE)
-               permflag++;
-       if (permflag & O_TRUNC)
-               permflag |= 2;
-
-       if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
-           (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
-               error = -XFS_ERROR(EPERM);
-               goto out_dput;
-       }
-
-       if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
-               error = -XFS_ERROR(EACCES);
-               goto out_dput;
-       }
-
-       /* Can't write directories. */
-       if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
-               error = -XFS_ERROR(EISDIR);
-               goto out_dput;
-       }
-
-       fd = get_unused_fd();
-       if (fd < 0) {
-               error = fd;
-               goto out_dput;
-       }
-
-       filp = dentry_open(dentry, mntget(parfilp->f_path.mnt),
-                          hreq->oflags, cred);
-       if (IS_ERR(filp)) {
-               put_unused_fd(fd);
-               return PTR_ERR(filp);
-       }
-
-       if (S_ISREG(inode->i_mode)) {
-               filp->f_flags |= O_NOATIME;
-               filp->f_mode |= FMODE_NOCMTIME;
-       }
-
-       fd_install(fd, filp);
-       return fd;
-
- out_dput:
-       dput(dentry);
-       return error;
-}
-
-/*
- * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
- * unused first argument.
- */
-STATIC int
-do_readlink(
-       char __user             *buffer,
-       int                     buflen,
-       const char              *link)
-{
-        int len;
-
-       len = PTR_ERR(link);
-       if (IS_ERR(link))
-               goto out;
-
-       len = strlen(link);
-       if (len > (unsigned) buflen)
-               len = buflen;
-       if (copy_to_user(buffer, link, len))
-               len = -EFAULT;
- out:
-       return len;
-}
-
-
-int
-xfs_readlink_by_handle(
-       struct file             *parfilp,
-       xfs_fsop_handlereq_t    *hreq)
-{
-       struct dentry           *dentry;
-       __u32                   olen;
-       void                    *link;
-       int                     error;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
-
-       dentry = xfs_handlereq_to_dentry(parfilp, hreq);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
-
-       /* Restrict this handle operation to symlinks only. */
-       if (!S_ISLNK(dentry->d_inode->i_mode)) {
-               error = -XFS_ERROR(EINVAL);
-               goto out_dput;
-       }
-
-       if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
-               error = -XFS_ERROR(EFAULT);
-               goto out_dput;
-       }
-
-       link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
-       if (!link) {
-               error = -XFS_ERROR(ENOMEM);
-               goto out_dput;
-       }
-
-       error = -xfs_readlink(XFS_I(dentry->d_inode), link);
-       if (error)
-               goto out_kfree;
-       error = do_readlink(hreq->ohandle, olen, link);
-       if (error)
-               goto out_kfree;
-
- out_kfree:
-       kfree(link);
- out_dput:
-       dput(dentry);
-       return error;
-}
-
-STATIC int
-xfs_fssetdm_by_handle(
-       struct file             *parfilp,
-       void                    __user *arg)
-{
-       int                     error;
-       struct fsdmidata        fsd;
-       xfs_fsop_setdm_handlereq_t dmhreq;
-       struct dentry           *dentry;
-
-       if (!capable(CAP_MKNOD))
-               return -XFS_ERROR(EPERM);
-       if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
-
-       dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
-
-       if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
-               error = -XFS_ERROR(EPERM);
-               goto out;
-       }
-
-       if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
-               error = -XFS_ERROR(EFAULT);
-               goto out;
-       }
-
-       error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
-                                fsd.fsd_dmstate);
-
- out:
-       dput(dentry);
-       return error;
-}
-
-STATIC int
-xfs_attrlist_by_handle(
-       struct file             *parfilp,
-       void                    __user *arg)
-{
-       int                     error = -ENOMEM;
-       attrlist_cursor_kern_t  *cursor;
-       xfs_fsop_attrlist_handlereq_t al_hreq;
-       struct dentry           *dentry;
-       char                    *kbuf;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
-       if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
-       if (al_hreq.buflen > XATTR_LIST_MAX)
-               return -XFS_ERROR(EINVAL);
-
-       /*
-        * Reject flags, only allow namespaces.
-        */
-       if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
-               return -XFS_ERROR(EINVAL);
-
-       dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
-
-       kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
-       if (!kbuf)
-               goto out_dput;
-
-       cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-       error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
-                                       al_hreq.flags, cursor);
-       if (error)
-               goto out_kfree;
-
-       if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
-               error = -EFAULT;
-
- out_kfree:
-       kfree(kbuf);
- out_dput:
-       dput(dentry);
-       return error;
-}
-
-int
-xfs_attrmulti_attr_get(
-       struct inode            *inode,
-       unsigned char           *name,
-       unsigned char           __user *ubuf,
-       __uint32_t              *len,
-       __uint32_t              flags)
-{
-       unsigned char           *kbuf;
-       int                     error = EFAULT;
-
-       if (*len > XATTR_SIZE_MAX)
-               return EINVAL;
-       kbuf = kmalloc(*len, GFP_KERNEL);
-       if (!kbuf)
-               return ENOMEM;
-
-       error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
-       if (error)
-               goto out_kfree;
-
-       if (copy_to_user(ubuf, kbuf, *len))
-               error = EFAULT;
-
- out_kfree:
-       kfree(kbuf);
-       return error;
-}
-
-int
-xfs_attrmulti_attr_set(
-       struct inode            *inode,
-       unsigned char           *name,
-       const unsigned char     __user *ubuf,
-       __uint32_t              len,
-       __uint32_t              flags)
-{
-       unsigned char           *kbuf;
-       int                     error = EFAULT;
-
-       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-               return EPERM;
-       if (len > XATTR_SIZE_MAX)
-               return EINVAL;
-
-       kbuf = memdup_user(ubuf, len);
-       if (IS_ERR(kbuf))
-               return PTR_ERR(kbuf);
-
-       error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
-
-       return error;
-}
-
-int
-xfs_attrmulti_attr_remove(
-       struct inode            *inode,
-       unsigned char           *name,
-       __uint32_t              flags)
-{
-       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
-               return EPERM;
-       return xfs_attr_remove(XFS_I(inode), name, flags);
-}
-
-STATIC int
-xfs_attrmulti_by_handle(
-       struct file             *parfilp,
-       void                    __user *arg)
-{
-       int                     error;
-       xfs_attr_multiop_t      *ops;
-       xfs_fsop_attrmulti_handlereq_t am_hreq;
-       struct dentry           *dentry;
-       unsigned int            i, size;
-       unsigned char           *attr_name;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
-       if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
-
-       /* overflow check */
-       if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
-               return -E2BIG;
-
-       dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
-
-       error = E2BIG;
-       size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
-       if (!size || size > 16 * PAGE_SIZE)
-               goto out_dput;
-
-       ops = memdup_user(am_hreq.ops, size);
-       if (IS_ERR(ops)) {
-               error = PTR_ERR(ops);
-               goto out_dput;
-       }
-
-       attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
-       if (!attr_name)
-               goto out_kfree_ops;
-
-       error = 0;
-       for (i = 0; i < am_hreq.opcount; i++) {
-               ops[i].am_error = strncpy_from_user((char *)attr_name,
-                               ops[i].am_attrname, MAXNAMELEN);
-               if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
-                       error = -ERANGE;
-               if (ops[i].am_error < 0)
-                       break;
-
-               switch (ops[i].am_opcode) {
-               case ATTR_OP_GET:
-                       ops[i].am_error = xfs_attrmulti_attr_get(
-                                       dentry->d_inode, attr_name,
-                                       ops[i].am_attrvalue, &ops[i].am_length,
-                                       ops[i].am_flags);
-                       break;
-               case ATTR_OP_SET:
-                       ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
-                       if (ops[i].am_error)
-                               break;
-                       ops[i].am_error = xfs_attrmulti_attr_set(
-                                       dentry->d_inode, attr_name,
-                                       ops[i].am_attrvalue, ops[i].am_length,
-                                       ops[i].am_flags);
-                       mnt_drop_write(parfilp->f_path.mnt);
-                       break;
-               case ATTR_OP_REMOVE:
-                       ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
-                       if (ops[i].am_error)
-                               break;
-                       ops[i].am_error = xfs_attrmulti_attr_remove(
-                                       dentry->d_inode, attr_name,
-                                       ops[i].am_flags);
-                       mnt_drop_write(parfilp->f_path.mnt);
-                       break;
-               default:
-                       ops[i].am_error = EINVAL;
-               }
-       }
-
-       if (copy_to_user(am_hreq.ops, ops, size))
-               error = XFS_ERROR(EFAULT);
-
-       kfree(attr_name);
- out_kfree_ops:
-       kfree(ops);
- out_dput:
-       dput(dentry);
-       return -error;
-}
-
-int
-xfs_ioc_space(
-       struct xfs_inode        *ip,
-       struct inode            *inode,
-       struct file             *filp,
-       int                     ioflags,
-       unsigned int            cmd,
-       xfs_flock64_t           *bf)
-{
-       int                     attr_flags = 0;
-       int                     error;
-
-       /*
-        * Only allow the sys admin to reserve space unless
-        * unwritten extents are enabled.
-        */
-       if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
-           !capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
-
-       if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
-               return -XFS_ERROR(EPERM);
-
-       if (!(filp->f_mode & FMODE_WRITE))
-               return -XFS_ERROR(EBADF);
-
-       if (!S_ISREG(inode->i_mode))
-               return -XFS_ERROR(EINVAL);
-
-       if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-               attr_flags |= XFS_ATTR_NONBLOCK;
-
-       if (filp->f_flags & O_DSYNC)
-               attr_flags |= XFS_ATTR_SYNC;
-
-       if (ioflags & IO_INVIS)
-               attr_flags |= XFS_ATTR_DMI;
-
-       error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
-       return -error;
-}
-
-STATIC int
-xfs_ioc_bulkstat(
-       xfs_mount_t             *mp,
-       unsigned int            cmd,
-       void                    __user *arg)
-{
-       xfs_fsop_bulkreq_t      bulkreq;
-       int                     count;  /* # of records returned */
-       xfs_ino_t               inlast; /* last inode number */
-       int                     done;
-       int                     error;
-
-       /* done = 1 if there are more stats to get and if bulkstat */
-       /* should be called again (unused here, but used in dmapi) */
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -EPERM;
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
-
-       if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
-               return -XFS_ERROR(EFAULT);
-
-       if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
-               return -XFS_ERROR(EFAULT);
-
-       if ((count = bulkreq.icount) <= 0)
-               return -XFS_ERROR(EINVAL);
-
-       if (bulkreq.ubuffer == NULL)
-               return -XFS_ERROR(EINVAL);
-
-       if (cmd == XFS_IOC_FSINUMBERS)
-               error = xfs_inumbers(mp, &inlast, &count,
-                                       bulkreq.ubuffer, xfs_inumbers_fmt);
-       else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
-               error = xfs_bulkstat_single(mp, &inlast,
-                                               bulkreq.ubuffer, &done);
-       else    /* XFS_IOC_FSBULKSTAT */
-               error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
-                                    sizeof(xfs_bstat_t), bulkreq.ubuffer,
-                                    &done);
-
-       if (error)
-               return -error;
-
-       if (bulkreq.ocount != NULL) {
-               if (copy_to_user(bulkreq.lastip, &inlast,
-                                               sizeof(xfs_ino_t)))
-                       return -XFS_ERROR(EFAULT);
-
-               if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-                       return -XFS_ERROR(EFAULT);
-       }
-
-       return 0;
-}
-
-STATIC int
-xfs_ioc_fsgeometry_v1(
-       xfs_mount_t             *mp,
-       void                    __user *arg)
-{
-       xfs_fsop_geom_t         fsgeo;
-       int                     error;
-
-       error = xfs_fs_geometry(mp, &fsgeo, 3);
-       if (error)
-               return -error;
-
-       /*
-        * Caller should have passed an argument of type
-        * xfs_fsop_geom_v1_t.  This is a proper subset of the
-        * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
-        */
-       if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-STATIC int
-xfs_ioc_fsgeometry(
-       xfs_mount_t             *mp,
-       void                    __user *arg)
-{
-       xfs_fsop_geom_t         fsgeo;
-       int                     error;
-
-       error = xfs_fs_geometry(mp, &fsgeo, 4);
-       if (error)
-               return -error;
-
-       if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-/*
- * Linux extended inode flags interface.
- */
-
-STATIC unsigned int
-xfs_merge_ioc_xflags(
-       unsigned int    flags,
-       unsigned int    start)
-{
-       unsigned int    xflags = start;
-
-       if (flags & FS_IMMUTABLE_FL)
-               xflags |= XFS_XFLAG_IMMUTABLE;
-       else
-               xflags &= ~XFS_XFLAG_IMMUTABLE;
-       if (flags & FS_APPEND_FL)
-               xflags |= XFS_XFLAG_APPEND;
-       else
-               xflags &= ~XFS_XFLAG_APPEND;
-       if (flags & FS_SYNC_FL)
-               xflags |= XFS_XFLAG_SYNC;
-       else
-               xflags &= ~XFS_XFLAG_SYNC;
-       if (flags & FS_NOATIME_FL)
-               xflags |= XFS_XFLAG_NOATIME;
-       else
-               xflags &= ~XFS_XFLAG_NOATIME;
-       if (flags & FS_NODUMP_FL)
-               xflags |= XFS_XFLAG_NODUMP;
-       else
-               xflags &= ~XFS_XFLAG_NODUMP;
-
-       return xflags;
-}
-
-STATIC unsigned int
-xfs_di2lxflags(
-       __uint16_t      di_flags)
-{
-       unsigned int    flags = 0;
-
-       if (di_flags & XFS_DIFLAG_IMMUTABLE)
-               flags |= FS_IMMUTABLE_FL;
-       if (di_flags & XFS_DIFLAG_APPEND)
-               flags |= FS_APPEND_FL;
-       if (di_flags & XFS_DIFLAG_SYNC)
-               flags |= FS_SYNC_FL;
-       if (di_flags & XFS_DIFLAG_NOATIME)
-               flags |= FS_NOATIME_FL;
-       if (di_flags & XFS_DIFLAG_NODUMP)
-               flags |= FS_NODUMP_FL;
-       return flags;
-}
-
-STATIC int
-xfs_ioc_fsgetxattr(
-       xfs_inode_t             *ip,
-       int                     attr,
-       void                    __user *arg)
-{
-       struct fsxattr          fa;
-
-       memset(&fa, 0, sizeof(struct fsxattr));
-
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       fa.fsx_xflags = xfs_ip2xflags(ip);
-       fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
-       fa.fsx_projid = xfs_get_projid(ip);
-
-       if (attr) {
-               if (ip->i_afp) {
-                       if (ip->i_afp->if_flags & XFS_IFEXTENTS)
-                               fa.fsx_nextents = ip->i_afp->if_bytes /
-                                                       sizeof(xfs_bmbt_rec_t);
-                       else
-                               fa.fsx_nextents = ip->i_d.di_anextents;
-               } else
-                       fa.fsx_nextents = 0;
-       } else {
-               if (ip->i_df.if_flags & XFS_IFEXTENTS)
-                       fa.fsx_nextents = ip->i_df.if_bytes /
-                                               sizeof(xfs_bmbt_rec_t);
-               else
-                       fa.fsx_nextents = ip->i_d.di_nextents;
-       }
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-       if (copy_to_user(arg, &fa, sizeof(fa)))
-               return -EFAULT;
-       return 0;
-}
-
-STATIC void
-xfs_set_diflags(
-       struct xfs_inode        *ip,
-       unsigned int            xflags)
-{
-       unsigned int            di_flags;
-
-       /* can't set PREALLOC this way, just preserve it */
-       di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
-       if (xflags & XFS_XFLAG_IMMUTABLE)
-               di_flags |= XFS_DIFLAG_IMMUTABLE;
-       if (xflags & XFS_XFLAG_APPEND)
-               di_flags |= XFS_DIFLAG_APPEND;
-       if (xflags & XFS_XFLAG_SYNC)
-               di_flags |= XFS_DIFLAG_SYNC;
-       if (xflags & XFS_XFLAG_NOATIME)
-               di_flags |= XFS_DIFLAG_NOATIME;
-       if (xflags & XFS_XFLAG_NODUMP)
-               di_flags |= XFS_DIFLAG_NODUMP;
-       if (xflags & XFS_XFLAG_PROJINHERIT)
-               di_flags |= XFS_DIFLAG_PROJINHERIT;
-       if (xflags & XFS_XFLAG_NODEFRAG)
-               di_flags |= XFS_DIFLAG_NODEFRAG;
-       if (xflags & XFS_XFLAG_FILESTREAM)
-               di_flags |= XFS_DIFLAG_FILESTREAM;
-       if (S_ISDIR(ip->i_d.di_mode)) {
-               if (xflags & XFS_XFLAG_RTINHERIT)
-                       di_flags |= XFS_DIFLAG_RTINHERIT;
-               if (xflags & XFS_XFLAG_NOSYMLINKS)
-                       di_flags |= XFS_DIFLAG_NOSYMLINKS;
-               if (xflags & XFS_XFLAG_EXTSZINHERIT)
-                       di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-       } else if (S_ISREG(ip->i_d.di_mode)) {
-               if (xflags & XFS_XFLAG_REALTIME)
-                       di_flags |= XFS_DIFLAG_REALTIME;
-               if (xflags & XFS_XFLAG_EXTSIZE)
-                       di_flags |= XFS_DIFLAG_EXTSIZE;
-       }
-
-       ip->i_d.di_flags = di_flags;
-}
-
-STATIC void
-xfs_diflags_to_linux(
-       struct xfs_inode        *ip)
-{
-       struct inode            *inode = VFS_I(ip);
-       unsigned int            xflags = xfs_ip2xflags(ip);
-
-       if (xflags & XFS_XFLAG_IMMUTABLE)
-               inode->i_flags |= S_IMMUTABLE;
-       else
-               inode->i_flags &= ~S_IMMUTABLE;
-       if (xflags & XFS_XFLAG_APPEND)
-               inode->i_flags |= S_APPEND;
-       else
-               inode->i_flags &= ~S_APPEND;
-       if (xflags & XFS_XFLAG_SYNC)
-               inode->i_flags |= S_SYNC;
-       else
-               inode->i_flags &= ~S_SYNC;
-       if (xflags & XFS_XFLAG_NOATIME)
-               inode->i_flags |= S_NOATIME;
-       else
-               inode->i_flags &= ~S_NOATIME;
-}
-
-#define FSX_PROJID     1
-#define FSX_EXTSIZE    2
-#define FSX_XFLAGS     4
-#define FSX_NONBLOCK   8
-
-STATIC int
-xfs_ioctl_setattr(
-       xfs_inode_t             *ip,
-       struct fsxattr          *fa,
-       int                     mask)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_trans        *tp;
-       unsigned int            lock_flags = 0;
-       struct xfs_dquot        *udqp = NULL;
-       struct xfs_dquot        *gdqp = NULL;
-       struct xfs_dquot        *olddquot = NULL;
-       int                     code;
-
-       trace_xfs_ioctl_setattr(ip);
-
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
-               return XFS_ERROR(EROFS);
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       /*
-        * Disallow 32bit project ids when projid32bit feature is not enabled.
-        */
-       if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
-                       !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
-               return XFS_ERROR(EINVAL);
-
-       /*
-        * If disk quotas is on, we make sure that the dquots do exist on disk,
-        * before we start any other transactions. Trying to do this later
-        * is messy. We don't care to take a readlock to look at the ids
-        * in inode here, because we can't hold it across the trans_reserve.
-        * If the IDs do change before we take the ilock, we're covered
-        * because the i_*dquot fields will get updated anyway.
-        */
-       if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
-               code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
-                                        ip->i_d.di_gid, fa->fsx_projid,
-                                        XFS_QMOPT_PQUOTA, &udqp, &gdqp);
-               if (code)
-                       return code;
-       }
-
-       /*
-        * For the other attributes, we acquire the inode lock and
-        * first do an error checking pass.
-        */
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
-       if (code)
-               goto error_return;
-
-       lock_flags = XFS_ILOCK_EXCL;
-       xfs_ilock(ip, lock_flags);
-
-       /*
-        * CAP_FOWNER overrides the following restrictions:
-        *
-        * The user ID of the calling process must be equal
-        * to the file owner ID, except in cases where the
-        * CAP_FSETID capability is applicable.
-        */
-       if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
-               code = XFS_ERROR(EPERM);
-               goto error_return;
-       }
-
-       /*
-        * Do a quota reservation only if projid is actually going to change.
-        */
-       if (mask & FSX_PROJID) {
-               if (XFS_IS_QUOTA_RUNNING(mp) &&
-                   XFS_IS_PQUOTA_ON(mp) &&
-                   xfs_get_projid(ip) != fa->fsx_projid) {
-                       ASSERT(tp);
-                       code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
-                                               capable(CAP_FOWNER) ?
-                                               XFS_QMOPT_FORCE_RES : 0);
-                       if (code)       /* out of quota */
-                               goto error_return;
-               }
-       }
-
-       if (mask & FSX_EXTSIZE) {
-               /*
-                * Can't change extent size if any extents are allocated.
-                */
-               if (ip->i_d.di_nextents &&
-                   ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
-                    fa->fsx_extsize)) {
-                       code = XFS_ERROR(EINVAL);       /* EFBIG? */
-                       goto error_return;
-               }
-
-               /*
-                * Extent size must be a multiple of the appropriate block
-                * size, if set at all. It must also be smaller than the
-                * maximum extent size supported by the filesystem.
-                *
-                * Also, for non-realtime files, limit the extent size hint to
-                * half the size of the AGs in the filesystem so alignment
-                * doesn't result in extents larger than an AG.
-                */
-               if (fa->fsx_extsize != 0) {
-                       xfs_extlen_t    size;
-                       xfs_fsblock_t   extsize_fsb;
-
-                       extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
-                       if (extsize_fsb > MAXEXTLEN) {
-                               code = XFS_ERROR(EINVAL);
-                               goto error_return;
-                       }
-
-                       if (XFS_IS_REALTIME_INODE(ip) ||
-                           ((mask & FSX_XFLAGS) &&
-                           (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
-                               size = mp->m_sb.sb_rextsize <<
-                                      mp->m_sb.sb_blocklog;
-                       } else {
-                               size = mp->m_sb.sb_blocksize;
-                               if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
-                                       code = XFS_ERROR(EINVAL);
-                                       goto error_return;
-                               }
-                       }
-
-                       if (fa->fsx_extsize % size) {
-                               code = XFS_ERROR(EINVAL);
-                               goto error_return;
-                       }
-               }
-       }
-
-
-       if (mask & FSX_XFLAGS) {
-               /*
-                * Can't change realtime flag if any extents are allocated.
-                */
-               if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
-                   (XFS_IS_REALTIME_INODE(ip)) !=
-                   (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-                       code = XFS_ERROR(EINVAL);       /* EFBIG? */
-                       goto error_return;
-               }
-
-               /*
-                * If realtime flag is set then must have realtime data.
-                */
-               if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-                       if ((mp->m_sb.sb_rblocks == 0) ||
-                           (mp->m_sb.sb_rextsize == 0) ||
-                           (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
-                               code = XFS_ERROR(EINVAL);
-                               goto error_return;
-                       }
-               }
-
-               /*
-                * Can't modify an immutable/append-only file unless
-                * we have appropriate permission.
-                */
-               if ((ip->i_d.di_flags &
-                               (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
-                    (fa->fsx_xflags &
-                               (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
-                   !capable(CAP_LINUX_IMMUTABLE)) {
-                       code = XFS_ERROR(EPERM);
-                       goto error_return;
-               }
-       }
-
-       xfs_trans_ijoin(tp, ip);
-
-       /*
-        * Change file ownership.  Must be the owner or privileged.
-        */
-       if (mask & FSX_PROJID) {
-               /*
-                * CAP_FSETID overrides the following restrictions:
-                *
-                * The set-user-ID and set-group-ID bits of a file will be
-                * cleared upon successful return from chown()
-                */
-               if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
-                   !capable(CAP_FSETID))
-                       ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
-
-               /*
-                * Change the ownerships and register quota modifications
-                * in the transaction.
-                */
-               if (xfs_get_projid(ip) != fa->fsx_projid) {
-                       if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
-                               olddquot = xfs_qm_vop_chown(tp, ip,
-                                                       &ip->i_gdquot, gdqp);
-                       }
-                       xfs_set_projid(ip, fa->fsx_projid);
-
-                       /*
-                        * We may have to rev the inode as well as
-                        * the superblock version number since projids didn't
-                        * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
-                        */
-                       if (ip->i_d.di_version == 1)
-                               xfs_bump_ino_vers2(tp, ip);
-               }
-
-       }
-
-       if (mask & FSX_EXTSIZE)
-               ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
-       if (mask & FSX_XFLAGS) {
-               xfs_set_diflags(ip, fa->fsx_xflags);
-               xfs_diflags_to_linux(ip);
-       }
-
-       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-       XFS_STATS_INC(xs_ig_attrchg);
-
-       /*
-        * If this is a synchronous mount, make sure that the
-        * transaction goes to disk before returning to the user.
-        * This is slightly sub-optimal in that truncates require
-        * two sync transactions instead of one for wsync filesystems.
-        * One for the truncate and one for the timestamps since we
-        * don't want to change the timestamps unless we're sure the
-        * truncate worked.  Truncates are less than 1% of the laddis
-        * mix so this probably isn't worth the trouble to optimize.
-        */
-       if (mp->m_flags & XFS_MOUNT_WSYNC)
-               xfs_trans_set_sync(tp);
-       code = xfs_trans_commit(tp, 0);
-       xfs_iunlock(ip, lock_flags);
-
-       /*
-        * Release any dquot(s) the inode had kept before chown.
-        */
-       xfs_qm_dqrele(olddquot);
-       xfs_qm_dqrele(udqp);
-       xfs_qm_dqrele(gdqp);
-
-       return code;
-
- error_return:
-       xfs_qm_dqrele(udqp);
-       xfs_qm_dqrele(gdqp);
-       xfs_trans_cancel(tp, 0);
-       if (lock_flags)
-               xfs_iunlock(ip, lock_flags);
-       return code;
-}
-
-STATIC int
-xfs_ioc_fssetxattr(
-       xfs_inode_t             *ip,
-       struct file             *filp,
-       void                    __user *arg)
-{
-       struct fsxattr          fa;
-       unsigned int            mask;
-
-       if (copy_from_user(&fa, arg, sizeof(fa)))
-               return -EFAULT;
-
-       mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
-       if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-               mask |= FSX_NONBLOCK;
-
-       return -xfs_ioctl_setattr(ip, &fa, mask);
-}
-
-STATIC int
-xfs_ioc_getxflags(
-       xfs_inode_t             *ip,
-       void                    __user *arg)
-{
-       unsigned int            flags;
-
-       flags = xfs_di2lxflags(ip->i_d.di_flags);
-       if (copy_to_user(arg, &flags, sizeof(flags)))
-               return -EFAULT;
-       return 0;
-}
-
-STATIC int
-xfs_ioc_setxflags(
-       xfs_inode_t             *ip,
-       struct file             *filp,
-       void                    __user *arg)
-{
-       struct fsxattr          fa;
-       unsigned int            flags;
-       unsigned int            mask;
-
-       if (copy_from_user(&flags, arg, sizeof(flags)))
-               return -EFAULT;
-
-       if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
-                     FS_NOATIME_FL | FS_NODUMP_FL | \
-                     FS_SYNC_FL))
-               return -EOPNOTSUPP;
-
-       mask = FSX_XFLAGS;
-       if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-               mask |= FSX_NONBLOCK;
-       fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
-
-       return -xfs_ioctl_setattr(ip, &fa, mask);
-}
-
-STATIC int
-xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
-{
-       struct getbmap __user   *base = *ap;
-
-       /* copy only getbmap portion (not getbmapx) */
-       if (copy_to_user(base, bmv, sizeof(struct getbmap)))
-               return XFS_ERROR(EFAULT);
-
-       *ap += sizeof(struct getbmap);
-       return 0;
-}
-
-STATIC int
-xfs_ioc_getbmap(
-       struct xfs_inode        *ip,
-       int                     ioflags,
-       unsigned int            cmd,
-       void                    __user *arg)
-{
-       struct getbmapx         bmx;
-       int                     error;
-
-       if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
-               return -XFS_ERROR(EFAULT);
-
-       if (bmx.bmv_count < 2)
-               return -XFS_ERROR(EINVAL);
-
-       bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
-       if (ioflags & IO_INVIS)
-               bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
-
-       error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
-                           (struct getbmap *)arg+1);
-       if (error)
-               return -error;
-
-       /* copy back header - only size of getbmap */
-       if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-STATIC int
-xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
-{
-       struct getbmapx __user  *base = *ap;
-
-       if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
-               return XFS_ERROR(EFAULT);
-
-       *ap += sizeof(struct getbmapx);
-       return 0;
-}
-
-STATIC int
-xfs_ioc_getbmapx(
-       struct xfs_inode        *ip,
-       void                    __user *arg)
-{
-       struct getbmapx         bmx;
-       int                     error;
-
-       if (copy_from_user(&bmx, arg, sizeof(bmx)))
-               return -XFS_ERROR(EFAULT);
-
-       if (bmx.bmv_count < 2)
-               return -XFS_ERROR(EINVAL);
-
-       if (bmx.bmv_iflags & (~BMV_IF_VALID))
-               return -XFS_ERROR(EINVAL);
-
-       error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
-                           (struct getbmapx *)arg+1);
-       if (error)
-               return -error;
-
-       /* copy back header */
-       if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
-               return -XFS_ERROR(EFAULT);
-
-       return 0;
-}
-
-/*
- * Note: some of the ioctl's return positive numbers as a
- * byte count indicating success, such as readlink_by_handle.
- * So we don't "sign flip" like most other routines.  This means
- * true errors need to be returned as a negative value.
- */
-long
-xfs_file_ioctl(
-       struct file             *filp,
-       unsigned int            cmd,
-       unsigned long           p)
-{
-       struct inode            *inode = filp->f_path.dentry->d_inode;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       void                    __user *arg = (void __user *)p;
-       int                     ioflags = 0;
-       int                     error;
-
-       if (filp->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
-
-       trace_xfs_file_ioctl(ip);
-
-       switch (cmd) {
-       case FITRIM:
-               return xfs_ioc_trim(mp, arg);
-       case XFS_IOC_ALLOCSP:
-       case XFS_IOC_FREESP:
-       case XFS_IOC_RESVSP:
-       case XFS_IOC_UNRESVSP:
-       case XFS_IOC_ALLOCSP64:
-       case XFS_IOC_FREESP64:
-       case XFS_IOC_RESVSP64:
-       case XFS_IOC_UNRESVSP64:
-       case XFS_IOC_ZERO_RANGE: {
-               xfs_flock64_t           bf;
-
-               if (copy_from_user(&bf, arg, sizeof(bf)))
-                       return -XFS_ERROR(EFAULT);
-               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
-       }
-       case XFS_IOC_DIOINFO: {
-               struct dioattr  da;
-               xfs_buftarg_t   *target =
-                       XFS_IS_REALTIME_INODE(ip) ?
-                       mp->m_rtdev_targp : mp->m_ddev_targp;
-
-               da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
-               da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
-
-               if (copy_to_user(arg, &da, sizeof(da)))
-                       return -XFS_ERROR(EFAULT);
-               return 0;
-       }
-
-       case XFS_IOC_FSBULKSTAT_SINGLE:
-       case XFS_IOC_FSBULKSTAT:
-       case XFS_IOC_FSINUMBERS:
-               return xfs_ioc_bulkstat(mp, cmd, arg);
-
-       case XFS_IOC_FSGEOMETRY_V1:
-               return xfs_ioc_fsgeometry_v1(mp, arg);
-
-       case XFS_IOC_FSGEOMETRY:
-               return xfs_ioc_fsgeometry(mp, arg);
-
-       case XFS_IOC_GETVERSION:
-               return put_user(inode->i_generation, (int __user *)arg);
-
-       case XFS_IOC_FSGETXATTR:
-               return xfs_ioc_fsgetxattr(ip, 0, arg);
-       case XFS_IOC_FSGETXATTRA:
-               return xfs_ioc_fsgetxattr(ip, 1, arg);
-       case XFS_IOC_FSSETXATTR:
-               return xfs_ioc_fssetxattr(ip, filp, arg);
-       case XFS_IOC_GETXFLAGS:
-               return xfs_ioc_getxflags(ip, arg);
-       case XFS_IOC_SETXFLAGS:
-               return xfs_ioc_setxflags(ip, filp, arg);
-
-       case XFS_IOC_FSSETDM: {
-               struct fsdmidata        dmi;
-
-               if (copy_from_user(&dmi, arg, sizeof(dmi)))
-                       return -XFS_ERROR(EFAULT);
-
-               error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
-                               dmi.fsd_dmstate);
-               return -error;
-       }
-
-       case XFS_IOC_GETBMAP:
-       case XFS_IOC_GETBMAPA:
-               return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
-
-       case XFS_IOC_GETBMAPX:
-               return xfs_ioc_getbmapx(ip, arg);
-
-       case XFS_IOC_FD_TO_HANDLE:
-       case XFS_IOC_PATH_TO_HANDLE:
-       case XFS_IOC_PATH_TO_FSHANDLE: {
-               xfs_fsop_handlereq_t    hreq;
-
-               if (copy_from_user(&hreq, arg, sizeof(hreq)))
-                       return -XFS_ERROR(EFAULT);
-               return xfs_find_handle(cmd, &hreq);
-       }
-       case XFS_IOC_OPEN_BY_HANDLE: {
-               xfs_fsop_handlereq_t    hreq;
-
-               if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-                       return -XFS_ERROR(EFAULT);
-               return xfs_open_by_handle(filp, &hreq);
-       }
-       case XFS_IOC_FSSETDM_BY_HANDLE:
-               return xfs_fssetdm_by_handle(filp, arg);
-
-       case XFS_IOC_READLINK_BY_HANDLE: {
-               xfs_fsop_handlereq_t    hreq;
-
-               if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
-                       return -XFS_ERROR(EFAULT);
-               return xfs_readlink_by_handle(filp, &hreq);
-       }
-       case XFS_IOC_ATTRLIST_BY_HANDLE:
-               return xfs_attrlist_by_handle(filp, arg);
-
-       case XFS_IOC_ATTRMULTI_BY_HANDLE:
-               return xfs_attrmulti_by_handle(filp, arg);
-
-       case XFS_IOC_SWAPEXT: {
-               struct xfs_swapext      sxp;
-
-               if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
-                       return -XFS_ERROR(EFAULT);
-               error = xfs_swapext(&sxp);
-               return -error;
-       }
-
-       case XFS_IOC_FSCOUNTS: {
-               xfs_fsop_counts_t out;
-
-               error = xfs_fs_counts(mp, &out);
-               if (error)
-                       return -error;
-
-               if (copy_to_user(arg, &out, sizeof(out)))
-                       return -XFS_ERROR(EFAULT);
-               return 0;
-       }
-
-       case XFS_IOC_SET_RESBLKS: {
-               xfs_fsop_resblks_t inout;
-               __uint64_t         in;
-
-               if (!capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
-               if (mp->m_flags & XFS_MOUNT_RDONLY)
-                       return -XFS_ERROR(EROFS);
-
-               if (copy_from_user(&inout, arg, sizeof(inout)))
-                       return -XFS_ERROR(EFAULT);
-
-               /* input parameter is passed in resblks field of structure */
-               in = inout.resblks;
-               error = xfs_reserve_blocks(mp, &in, &inout);
-               if (error)
-                       return -error;
-
-               if (copy_to_user(arg, &inout, sizeof(inout)))
-                       return -XFS_ERROR(EFAULT);
-               return 0;
-       }
-
-       case XFS_IOC_GET_RESBLKS: {
-               xfs_fsop_resblks_t out;
-
-               if (!capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
-               error = xfs_reserve_blocks(mp, NULL, &out);
-               if (error)
-                       return -error;
-
-               if (copy_to_user(arg, &out, sizeof(out)))
-                       return -XFS_ERROR(EFAULT);
-
-               return 0;
-       }
-
-       case XFS_IOC_FSGROWFSDATA: {
-               xfs_growfs_data_t in;
-
-               if (copy_from_user(&in, arg, sizeof(in)))
-                       return -XFS_ERROR(EFAULT);
-
-               error = xfs_growfs_data(mp, &in);
-               return -error;
-       }
-
-       case XFS_IOC_FSGROWFSLOG: {
-               xfs_growfs_log_t in;
-
-               if (copy_from_user(&in, arg, sizeof(in)))
-                       return -XFS_ERROR(EFAULT);
-
-               error = xfs_growfs_log(mp, &in);
-               return -error;
-       }
-
-       case XFS_IOC_FSGROWFSRT: {
-               xfs_growfs_rt_t in;
-
-               if (copy_from_user(&in, arg, sizeof(in)))
-                       return -XFS_ERROR(EFAULT);
-
-               error = xfs_growfs_rt(mp, &in);
-               return -error;
-       }
-
-       case XFS_IOC_GOINGDOWN: {
-               __uint32_t in;
-
-               if (!capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
-               if (get_user(in, (__uint32_t __user *)arg))
-                       return -XFS_ERROR(EFAULT);
-
-               error = xfs_fs_goingdown(mp, in);
-               return -error;
-       }
-
-       case XFS_IOC_ERROR_INJECTION: {
-               xfs_error_injection_t in;
-
-               if (!capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
-               if (copy_from_user(&in, arg, sizeof(in)))
-                       return -XFS_ERROR(EFAULT);
-
-               error = xfs_errortag_add(in.errtag, mp);
-               return -error;
-       }
-
-       case XFS_IOC_ERROR_CLEARALL:
-               if (!capable(CAP_SYS_ADMIN))
-                       return -EPERM;
-
-               error = xfs_errortag_clearall(mp, 1);
-               return -error;
-
-       default:
-               return -ENOTTY;
-       }
-}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
deleted file mode 100644 (file)
index d56173b..0000000
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2008 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_IOCTL_H__
-#define __XFS_IOCTL_H__
-
-extern int
-xfs_ioc_space(
-       struct xfs_inode        *ip,
-       struct inode            *inode,
-       struct file             *filp,
-       int                     ioflags,
-       unsigned int            cmd,
-       xfs_flock64_t           *bf);
-
-extern int
-xfs_find_handle(
-       unsigned int            cmd,
-       xfs_fsop_handlereq_t    *hreq);
-
-extern int
-xfs_open_by_handle(
-       struct file             *parfilp,
-       xfs_fsop_handlereq_t    *hreq);
-
-extern int
-xfs_readlink_by_handle(
-       struct file             *parfilp,
-       xfs_fsop_handlereq_t    *hreq);
-
-extern int
-xfs_attrmulti_attr_get(
-       struct inode            *inode,
-       unsigned char           *name,
-       unsigned char           __user *ubuf,
-       __uint32_t              *len,
-       __uint32_t              flags);
-
-extern int
-xfs_attrmulti_attr_set(
-       struct inode            *inode,
-       unsigned char           *name,
-       const unsigned char     __user *ubuf,
-       __uint32_t              len,
-       __uint32_t              flags);
-
-extern int
-xfs_attrmulti_attr_remove(
-       struct inode            *inode,
-       unsigned char           *name,
-       __uint32_t              flags);
-
-extern struct dentry *
-xfs_handle_to_dentry(
-       struct file             *parfilp,
-       void __user             *uhandle,
-       u32                     hlen);
-
-extern long
-xfs_file_ioctl(
-       struct file             *filp,
-       unsigned int            cmd,
-       unsigned long           p);
-
-extern long
-xfs_file_compat_ioctl(
-       struct file             *file,
-       unsigned int            cmd,
-       unsigned long           arg);
-
-#endif
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
deleted file mode 100644 (file)
index 54e623b..0000000
+++ /dev/null
@@ -1,672 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <linux/compat.h>
-#include <linux/ioctl.h>
-#include <linux/mount.h>
-#include <linux/slab.h>
-#include <asm/uaccess.h>
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_vnode.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_error.h"
-#include "xfs_dfrag.h"
-#include "xfs_vnodeops.h"
-#include "xfs_fsops.h"
-#include "xfs_alloc.h"
-#include "xfs_rtalloc.h"
-#include "xfs_attr.h"
-#include "xfs_ioctl.h"
-#include "xfs_ioctl32.h"
-#include "xfs_trace.h"
-
-#define  _NATIVE_IOC(cmd, type) \
-         _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
-
-#ifdef BROKEN_X86_ALIGNMENT
-STATIC int
-xfs_compat_flock64_copyin(
-       xfs_flock64_t           *bf,
-       compat_xfs_flock64_t    __user *arg32)
-{
-       if (get_user(bf->l_type,        &arg32->l_type) ||
-           get_user(bf->l_whence,      &arg32->l_whence) ||
-           get_user(bf->l_start,       &arg32->l_start) ||
-           get_user(bf->l_len,         &arg32->l_len) ||
-           get_user(bf->l_sysid,       &arg32->l_sysid) ||
-           get_user(bf->l_pid,         &arg32->l_pid) ||
-           copy_from_user(bf->l_pad,   &arg32->l_pad,  4*sizeof(u32)))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-STATIC int
-xfs_compat_ioc_fsgeometry_v1(
-       struct xfs_mount          *mp,
-       compat_xfs_fsop_geom_v1_t __user *arg32)
-{
-       xfs_fsop_geom_t           fsgeo;
-       int                       error;
-
-       error = xfs_fs_geometry(mp, &fsgeo, 3);
-       if (error)
-               return -error;
-       /* The 32-bit variant simply has some padding at the end */
-       if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-STATIC int
-xfs_compat_growfs_data_copyin(
-       struct xfs_growfs_data   *in,
-       compat_xfs_growfs_data_t __user *arg32)
-{
-       if (get_user(in->newblocks, &arg32->newblocks) ||
-           get_user(in->imaxpct,   &arg32->imaxpct))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-STATIC int
-xfs_compat_growfs_rt_copyin(
-       struct xfs_growfs_rt     *in,
-       compat_xfs_growfs_rt_t  __user *arg32)
-{
-       if (get_user(in->newblocks, &arg32->newblocks) ||
-           get_user(in->extsize,   &arg32->extsize))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-STATIC int
-xfs_inumbers_fmt_compat(
-       void                    __user *ubuffer,
-       const xfs_inogrp_t      *buffer,
-       long                    count,
-       long                    *written)
-{
-       compat_xfs_inogrp_t     __user *p32 = ubuffer;
-       long                    i;
-
-       for (i = 0; i < count; i++) {
-               if (put_user(buffer[i].xi_startino,   &p32[i].xi_startino) ||
-                   put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
-                   put_user(buffer[i].xi_allocmask,  &p32[i].xi_allocmask))
-                       return -XFS_ERROR(EFAULT);
-       }
-       *written = count * sizeof(*p32);
-       return 0;
-}
-
-#else
-#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
-#endif /* BROKEN_X86_ALIGNMENT */
-
-STATIC int
-xfs_ioctl32_bstime_copyin(
-       xfs_bstime_t            *bstime,
-       compat_xfs_bstime_t     __user *bstime32)
-{
-       compat_time_t           sec32;  /* tv_sec differs on 64 vs. 32 */
-
-       if (get_user(sec32,             &bstime32->tv_sec)      ||
-           get_user(bstime->tv_nsec,   &bstime32->tv_nsec))
-               return -XFS_ERROR(EFAULT);
-       bstime->tv_sec = sec32;
-       return 0;
-}
-
-/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
-STATIC int
-xfs_ioctl32_bstat_copyin(
-       xfs_bstat_t             *bstat,
-       compat_xfs_bstat_t      __user *bstat32)
-{
-       if (get_user(bstat->bs_ino,     &bstat32->bs_ino)       ||
-           get_user(bstat->bs_mode,    &bstat32->bs_mode)      ||
-           get_user(bstat->bs_nlink,   &bstat32->bs_nlink)     ||
-           get_user(bstat->bs_uid,     &bstat32->bs_uid)       ||
-           get_user(bstat->bs_gid,     &bstat32->bs_gid)       ||
-           get_user(bstat->bs_rdev,    &bstat32->bs_rdev)      ||
-           get_user(bstat->bs_blksize, &bstat32->bs_blksize)   ||
-           get_user(bstat->bs_size,    &bstat32->bs_size)      ||
-           xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
-           xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
-           xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
-           get_user(bstat->bs_blocks,  &bstat32->bs_size)      ||
-           get_user(bstat->bs_xflags,  &bstat32->bs_size)      ||
-           get_user(bstat->bs_extsize, &bstat32->bs_extsize)   ||
-           get_user(bstat->bs_extents, &bstat32->bs_extents)   ||
-           get_user(bstat->bs_gen,     &bstat32->bs_gen)       ||
-           get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
-           get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
-           get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
-           get_user(bstat->bs_dmstate, &bstat32->bs_dmstate)   ||
-           get_user(bstat->bs_aextents, &bstat32->bs_aextents))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-/* XFS_IOC_FSBULKSTAT and friends */
-
-STATIC int
-xfs_bstime_store_compat(
-       compat_xfs_bstime_t     __user *p32,
-       const xfs_bstime_t      *p)
-{
-       __s32                   sec32;
-
-       sec32 = p->tv_sec;
-       if (put_user(sec32, &p32->tv_sec) ||
-           put_user(p->tv_nsec, &p32->tv_nsec))
-               return -XFS_ERROR(EFAULT);
-       return 0;
-}
-
-/* Return 0 on success or positive error (to xfs_bulkstat()) */
-STATIC int
-xfs_bulkstat_one_fmt_compat(
-       void                    __user *ubuffer,
-       int                     ubsize,
-       int                     *ubused,
-       const xfs_bstat_t       *buffer)
-{
-       compat_xfs_bstat_t      __user *p32 = ubuffer;
-
-       if (ubsize < sizeof(*p32))
-               return XFS_ERROR(ENOMEM);
-
-       if (put_user(buffer->bs_ino,      &p32->bs_ino)         ||
-           put_user(buffer->bs_mode,     &p32->bs_mode)        ||
-           put_user(buffer->bs_nlink,    &p32->bs_nlink)       ||
-           put_user(buffer->bs_uid,      &p32->bs_uid)         ||
-           put_user(buffer->bs_gid,      &p32->bs_gid)         ||
-           put_user(buffer->bs_rdev,     &p32->bs_rdev)        ||
-           put_user(buffer->bs_blksize,  &p32->bs_blksize)     ||
-           put_user(buffer->bs_size,     &p32->bs_size)        ||
-           xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
-           xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
-           xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
-           put_user(buffer->bs_blocks,   &p32->bs_blocks)      ||
-           put_user(buffer->bs_xflags,   &p32->bs_xflags)      ||
-           put_user(buffer->bs_extsize,  &p32->bs_extsize)     ||
-           put_user(buffer->bs_extents,  &p32->bs_extents)     ||
-           put_user(buffer->bs_gen,      &p32->bs_gen)         ||
-           put_user(buffer->bs_projid,   &p32->bs_projid)      ||
-           put_user(buffer->bs_projid_hi,      &p32->bs_projid_hi)     ||
-           put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)    ||
-           put_user(buffer->bs_dmstate,  &p32->bs_dmstate)     ||
-           put_user(buffer->bs_aextents, &p32->bs_aextents))
-               return XFS_ERROR(EFAULT);
-       if (ubused)
-               *ubused = sizeof(*p32);
-       return 0;
-}
-
-STATIC int
-xfs_bulkstat_one_compat(
-       xfs_mount_t     *mp,            /* mount point for filesystem */
-       xfs_ino_t       ino,            /* inode number to get data for */
-       void            __user *buffer, /* buffer to place output in */
-       int             ubsize,         /* size of buffer */
-       int             *ubused,        /* bytes used by me */
-       int             *stat)          /* BULKSTAT_RV_... */
-{
-       return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
-                                   xfs_bulkstat_one_fmt_compat,
-                                   ubused, stat);
-}
-
-/* copied from xfs_ioctl.c */
-STATIC int
-xfs_compat_ioc_bulkstat(
-       xfs_mount_t               *mp,
-       unsigned int              cmd,
-       compat_xfs_fsop_bulkreq_t __user *p32)
-{
-       u32                     addr;
-       xfs_fsop_bulkreq_t      bulkreq;
-       int                     count;  /* # of records returned */
-       xfs_ino_t               inlast; /* last inode number */
-       int                     done;
-       int                     error;
-
-       /* done = 1 if there are more stats to get and if bulkstat */
-       /* should be called again (unused here, but used in dmapi) */
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return -XFS_ERROR(EIO);
-
-       if (get_user(addr, &p32->lastip))
-               return -XFS_ERROR(EFAULT);
-       bulkreq.lastip = compat_ptr(addr);
-       if (get_user(bulkreq.icount, &p32->icount) ||
-           get_user(addr, &p32->ubuffer))
-               return -XFS_ERROR(EFAULT);
-       bulkreq.ubuffer = compat_ptr(addr);
-       if (get_user(addr, &p32->ocount))
-               return -XFS_ERROR(EFAULT);
-       bulkreq.ocount = compat_ptr(addr);
-
-       if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
-               return -XFS_ERROR(EFAULT);
-
-       if ((count = bulkreq.icount) <= 0)
-               return -XFS_ERROR(EINVAL);
-
-       if (bulkreq.ubuffer == NULL)
-               return -XFS_ERROR(EINVAL);
-
-       if (cmd == XFS_IOC_FSINUMBERS_32) {
-               error = xfs_inumbers(mp, &inlast, &count,
-                               bulkreq.ubuffer, xfs_inumbers_fmt_compat);
-       } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
-               int res;
-
-               error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
-                               sizeof(compat_xfs_bstat_t), 0, &res);
-       } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
-               error = xfs_bulkstat(mp, &inlast, &count,
-                       xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
-                       bulkreq.ubuffer, &done);
-       } else
-               error = XFS_ERROR(EINVAL);
-       if (error)
-               return -error;
-
-       if (bulkreq.ocount != NULL) {
-               if (copy_to_user(bulkreq.lastip, &inlast,
-                                               sizeof(xfs_ino_t)))
-                       return -XFS_ERROR(EFAULT);
-
-               if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
-                       return -XFS_ERROR(EFAULT);
-       }
-
-       return 0;
-}
-
-STATIC int
-xfs_compat_handlereq_copyin(
-       xfs_fsop_handlereq_t            *hreq,
-       compat_xfs_fsop_handlereq_t     __user *arg32)
-{
-       compat_xfs_fsop_handlereq_t     hreq32;
-
-       if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
-
-       hreq->fd = hreq32.fd;
-       hreq->path = compat_ptr(hreq32.path);
-       hreq->oflags = hreq32.oflags;
-       hreq->ihandle = compat_ptr(hreq32.ihandle);
-       hreq->ihandlen = hreq32.ihandlen;
-       hreq->ohandle = compat_ptr(hreq32.ohandle);
-       hreq->ohandlen = compat_ptr(hreq32.ohandlen);
-
-       return 0;
-}
-
-STATIC struct dentry *
-xfs_compat_handlereq_to_dentry(
-       struct file             *parfilp,
-       compat_xfs_fsop_handlereq_t *hreq)
-{
-       return xfs_handle_to_dentry(parfilp,
-                       compat_ptr(hreq->ihandle), hreq->ihandlen);
-}
-
-STATIC int
-xfs_compat_attrlist_by_handle(
-       struct file             *parfilp,
-       void                    __user *arg)
-{
-       int                     error;
-       attrlist_cursor_kern_t  *cursor;
-       compat_xfs_fsop_attrlist_handlereq_t al_hreq;
-       struct dentry           *dentry;
-       char                    *kbuf;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
-       if (copy_from_user(&al_hreq, arg,
-                          sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
-       if (al_hreq.buflen > XATTR_LIST_MAX)
-               return -XFS_ERROR(EINVAL);
-
-       /*
-        * Reject flags, only allow namespaces.
-        */
-       if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
-               return -XFS_ERROR(EINVAL);
-
-       dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
-
-       error = -ENOMEM;
-       kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
-       if (!kbuf)
-               goto out_dput;
-
-       cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
-       error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
-                                       al_hreq.flags, cursor);
-       if (error)
-               goto out_kfree;
-
-       if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
-               error = -EFAULT;
-
- out_kfree:
-       kfree(kbuf);
- out_dput:
-       dput(dentry);
-       return error;
-}
-
-STATIC int
-xfs_compat_attrmulti_by_handle(
-       struct file                             *parfilp,
-       void                                    __user *arg)
-{
-       int                                     error;
-       compat_xfs_attr_multiop_t               *ops;
-       compat_xfs_fsop_attrmulti_handlereq_t   am_hreq;
-       struct dentry                           *dentry;
-       unsigned int                            i, size;
-       unsigned char                           *attr_name;
-
-       if (!capable(CAP_SYS_ADMIN))
-               return -XFS_ERROR(EPERM);
-       if (copy_from_user(&am_hreq, arg,
-                          sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
-
-       /* overflow check */
-       if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
-               return -E2BIG;
-
-       dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
-
-       error = E2BIG;
-       size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
-       if (!size || size > 16 * PAGE_SIZE)
-               goto out_dput;
-
-       ops = memdup_user(compat_ptr(am_hreq.ops), size);
-       if (IS_ERR(ops)) {
-               error = PTR_ERR(ops);
-               goto out_dput;
-       }
-
-       attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
-       if (!attr_name)
-               goto out_kfree_ops;
-
-       error = 0;
-       for (i = 0; i < am_hreq.opcount; i++) {
-               ops[i].am_error = strncpy_from_user((char *)attr_name,
-                               compat_ptr(ops[i].am_attrname),
-                               MAXNAMELEN);
-               if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
-                       error = -ERANGE;
-               if (ops[i].am_error < 0)
-                       break;
-
-               switch (ops[i].am_opcode) {
-               case ATTR_OP_GET:
-                       ops[i].am_error = xfs_attrmulti_attr_get(
-                                       dentry->d_inode, attr_name,
-                                       compat_ptr(ops[i].am_attrvalue),
-                                       &ops[i].am_length, ops[i].am_flags);
-                       break;
-               case ATTR_OP_SET:
-                       ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
-                       if (ops[i].am_error)
-                               break;
-                       ops[i].am_error = xfs_attrmulti_attr_set(
-                                       dentry->d_inode, attr_name,
-                                       compat_ptr(ops[i].am_attrvalue),
-                                       ops[i].am_length, ops[i].am_flags);
-                       mnt_drop_write(parfilp->f_path.mnt);
-                       break;
-               case ATTR_OP_REMOVE:
-                       ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
-                       if (ops[i].am_error)
-                               break;
-                       ops[i].am_error = xfs_attrmulti_attr_remove(
-                                       dentry->d_inode, attr_name,
-                                       ops[i].am_flags);
-                       mnt_drop_write(parfilp->f_path.mnt);
-                       break;
-               default:
-                       ops[i].am_error = EINVAL;
-               }
-       }
-
-       if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
-               error = XFS_ERROR(EFAULT);
-
-       kfree(attr_name);
- out_kfree_ops:
-       kfree(ops);
- out_dput:
-       dput(dentry);
-       return -error;
-}
-
-STATIC int
-xfs_compat_fssetdm_by_handle(
-       struct file             *parfilp,
-       void                    __user *arg)
-{
-       int                     error;
-       struct fsdmidata        fsd;
-       compat_xfs_fsop_setdm_handlereq_t dmhreq;
-       struct dentry           *dentry;
-
-       if (!capable(CAP_MKNOD))
-               return -XFS_ERROR(EPERM);
-       if (copy_from_user(&dmhreq, arg,
-                          sizeof(compat_xfs_fsop_setdm_handlereq_t)))
-               return -XFS_ERROR(EFAULT);
-
-       dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
-       if (IS_ERR(dentry))
-               return PTR_ERR(dentry);
-
-       if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
-               error = -XFS_ERROR(EPERM);
-               goto out;
-       }
-
-       if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
-               error = -XFS_ERROR(EFAULT);
-               goto out;
-       }
-
-       error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
-                                fsd.fsd_dmstate);
-
-out:
-       dput(dentry);
-       return error;
-}
-
-long
-xfs_file_compat_ioctl(
-       struct file             *filp,
-       unsigned                cmd,
-       unsigned long           p)
-{
-       struct inode            *inode = filp->f_path.dentry->d_inode;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       void                    __user *arg = (void __user *)p;
-       int                     ioflags = 0;
-       int                     error;
-
-       if (filp->f_mode & FMODE_NOCMTIME)
-               ioflags |= IO_INVIS;
-
-       trace_xfs_file_compat_ioctl(ip);
-
-       switch (cmd) {
-       /* No size or alignment issues on any arch */
-       case XFS_IOC_DIOINFO:
-       case XFS_IOC_FSGEOMETRY:
-       case XFS_IOC_FSGETXATTR:
-       case XFS_IOC_FSSETXATTR:
-       case XFS_IOC_FSGETXATTRA:
-       case XFS_IOC_FSSETDM:
-       case XFS_IOC_GETBMAP:
-       case XFS_IOC_GETBMAPA:
-       case XFS_IOC_GETBMAPX:
-       case XFS_IOC_FSCOUNTS:
-       case XFS_IOC_SET_RESBLKS:
-       case XFS_IOC_GET_RESBLKS:
-       case XFS_IOC_FSGROWFSLOG:
-       case XFS_IOC_GOINGDOWN:
-       case XFS_IOC_ERROR_INJECTION:
-       case XFS_IOC_ERROR_CLEARALL:
-               return xfs_file_ioctl(filp, cmd, p);
-#ifndef BROKEN_X86_ALIGNMENT
-       /* These are handled fine if no alignment issues */
-       case XFS_IOC_ALLOCSP:
-       case XFS_IOC_FREESP:
-       case XFS_IOC_RESVSP:
-       case XFS_IOC_UNRESVSP:
-       case XFS_IOC_ALLOCSP64:
-       case XFS_IOC_FREESP64:
-       case XFS_IOC_RESVSP64:
-       case XFS_IOC_UNRESVSP64:
-       case XFS_IOC_FSGEOMETRY_V1:
-       case XFS_IOC_FSGROWFSDATA:
-       case XFS_IOC_FSGROWFSRT:
-       case XFS_IOC_ZERO_RANGE:
-               return xfs_file_ioctl(filp, cmd, p);
-#else
-       case XFS_IOC_ALLOCSP_32:
-       case XFS_IOC_FREESP_32:
-       case XFS_IOC_ALLOCSP64_32:
-       case XFS_IOC_FREESP64_32:
-       case XFS_IOC_RESVSP_32:
-       case XFS_IOC_UNRESVSP_32:
-       case XFS_IOC_RESVSP64_32:
-       case XFS_IOC_UNRESVSP64_32:
-       case XFS_IOC_ZERO_RANGE_32: {
-               struct xfs_flock64      bf;
-
-               if (xfs_compat_flock64_copyin(&bf, arg))
-                       return -XFS_ERROR(EFAULT);
-               cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
-               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
-       }
-       case XFS_IOC_FSGEOMETRY_V1_32:
-               return xfs_compat_ioc_fsgeometry_v1(mp, arg);
-       case XFS_IOC_FSGROWFSDATA_32: {
-               struct xfs_growfs_data  in;
-
-               if (xfs_compat_growfs_data_copyin(&in, arg))
-                       return -XFS_ERROR(EFAULT);
-               error = xfs_growfs_data(mp, &in);
-               return -error;
-       }
-       case XFS_IOC_FSGROWFSRT_32: {
-               struct xfs_growfs_rt    in;
-
-               if (xfs_compat_growfs_rt_copyin(&in, arg))
-                       return -XFS_ERROR(EFAULT);
-               error = xfs_growfs_rt(mp, &in);
-               return -error;
-       }
-#endif
-       /* long changes size, but xfs only copiese out 32 bits */
-       case XFS_IOC_GETXFLAGS_32:
-       case XFS_IOC_SETXFLAGS_32:
-       case XFS_IOC_GETVERSION_32:
-               cmd = _NATIVE_IOC(cmd, long);
-               return xfs_file_ioctl(filp, cmd, p);
-       case XFS_IOC_SWAPEXT_32: {
-               struct xfs_swapext        sxp;
-               struct compat_xfs_swapext __user *sxu = arg;
-
-               /* Bulk copy in up to the sx_stat field, then copy bstat */
-               if (copy_from_user(&sxp, sxu,
-                                  offsetof(struct xfs_swapext, sx_stat)) ||
-                   xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
-                       return -XFS_ERROR(EFAULT);
-               error = xfs_swapext(&sxp);
-               return -error;
-       }
-       case XFS_IOC_FSBULKSTAT_32:
-       case XFS_IOC_FSBULKSTAT_SINGLE_32:
-       case XFS_IOC_FSINUMBERS_32:
-               return xfs_compat_ioc_bulkstat(mp, cmd, arg);
-       case XFS_IOC_FD_TO_HANDLE_32:
-       case XFS_IOC_PATH_TO_HANDLE_32:
-       case XFS_IOC_PATH_TO_FSHANDLE_32: {
-               struct xfs_fsop_handlereq       hreq;
-
-               if (xfs_compat_handlereq_copyin(&hreq, arg))
-                       return -XFS_ERROR(EFAULT);
-               cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
-               return xfs_find_handle(cmd, &hreq);
-       }
-       case XFS_IOC_OPEN_BY_HANDLE_32: {
-               struct xfs_fsop_handlereq       hreq;
-
-               if (xfs_compat_handlereq_copyin(&hreq, arg))
-                       return -XFS_ERROR(EFAULT);
-               return xfs_open_by_handle(filp, &hreq);
-       }
-       case XFS_IOC_READLINK_BY_HANDLE_32: {
-               struct xfs_fsop_handlereq       hreq;
-
-               if (xfs_compat_handlereq_copyin(&hreq, arg))
-                       return -XFS_ERROR(EFAULT);
-               return xfs_readlink_by_handle(filp, &hreq);
-       }
-       case XFS_IOC_ATTRLIST_BY_HANDLE_32:
-               return xfs_compat_attrlist_by_handle(filp, arg);
-       case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
-               return xfs_compat_attrmulti_by_handle(filp, arg);
-       case XFS_IOC_FSSETDM_BY_HANDLE_32:
-               return xfs_compat_fssetdm_by_handle(filp, arg);
-       default:
-               return -XFS_ERROR(ENOIOCTLCMD);
-       }
-}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
deleted file mode 100644 (file)
index 80f4060..0000000
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2004-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_IOCTL32_H__
-#define __XFS_IOCTL32_H__
-
-#include <linux/compat.h>
-
-/*
- * on 32-bit arches, ioctl argument structures may have different sizes
- * and/or alignment.  We define compat structures which match the
- * 32-bit sizes/alignments here, and their associated ioctl numbers.
- *
- * xfs_ioctl32.c contains routines to copy these structures in and out.
- */
-
-/* stock kernel-level ioctls we support */
-#define XFS_IOC_GETXFLAGS_32   FS_IOC32_GETFLAGS
-#define XFS_IOC_SETXFLAGS_32   FS_IOC32_SETFLAGS
-#define XFS_IOC_GETVERSION_32  FS_IOC32_GETVERSION
-
-/*
- * On intel, even if sizes match, alignment and/or padding may differ.
- */
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
-#define BROKEN_X86_ALIGNMENT
-#define __compat_packed __attribute__((packed))
-#else
-#define __compat_packed
-#endif
-
-typedef struct compat_xfs_bstime {
-       compat_time_t   tv_sec;         /* seconds              */
-       __s32           tv_nsec;        /* and nanoseconds      */
-} compat_xfs_bstime_t;
-
-typedef struct compat_xfs_bstat {
-       __u64           bs_ino;         /* inode number                 */
-       __u16           bs_mode;        /* type and mode                */
-       __u16           bs_nlink;       /* number of links              */
-       __u32           bs_uid;         /* user id                      */
-       __u32           bs_gid;         /* group id                     */
-       __u32           bs_rdev;        /* device value                 */
-       __s32           bs_blksize;     /* block size                   */
-       __s64           bs_size;        /* file size                    */
-       compat_xfs_bstime_t bs_atime;   /* access time                  */
-       compat_xfs_bstime_t bs_mtime;   /* modify time                  */
-       compat_xfs_bstime_t bs_ctime;   /* inode change time            */
-       int64_t         bs_blocks;      /* number of blocks             */
-       __u32           bs_xflags;      /* extended flags               */
-       __s32           bs_extsize;     /* extent size                  */
-       __s32           bs_extents;     /* number of extents            */
-       __u32           bs_gen;         /* generation count             */
-       __u16           bs_projid_lo;   /* lower part of project id     */
-#define        bs_projid       bs_projid_lo    /* (previously just bs_projid)  */
-       __u16           bs_projid_hi;   /* high part of project id      */
-       unsigned char   bs_pad[12];     /* pad space, unused            */
-       __u32           bs_dmevmask;    /* DMIG event mask              */
-       __u16           bs_dmstate;     /* DMIG state info              */
-       __u16           bs_aextents;    /* attribute number of extents  */
-} __compat_packed compat_xfs_bstat_t;
-
-typedef struct compat_xfs_fsop_bulkreq {
-       compat_uptr_t   lastip;         /* last inode # pointer         */
-       __s32           icount;         /* count of entries in buffer   */
-       compat_uptr_t   ubuffer;        /* user buffer for inode desc.  */
-       compat_uptr_t   ocount;         /* output count pointer         */
-} compat_xfs_fsop_bulkreq_t;
-
-#define XFS_IOC_FSBULKSTAT_32 \
-       _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
-#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
-       _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
-#define XFS_IOC_FSINUMBERS_32 \
-       _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
-
-typedef struct compat_xfs_fsop_handlereq {
-       __u32           fd;             /* fd for FD_TO_HANDLE          */
-       compat_uptr_t   path;           /* user pathname                */
-       __u32           oflags;         /* open flags                   */
-       compat_uptr_t   ihandle;        /* user supplied handle         */
-       __u32           ihandlen;       /* user supplied length         */
-       compat_uptr_t   ohandle;        /* user buffer for handle       */
-       compat_uptr_t   ohandlen;       /* user buffer length           */
-} compat_xfs_fsop_handlereq_t;
-
-#define XFS_IOC_PATH_TO_FSHANDLE_32 \
-       _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_PATH_TO_HANDLE_32 \
-       _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_FD_TO_HANDLE_32 \
-       _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_OPEN_BY_HANDLE_32 \
-       _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_READLINK_BY_HANDLE_32 \
-       _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
-
-/* The bstat field in the swapext struct needs translation */
-typedef struct compat_xfs_swapext {
-       __int64_t               sx_version;     /* version */
-       __int64_t               sx_fdtarget;    /* fd of target file */
-       __int64_t               sx_fdtmp;       /* fd of tmp file */
-       xfs_off_t               sx_offset;      /* offset into file */
-       xfs_off_t               sx_length;      /* leng from offset */
-       char                    sx_pad[16];     /* pad space, unused */
-       compat_xfs_bstat_t      sx_stat;        /* stat of target b4 copy */
-} __compat_packed compat_xfs_swapext_t;
-
-#define XFS_IOC_SWAPEXT_32     _IOWR('X', 109, struct compat_xfs_swapext)
-
-typedef struct compat_xfs_fsop_attrlist_handlereq {
-       struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
-       struct xfs_attrlist_cursor      pos; /* opaque cookie, list offset */
-       __u32                           flags;  /* which namespace to use */
-       __u32                           buflen; /* length of buffer supplied */
-       compat_uptr_t                   buffer; /* returned names */
-} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
-
-/* Note: actually this is read/write */
-#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
-       _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
-
-/* am_opcodes defined in xfs_fs.h */
-typedef struct compat_xfs_attr_multiop {
-       __u32           am_opcode;
-       __s32           am_error;
-       compat_uptr_t   am_attrname;
-       compat_uptr_t   am_attrvalue;
-       __u32           am_length;
-       __u32           am_flags;
-} compat_xfs_attr_multiop_t;
-
-typedef struct compat_xfs_fsop_attrmulti_handlereq {
-       struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
-       __u32                           opcount;/* count of following multiop */
-       /* ptr to compat_xfs_attr_multiop */
-       compat_uptr_t                   ops; /* attr_multi data */
-} compat_xfs_fsop_attrmulti_handlereq_t;
-
-#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
-       _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
-
-typedef struct compat_xfs_fsop_setdm_handlereq {
-       struct compat_xfs_fsop_handlereq hreq;  /* handle information   */
-       /* ptr to struct fsdmidata */
-       compat_uptr_t                   data;   /* DMAPI data   */
-} compat_xfs_fsop_setdm_handlereq_t;
-
-#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
-       _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
-
-#ifdef BROKEN_X86_ALIGNMENT
-/* on ia32 l_start is on a 32-bit boundary */
-typedef struct compat_xfs_flock64 {
-       __s16           l_type;
-       __s16           l_whence;
-       __s64           l_start __attribute__((packed));
-                       /* len == 0 means until end of file */
-       __s64           l_len __attribute__((packed));
-       __s32           l_sysid;
-       __u32           l_pid;
-       __s32           l_pad[4];       /* reserve area */
-} compat_xfs_flock64_t;
-
-#define XFS_IOC_ALLOCSP_32     _IOW('X', 10, struct compat_xfs_flock64)
-#define XFS_IOC_FREESP_32      _IOW('X', 11, struct compat_xfs_flock64)
-#define XFS_IOC_ALLOCSP64_32   _IOW('X', 36, struct compat_xfs_flock64)
-#define XFS_IOC_FREESP64_32    _IOW('X', 37, struct compat_xfs_flock64)
-#define XFS_IOC_RESVSP_32      _IOW('X', 40, struct compat_xfs_flock64)
-#define XFS_IOC_UNRESVSP_32    _IOW('X', 41, struct compat_xfs_flock64)
-#define XFS_IOC_RESVSP64_32    _IOW('X', 42, struct compat_xfs_flock64)
-#define XFS_IOC_UNRESVSP64_32  _IOW('X', 43, struct compat_xfs_flock64)
-#define XFS_IOC_ZERO_RANGE_32  _IOW('X', 57, struct compat_xfs_flock64)
-
-typedef struct compat_xfs_fsop_geom_v1 {
-       __u32           blocksize;      /* filesystem (data) block size */
-       __u32           rtextsize;      /* realtime extent size         */
-       __u32           agblocks;       /* fsblocks in an AG            */
-       __u32           agcount;        /* number of allocation groups  */
-       __u32           logblocks;      /* fsblocks in the log          */
-       __u32           sectsize;       /* (data) sector size, bytes    */
-       __u32           inodesize;      /* inode size in bytes          */
-       __u32           imaxpct;        /* max allowed inode space(%)   */
-       __u64           datablocks;     /* fsblocks in data subvolume   */
-       __u64           rtblocks;       /* fsblocks in realtime subvol  */
-       __u64           rtextents;      /* rt extents in realtime subvol*/
-       __u64           logstart;       /* starting fsblock of the log  */
-       unsigned char   uuid[16];       /* unique id of the filesystem  */
-       __u32           sunit;          /* stripe unit, fsblocks        */
-       __u32           swidth;         /* stripe width, fsblocks       */
-       __s32           version;        /* structure version            */
-       __u32           flags;          /* superblock version flags     */
-       __u32           logsectsize;    /* log sector size, bytes       */
-       __u32           rtsectsize;     /* realtime sector size, bytes  */
-       __u32           dirblocksize;   /* directory block size, bytes  */
-} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
-
-#define XFS_IOC_FSGEOMETRY_V1_32  \
-       _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
-
-typedef struct compat_xfs_inogrp {
-       __u64           xi_startino;    /* starting inode number        */
-       __s32           xi_alloccount;  /* # bits set in allocmask      */
-       __u64           xi_allocmask;   /* mask of allocated inodes     */
-} __attribute__((packed)) compat_xfs_inogrp_t;
-
-/* These growfs input structures have padding on the end, so must translate */
-typedef struct compat_xfs_growfs_data {
-       __u64           newblocks;      /* new data subvol size, fsblocks */
-       __u32           imaxpct;        /* new inode space percentage limit */
-} __attribute__((packed)) compat_xfs_growfs_data_t;
-
-typedef struct compat_xfs_growfs_rt {
-       __u64           newblocks;      /* new realtime size, fsblocks */
-       __u32           extsize;        /* new realtime extent size, fsblocks */
-} __attribute__((packed)) compat_xfs_growfs_rt_t;
-
-#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
-#define XFS_IOC_FSGROWFSRT_32   _IOW('X', 112, struct compat_xfs_growfs_rt)
-
-#endif /* BROKEN_X86_ALIGNMENT */
-
-#endif /* __XFS_IOCTL32_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
deleted file mode 100644 (file)
index b9c172b..0000000
+++ /dev/null
@@ -1,1210 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_acl.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_rw.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_vnodeops.h"
-#include "xfs_inode_item.h"
-#include "xfs_trace.h"
-
-#include <linux/capability.h>
-#include <linux/xattr.h>
-#include <linux/namei.h>
-#include <linux/posix_acl.h>
-#include <linux/security.h>
-#include <linux/fiemap.h>
-#include <linux/slab.h>
-
-/*
- * Bring the timestamps in the XFS inode uptodate.
- *
- * Used before writing the inode to disk.
- */
-void
-xfs_synchronize_times(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
-       ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
-       ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
-       ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
-       ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
-       ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
-}
-
-/*
- * If the linux inode is valid, mark it dirty.
- * Used when committing a dirty inode into a transaction so that
- * the inode will get written back by the linux code
- */
-void
-xfs_mark_inode_dirty_sync(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
-               mark_inode_dirty_sync(inode);
-}
-
-void
-xfs_mark_inode_dirty(
-       xfs_inode_t     *ip)
-{
-       struct inode    *inode = VFS_I(ip);
-
-       if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
-               mark_inode_dirty(inode);
-}
-
-/*
- * Hook in SELinux.  This is not quite correct yet, what we really need
- * here (as we do for default ACLs) is a mechanism by which creation of
- * these attrs can be journalled at inode creation time (along with the
- * inode, of course, such that log replay can't cause these to be lost).
- */
-STATIC int
-xfs_init_security(
-       struct inode    *inode,
-       struct inode    *dir,
-       const struct qstr *qstr)
-{
-       struct xfs_inode *ip = XFS_I(inode);
-       size_t          length;
-       void            *value;
-       unsigned char   *name;
-       int             error;
-
-       error = security_inode_init_security(inode, dir, qstr, (char **)&name,
-                                            &value, &length);
-       if (error) {
-               if (error == -EOPNOTSUPP)
-                       return 0;
-               return -error;
-       }
-
-       error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
-
-       kfree(name);
-       kfree(value);
-       return error;
-}
-
-static void
-xfs_dentry_to_name(
-       struct xfs_name *namep,
-       struct dentry   *dentry)
-{
-       namep->name = dentry->d_name.name;
-       namep->len = dentry->d_name.len;
-}
-
-STATIC void
-xfs_cleanup_inode(
-       struct inode    *dir,
-       struct inode    *inode,
-       struct dentry   *dentry)
-{
-       struct xfs_name teardown;
-
-       /* Oh, the horror.
-        * If we can't add the ACL or we fail in
-        * xfs_init_security we must back out.
-        * ENOSPC can hit here, among other things.
-        */
-       xfs_dentry_to_name(&teardown, dentry);
-
-       xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
-       iput(inode);
-}
-
-STATIC int
-xfs_vn_mknod(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       int             mode,
-       dev_t           rdev)
-{
-       struct inode    *inode;
-       struct xfs_inode *ip = NULL;
-       struct posix_acl *default_acl = NULL;
-       struct xfs_name name;
-       int             error;
-
-       /*
-        * Irix uses Missed'em'V split, but doesn't want to see
-        * the upper 5 bits of (14bit) major.
-        */
-       if (S_ISCHR(mode) || S_ISBLK(mode)) {
-               if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
-                       return -EINVAL;
-               rdev = sysv_encode_dev(rdev);
-       } else {
-               rdev = 0;
-       }
-
-       if (IS_POSIXACL(dir)) {
-               default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
-               if (IS_ERR(default_acl))
-                       return PTR_ERR(default_acl);
-
-               if (!default_acl)
-                       mode &= ~current_umask();
-       }
-
-       xfs_dentry_to_name(&name, dentry);
-       error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
-       if (unlikely(error))
-               goto out_free_acl;
-
-       inode = VFS_I(ip);
-
-       error = xfs_init_security(inode, dir, &dentry->d_name);
-       if (unlikely(error))
-               goto out_cleanup_inode;
-
-       if (default_acl) {
-               error = -xfs_inherit_acl(inode, default_acl);
-               default_acl = NULL;
-               if (unlikely(error))
-                       goto out_cleanup_inode;
-       }
-
-
-       d_instantiate(dentry, inode);
-       return -error;
-
- out_cleanup_inode:
-       xfs_cleanup_inode(dir, inode, dentry);
- out_free_acl:
-       posix_acl_release(default_acl);
-       return -error;
-}
-
-STATIC int
-xfs_vn_create(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       int             mode,
-       struct nameidata *nd)
-{
-       return xfs_vn_mknod(dir, dentry, mode, 0);
-}
-
-STATIC int
-xfs_vn_mkdir(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       int             mode)
-{
-       return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
-}
-
-STATIC struct dentry *
-xfs_vn_lookup(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       struct nameidata *nd)
-{
-       struct xfs_inode *cip;
-       struct xfs_name name;
-       int             error;
-
-       if (dentry->d_name.len >= MAXNAMELEN)
-               return ERR_PTR(-ENAMETOOLONG);
-
-       xfs_dentry_to_name(&name, dentry);
-       error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
-       if (unlikely(error)) {
-               if (unlikely(error != ENOENT))
-                       return ERR_PTR(-error);
-               d_add(dentry, NULL);
-               return NULL;
-       }
-
-       return d_splice_alias(VFS_I(cip), dentry);
-}
-
-STATIC struct dentry *
-xfs_vn_ci_lookup(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       struct nameidata *nd)
-{
-       struct xfs_inode *ip;
-       struct xfs_name xname;
-       struct xfs_name ci_name;
-       struct qstr     dname;
-       int             error;
-
-       if (dentry->d_name.len >= MAXNAMELEN)
-               return ERR_PTR(-ENAMETOOLONG);
-
-       xfs_dentry_to_name(&xname, dentry);
-       error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
-       if (unlikely(error)) {
-               if (unlikely(error != ENOENT))
-                       return ERR_PTR(-error);
-               /*
-                * call d_add(dentry, NULL) here when d_drop_negative_children
-                * is called in xfs_vn_mknod (ie. allow negative dentries
-                * with CI filesystems).
-                */
-               return NULL;
-       }
-
-       /* if exact match, just splice and exit */
-       if (!ci_name.name)
-               return d_splice_alias(VFS_I(ip), dentry);
-
-       /* else case-insensitive match... */
-       dname.name = ci_name.name;
-       dname.len = ci_name.len;
-       dentry = d_add_ci(dentry, VFS_I(ip), &dname);
-       kmem_free(ci_name.name);
-       return dentry;
-}
-
-STATIC int
-xfs_vn_link(
-       struct dentry   *old_dentry,
-       struct inode    *dir,
-       struct dentry   *dentry)
-{
-       struct inode    *inode = old_dentry->d_inode;
-       struct xfs_name name;
-       int             error;
-
-       xfs_dentry_to_name(&name, dentry);
-
-       error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
-       if (unlikely(error))
-               return -error;
-
-       ihold(inode);
-       d_instantiate(dentry, inode);
-       return 0;
-}
-
-STATIC int
-xfs_vn_unlink(
-       struct inode    *dir,
-       struct dentry   *dentry)
-{
-       struct xfs_name name;
-       int             error;
-
-       xfs_dentry_to_name(&name, dentry);
-
-       error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
-       if (error)
-               return error;
-
-       /*
-        * With unlink, the VFS makes the dentry "negative": no inode,
-        * but still hashed. This is incompatible with case-insensitive
-        * mode, so invalidate (unhash) the dentry in CI-mode.
-        */
-       if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb))
-               d_invalidate(dentry);
-       return 0;
-}
-
-STATIC int
-xfs_vn_symlink(
-       struct inode    *dir,
-       struct dentry   *dentry,
-       const char      *symname)
-{
-       struct inode    *inode;
-       struct xfs_inode *cip = NULL;
-       struct xfs_name name;
-       int             error;
-       mode_t          mode;
-
-       mode = S_IFLNK |
-               (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
-       xfs_dentry_to_name(&name, dentry);
-
-       error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
-       if (unlikely(error))
-               goto out;
-
-       inode = VFS_I(cip);
-
-       error = xfs_init_security(inode, dir, &dentry->d_name);
-       if (unlikely(error))
-               goto out_cleanup_inode;
-
-       d_instantiate(dentry, inode);
-       return 0;
-
- out_cleanup_inode:
-       xfs_cleanup_inode(dir, inode, dentry);
- out:
-       return -error;
-}
-
-STATIC int
-xfs_vn_rename(
-       struct inode    *odir,
-       struct dentry   *odentry,
-       struct inode    *ndir,
-       struct dentry   *ndentry)
-{
-       struct inode    *new_inode = ndentry->d_inode;
-       struct xfs_name oname;
-       struct xfs_name nname;
-
-       xfs_dentry_to_name(&oname, odentry);
-       xfs_dentry_to_name(&nname, ndentry);
-
-       return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
-                          XFS_I(ndir), &nname, new_inode ?
-                                               XFS_I(new_inode) : NULL);
-}
-
-/*
- * careful here - this function can get called recursively, so
- * we need to be very careful about how much stack we use.
- * uio is kmalloced for this reason...
- */
-STATIC void *
-xfs_vn_follow_link(
-       struct dentry           *dentry,
-       struct nameidata        *nd)
-{
-       char                    *link;
-       int                     error = -ENOMEM;
-
-       link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
-       if (!link)
-               goto out_err;
-
-       error = -xfs_readlink(XFS_I(dentry->d_inode), link);
-       if (unlikely(error))
-               goto out_kfree;
-
-       nd_set_link(nd, link);
-       return NULL;
-
- out_kfree:
-       kfree(link);
- out_err:
-       nd_set_link(nd, ERR_PTR(error));
-       return NULL;
-}
-
-STATIC void
-xfs_vn_put_link(
-       struct dentry   *dentry,
-       struct nameidata *nd,
-       void            *p)
-{
-       char            *s = nd_get_link(nd);
-
-       if (!IS_ERR(s))
-               kfree(s);
-}
-
-STATIC int
-xfs_vn_getattr(
-       struct vfsmount         *mnt,
-       struct dentry           *dentry,
-       struct kstat            *stat)
-{
-       struct inode            *inode = dentry->d_inode;
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-
-       trace_xfs_getattr(ip);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       stat->size = XFS_ISIZE(ip);
-       stat->dev = inode->i_sb->s_dev;
-       stat->mode = ip->i_d.di_mode;
-       stat->nlink = ip->i_d.di_nlink;
-       stat->uid = ip->i_d.di_uid;
-       stat->gid = ip->i_d.di_gid;
-       stat->ino = ip->i_ino;
-       stat->atime = inode->i_atime;
-       stat->mtime = inode->i_mtime;
-       stat->ctime = inode->i_ctime;
-       stat->blocks =
-               XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
-
-
-       switch (inode->i_mode & S_IFMT) {
-       case S_IFBLK:
-       case S_IFCHR:
-               stat->blksize = BLKDEV_IOSIZE;
-               stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
-                                  sysv_minor(ip->i_df.if_u2.if_rdev));
-               break;
-       default:
-               if (XFS_IS_REALTIME_INODE(ip)) {
-                       /*
-                        * If the file blocks are being allocated from a
-                        * realtime volume, then return the inode's realtime
-                        * extent size or the realtime volume's extent size.
-                        */
-                       stat->blksize =
-                               xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
-               } else
-                       stat->blksize = xfs_preferred_iosize(mp);
-               stat->rdev = 0;
-               break;
-       }
-
-       return 0;
-}
-
-int
-xfs_setattr_nonsize(
-       struct xfs_inode        *ip,
-       struct iattr            *iattr,
-       int                     flags)
-{
-       xfs_mount_t             *mp = ip->i_mount;
-       struct inode            *inode = VFS_I(ip);
-       int                     mask = iattr->ia_valid;
-       xfs_trans_t             *tp;
-       int                     error;
-       uid_t                   uid = 0, iuid = 0;
-       gid_t                   gid = 0, igid = 0;
-       struct xfs_dquot        *udqp = NULL, *gdqp = NULL;
-       struct xfs_dquot        *olddquot1 = NULL, *olddquot2 = NULL;
-
-       trace_xfs_setattr(ip);
-
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
-               return XFS_ERROR(EROFS);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       error = -inode_change_ok(inode, iattr);
-       if (error)
-               return XFS_ERROR(error);
-
-       ASSERT((mask & ATTR_SIZE) == 0);
-
-       /*
-        * If disk quotas is on, we make sure that the dquots do exist on disk,
-        * before we start any other transactions. Trying to do this later
-        * is messy. We don't care to take a readlock to look at the ids
-        * in inode here, because we can't hold it across the trans_reserve.
-        * If the IDs do change before we take the ilock, we're covered
-        * because the i_*dquot fields will get updated anyway.
-        */
-       if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
-               uint    qflags = 0;
-
-               if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
-                       uid = iattr->ia_uid;
-                       qflags |= XFS_QMOPT_UQUOTA;
-               } else {
-                       uid = ip->i_d.di_uid;
-               }
-               if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
-                       gid = iattr->ia_gid;
-                       qflags |= XFS_QMOPT_GQUOTA;
-               }  else {
-                       gid = ip->i_d.di_gid;
-               }
-
-               /*
-                * We take a reference when we initialize udqp and gdqp,
-                * so it is important that we never blindly double trip on
-                * the same variable. See xfs_create() for an example.
-                */
-               ASSERT(udqp == NULL);
-               ASSERT(gdqp == NULL);
-               error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
-                                        qflags, &udqp, &gdqp);
-               if (error)
-                       return error;
-       }
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
-       if (error)
-               goto out_dqrele;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-       /*
-        * Change file ownership.  Must be the owner or privileged.
-        */
-       if (mask & (ATTR_UID|ATTR_GID)) {
-               /*
-                * These IDs could have changed since we last looked at them.
-                * But, we're assured that if the ownership did change
-                * while we didn't have the inode locked, inode's dquot(s)
-                * would have changed also.
-                */
-               iuid = ip->i_d.di_uid;
-               igid = ip->i_d.di_gid;
-               gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
-               uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
-
-               /*
-                * Do a quota reservation only if uid/gid is actually
-                * going to change.
-                */
-               if (XFS_IS_QUOTA_RUNNING(mp) &&
-                   ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
-                    (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
-                       ASSERT(tp);
-                       error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
-                                               capable(CAP_FOWNER) ?
-                                               XFS_QMOPT_FORCE_RES : 0);
-                       if (error)      /* out of quota */
-                               goto out_trans_cancel;
-               }
-       }
-
-       xfs_trans_ijoin(tp, ip);
-
-       /*
-        * Change file ownership.  Must be the owner or privileged.
-        */
-       if (mask & (ATTR_UID|ATTR_GID)) {
-               /*
-                * CAP_FSETID overrides the following restrictions:
-                *
-                * The set-user-ID and set-group-ID bits of a file will be
-                * cleared upon successful return from chown()
-                */
-               if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
-                   !capable(CAP_FSETID))
-                       ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
-
-               /*
-                * Change the ownerships and register quota modifications
-                * in the transaction.
-                */
-               if (iuid != uid) {
-                       if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
-                               ASSERT(mask & ATTR_UID);
-                               ASSERT(udqp);
-                               olddquot1 = xfs_qm_vop_chown(tp, ip,
-                                                       &ip->i_udquot, udqp);
-                       }
-                       ip->i_d.di_uid = uid;
-                       inode->i_uid = uid;
-               }
-               if (igid != gid) {
-                       if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
-                               ASSERT(!XFS_IS_PQUOTA_ON(mp));
-                               ASSERT(mask & ATTR_GID);
-                               ASSERT(gdqp);
-                               olddquot2 = xfs_qm_vop_chown(tp, ip,
-                                                       &ip->i_gdquot, gdqp);
-                       }
-                       ip->i_d.di_gid = gid;
-                       inode->i_gid = gid;
-               }
-       }
-
-       /*
-        * Change file access modes.
-        */
-       if (mask & ATTR_MODE) {
-               umode_t mode = iattr->ia_mode;
-
-               if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
-                       mode &= ~S_ISGID;
-
-               ip->i_d.di_mode &= S_IFMT;
-               ip->i_d.di_mode |= mode & ~S_IFMT;
-
-               inode->i_mode &= S_IFMT;
-               inode->i_mode |= mode & ~S_IFMT;
-       }
-
-       /*
-        * Change file access or modified times.
-        */
-       if (mask & ATTR_ATIME) {
-               inode->i_atime = iattr->ia_atime;
-               ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
-               ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
-               ip->i_update_core = 1;
-       }
-       if (mask & ATTR_CTIME) {
-               inode->i_ctime = iattr->ia_ctime;
-               ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
-               ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
-               ip->i_update_core = 1;
-       }
-       if (mask & ATTR_MTIME) {
-               inode->i_mtime = iattr->ia_mtime;
-               ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
-               ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
-               ip->i_update_core = 1;
-       }
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-       XFS_STATS_INC(xs_ig_attrchg);
-
-       if (mp->m_flags & XFS_MOUNT_WSYNC)
-               xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
-
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-       /*
-        * Release any dquot(s) the inode had kept before chown.
-        */
-       xfs_qm_dqrele(olddquot1);
-       xfs_qm_dqrele(olddquot2);
-       xfs_qm_dqrele(udqp);
-       xfs_qm_dqrele(gdqp);
-
-       if (error)
-               return XFS_ERROR(error);
-
-       /*
-        * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
-        *           update.  We could avoid this with linked transactions
-        *           and passing down the transaction pointer all the way
-        *           to attr_set.  No previous user of the generic
-        *           Posix ACL code seems to care about this issue either.
-        */
-       if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
-               error = -xfs_acl_chmod(inode);
-               if (error)
-                       return XFS_ERROR(error);
-       }
-
-       return 0;
-
-out_trans_cancel:
-       xfs_trans_cancel(tp, 0);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-out_dqrele:
-       xfs_qm_dqrele(udqp);
-       xfs_qm_dqrele(gdqp);
-       return error;
-}
-
-/*
- * Truncate file.  Must have write permission and not be a directory.
- */
-int
-xfs_setattr_size(
-       struct xfs_inode        *ip,
-       struct iattr            *iattr,
-       int                     flags)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct inode            *inode = VFS_I(ip);
-       int                     mask = iattr->ia_valid;
-       struct xfs_trans        *tp;
-       int                     error;
-       uint                    lock_flags;
-       uint                    commit_flags = 0;
-
-       trace_xfs_setattr(ip);
-
-       if (mp->m_flags & XFS_MOUNT_RDONLY)
-               return XFS_ERROR(EROFS);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       error = -inode_change_ok(inode, iattr);
-       if (error)
-               return XFS_ERROR(error);
-
-       ASSERT(S_ISREG(ip->i_d.di_mode));
-       ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
-                       ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
-                       ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
-
-       lock_flags = XFS_ILOCK_EXCL;
-       if (!(flags & XFS_ATTR_NOLOCK))
-               lock_flags |= XFS_IOLOCK_EXCL;
-       xfs_ilock(ip, lock_flags);
-
-       /*
-        * Short circuit the truncate case for zero length files.
-        */
-       if (iattr->ia_size == 0 &&
-           ip->i_size == 0 && ip->i_d.di_nextents == 0) {
-               if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
-                       goto out_unlock;
-
-               /*
-                * Use the regular setattr path to update the timestamps.
-                */
-               xfs_iunlock(ip, lock_flags);
-               iattr->ia_valid &= ~ATTR_SIZE;
-               return xfs_setattr_nonsize(ip, iattr, 0);
-       }
-
-       /*
-        * Make sure that the dquots are attached to the inode.
-        */
-       error = xfs_qm_dqattach_locked(ip, 0);
-       if (error)
-               goto out_unlock;
-
-       /*
-        * Now we can make the changes.  Before we join the inode to the
-        * transaction, take care of the part of the truncation that must be
-        * done without the inode lock.  This needs to be done before joining
-        * the inode to the transaction, because the inode cannot be unlocked
-        * once it is a part of the transaction.
-        */
-       if (iattr->ia_size > ip->i_size) {
-               /*
-                * Do the first part of growing a file: zero any data in the
-                * last block that is beyond the old EOF.  We need to do this
-                * before the inode is joined to the transaction to modify
-                * i_size.
-                */
-               error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
-               if (error)
-                       goto out_unlock;
-       }
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       lock_flags &= ~XFS_ILOCK_EXCL;
-
-       /*
-        * We are going to log the inode size change in this transaction so
-        * any previous writes that are beyond the on disk EOF and the new
-        * EOF that have not been written out need to be written here.  If we
-        * do not write the data out, we expose ourselves to the null files
-        * problem.
-        *
-        * Only flush from the on disk size to the smaller of the in memory
-        * file size or the new size as that's the range we really care about
-        * here and prevents waiting for other data not within the range we
-        * care about here.
-        */
-       if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
-               error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size,
-                                       XBF_ASYNC, FI_NONE);
-               if (error)
-                       goto out_unlock;
-       }
-
-       /*
-        * Wait for all I/O to complete.
-        */
-       xfs_ioend_wait(ip);
-
-       error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
-                                    xfs_get_blocks);
-       if (error)
-               goto out_unlock;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
-       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                XFS_TRANS_PERM_LOG_RES,
-                                XFS_ITRUNCATE_LOG_COUNT);
-       if (error)
-               goto out_trans_cancel;
-
-       truncate_setsize(inode, iattr->ia_size);
-
-       commit_flags = XFS_TRANS_RELEASE_LOG_RES;
-       lock_flags |= XFS_ILOCK_EXCL;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-       xfs_trans_ijoin(tp, ip);
-
-       /*
-        * Only change the c/mtime if we are changing the size or we are
-        * explicitly asked to change it.  This handles the semantic difference
-        * between truncate() and ftruncate() as implemented in the VFS.
-        *
-        * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
-        * special case where we need to update the times despite not having
-        * these flags set.  For all other operations the VFS set these flags
-        * explicitly if it wants a timestamp update.
-        */
-       if (iattr->ia_size != ip->i_size &&
-           (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
-               iattr->ia_ctime = iattr->ia_mtime =
-                       current_fs_time(inode->i_sb);
-               mask |= ATTR_CTIME | ATTR_MTIME;
-       }
-
-       if (iattr->ia_size > ip->i_size) {
-               ip->i_d.di_size = iattr->ia_size;
-               ip->i_size = iattr->ia_size;
-       } else if (iattr->ia_size <= ip->i_size ||
-                  (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
-               error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
-               if (error)
-                       goto out_trans_abort;
-
-               /*
-                * Truncated "down", so we're removing references to old data
-                * here - if we delay flushing for a long time, we expose
-                * ourselves unduly to the notorious NULL files problem.  So,
-                * we mark this inode and flush it when the file is closed,
-                * and do not wait the usual (long) time for writeout.
-                */
-               xfs_iflags_set(ip, XFS_ITRUNCATED);
-       }
-
-       if (mask & ATTR_CTIME) {
-               inode->i_ctime = iattr->ia_ctime;
-               ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
-               ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
-               ip->i_update_core = 1;
-       }
-       if (mask & ATTR_MTIME) {
-               inode->i_mtime = iattr->ia_mtime;
-               ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
-               ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
-               ip->i_update_core = 1;
-       }
-
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-       XFS_STATS_INC(xs_ig_attrchg);
-
-       if (mp->m_flags & XFS_MOUNT_WSYNC)
-               xfs_trans_set_sync(tp);
-
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-out_unlock:
-       if (lock_flags)
-               xfs_iunlock(ip, lock_flags);
-       return error;
-
-out_trans_abort:
-       commit_flags |= XFS_TRANS_ABORT;
-out_trans_cancel:
-       xfs_trans_cancel(tp, commit_flags);
-       goto out_unlock;
-}
-
-STATIC int
-xfs_vn_setattr(
-       struct dentry   *dentry,
-       struct iattr    *iattr)
-{
-       if (iattr->ia_valid & ATTR_SIZE)
-               return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0);
-       return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0);
-}
-
-#define XFS_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
-
-/*
- * Call fiemap helper to fill in user data.
- * Returns positive errors to xfs_getbmap.
- */
-STATIC int
-xfs_fiemap_format(
-       void                    **arg,
-       struct getbmapx         *bmv,
-       int                     *full)
-{
-       int                     error;
-       struct fiemap_extent_info *fieinfo = *arg;
-       u32                     fiemap_flags = 0;
-       u64                     logical, physical, length;
-
-       /* Do nothing for a hole */
-       if (bmv->bmv_block == -1LL)
-               return 0;
-
-       logical = BBTOB(bmv->bmv_offset);
-       physical = BBTOB(bmv->bmv_block);
-       length = BBTOB(bmv->bmv_length);
-
-       if (bmv->bmv_oflags & BMV_OF_PREALLOC)
-               fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
-       else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
-               fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
-               physical = 0;   /* no block yet */
-       }
-       if (bmv->bmv_oflags & BMV_OF_LAST)
-               fiemap_flags |= FIEMAP_EXTENT_LAST;
-
-       error = fiemap_fill_next_extent(fieinfo, logical, physical,
-                                       length, fiemap_flags);
-       if (error > 0) {
-               error = 0;
-               *full = 1;      /* user array now full */
-       }
-
-       return -error;
-}
-
-STATIC int
-xfs_vn_fiemap(
-       struct inode            *inode,
-       struct fiemap_extent_info *fieinfo,
-       u64                     start,
-       u64                     length)
-{
-       xfs_inode_t             *ip = XFS_I(inode);
-       struct getbmapx         bm;
-       int                     error;
-
-       error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
-       if (error)
-               return error;
-
-       /* Set up bmap header for xfs internal routine */
-       bm.bmv_offset = BTOBB(start);
-       /* Special case for whole file */
-       if (length == FIEMAP_MAX_OFFSET)
-               bm.bmv_length = -1LL;
-       else
-               bm.bmv_length = BTOBB(length);
-
-       /* We add one because in getbmap world count includes the header */
-       bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
-                                       fieinfo->fi_extents_max + 1;
-       bm.bmv_count = min_t(__s32, bm.bmv_count,
-                            (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-       bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
-       if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
-               bm.bmv_iflags |= BMV_IF_ATTRFORK;
-       if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
-               bm.bmv_iflags |= BMV_IF_DELALLOC;
-
-       error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
-       if (error)
-               return -error;
-
-       return 0;
-}
-
-static const struct inode_operations xfs_inode_operations = {
-       .get_acl                = xfs_get_acl,
-       .getattr                = xfs_vn_getattr,
-       .setattr                = xfs_vn_setattr,
-       .setxattr               = generic_setxattr,
-       .getxattr               = generic_getxattr,
-       .removexattr            = generic_removexattr,
-       .listxattr              = xfs_vn_listxattr,
-       .fiemap                 = xfs_vn_fiemap,
-};
-
-static const struct inode_operations xfs_dir_inode_operations = {
-       .create                 = xfs_vn_create,
-       .lookup                 = xfs_vn_lookup,
-       .link                   = xfs_vn_link,
-       .unlink                 = xfs_vn_unlink,
-       .symlink                = xfs_vn_symlink,
-       .mkdir                  = xfs_vn_mkdir,
-       /*
-        * Yes, XFS uses the same method for rmdir and unlink.
-        *
-        * There are some subtile differences deeper in the code,
-        * but we use S_ISDIR to check for those.
-        */
-       .rmdir                  = xfs_vn_unlink,
-       .mknod                  = xfs_vn_mknod,
-       .rename                 = xfs_vn_rename,
-       .get_acl                = xfs_get_acl,
-       .getattr                = xfs_vn_getattr,
-       .setattr                = xfs_vn_setattr,
-       .setxattr               = generic_setxattr,
-       .getxattr               = generic_getxattr,
-       .removexattr            = generic_removexattr,
-       .listxattr              = xfs_vn_listxattr,
-};
-
-static const struct inode_operations xfs_dir_ci_inode_operations = {
-       .create                 = xfs_vn_create,
-       .lookup                 = xfs_vn_ci_lookup,
-       .link                   = xfs_vn_link,
-       .unlink                 = xfs_vn_unlink,
-       .symlink                = xfs_vn_symlink,
-       .mkdir                  = xfs_vn_mkdir,
-       /*
-        * Yes, XFS uses the same method for rmdir and unlink.
-        *
-        * There are some subtile differences deeper in the code,
-        * but we use S_ISDIR to check for those.
-        */
-       .rmdir                  = xfs_vn_unlink,
-       .mknod                  = xfs_vn_mknod,
-       .rename                 = xfs_vn_rename,
-       .get_acl                = xfs_get_acl,
-       .getattr                = xfs_vn_getattr,
-       .setattr                = xfs_vn_setattr,
-       .setxattr               = generic_setxattr,
-       .getxattr               = generic_getxattr,
-       .removexattr            = generic_removexattr,
-       .listxattr              = xfs_vn_listxattr,
-};
-
-static const struct inode_operations xfs_symlink_inode_operations = {
-       .readlink               = generic_readlink,
-       .follow_link            = xfs_vn_follow_link,
-       .put_link               = xfs_vn_put_link,
-       .get_acl                = xfs_get_acl,
-       .getattr                = xfs_vn_getattr,
-       .setattr                = xfs_vn_setattr,
-       .setxattr               = generic_setxattr,
-       .getxattr               = generic_getxattr,
-       .removexattr            = generic_removexattr,
-       .listxattr              = xfs_vn_listxattr,
-};
-
-STATIC void
-xfs_diflags_to_iflags(
-       struct inode            *inode,
-       struct xfs_inode        *ip)
-{
-       if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
-               inode->i_flags |= S_IMMUTABLE;
-       else
-               inode->i_flags &= ~S_IMMUTABLE;
-       if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
-               inode->i_flags |= S_APPEND;
-       else
-               inode->i_flags &= ~S_APPEND;
-       if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
-               inode->i_flags |= S_SYNC;
-       else
-               inode->i_flags &= ~S_SYNC;
-       if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
-               inode->i_flags |= S_NOATIME;
-       else
-               inode->i_flags &= ~S_NOATIME;
-}
-
-/*
- * Initialize the Linux inode, set up the operation vectors and
- * unlock the inode.
- *
- * When reading existing inodes from disk this is called directly
- * from xfs_iget, when creating a new inode it is called from
- * xfs_ialloc after setting up the inode.
- *
- * We are always called with an uninitialised linux inode here.
- * We need to initialise the necessary fields and take a reference
- * on it.
- */
-void
-xfs_setup_inode(
-       struct xfs_inode        *ip)
-{
-       struct inode            *inode = &ip->i_vnode;
-
-       inode->i_ino = ip->i_ino;
-       inode->i_state = I_NEW;
-
-       inode_sb_list_add(inode);
-       /* make the inode look hashed for the writeback code */
-       hlist_add_fake(&inode->i_hash);
-
-       inode->i_mode   = ip->i_d.di_mode;
-       inode->i_nlink  = ip->i_d.di_nlink;
-       inode->i_uid    = ip->i_d.di_uid;
-       inode->i_gid    = ip->i_d.di_gid;
-
-       switch (inode->i_mode & S_IFMT) {
-       case S_IFBLK:
-       case S_IFCHR:
-               inode->i_rdev =
-                       MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
-                             sysv_minor(ip->i_df.if_u2.if_rdev));
-               break;
-       default:
-               inode->i_rdev = 0;
-               break;
-       }
-
-       inode->i_generation = ip->i_d.di_gen;
-       i_size_write(inode, ip->i_d.di_size);
-       inode->i_atime.tv_sec   = ip->i_d.di_atime.t_sec;
-       inode->i_atime.tv_nsec  = ip->i_d.di_atime.t_nsec;
-       inode->i_mtime.tv_sec   = ip->i_d.di_mtime.t_sec;
-       inode->i_mtime.tv_nsec  = ip->i_d.di_mtime.t_nsec;
-       inode->i_ctime.tv_sec   = ip->i_d.di_ctime.t_sec;
-       inode->i_ctime.tv_nsec  = ip->i_d.di_ctime.t_nsec;
-       xfs_diflags_to_iflags(inode, ip);
-
-       switch (inode->i_mode & S_IFMT) {
-       case S_IFREG:
-               inode->i_op = &xfs_inode_operations;
-               inode->i_fop = &xfs_file_operations;
-               inode->i_mapping->a_ops = &xfs_address_space_operations;
-               break;
-       case S_IFDIR:
-               if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
-                       inode->i_op = &xfs_dir_ci_inode_operations;
-               else
-                       inode->i_op = &xfs_dir_inode_operations;
-               inode->i_fop = &xfs_dir_file_operations;
-               break;
-       case S_IFLNK:
-               inode->i_op = &xfs_symlink_inode_operations;
-               if (!(ip->i_df.if_flags & XFS_IFINLINE))
-                       inode->i_mapping->a_ops = &xfs_address_space_operations;
-               break;
-       default:
-               inode->i_op = &xfs_inode_operations;
-               init_special_inode(inode, inode->i_mode, inode->i_rdev);
-               break;
-       }
-
-       /*
-        * If there is no attribute fork no ACL can exist on this inode,
-        * and it can't have any file capabilities attached to it either.
-        */
-       if (!XFS_IFORK_Q(ip)) {
-               inode_has_no_xattr(inode);
-               cache_no_acl(inode);
-       }
-
-       xfs_iflags_clear(ip, XFS_INEW);
-       barrier();
-
-       unlock_new_inode(inode);
-}
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
deleted file mode 100644 (file)
index ef41c92..0000000
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_IOPS_H__
-#define __XFS_IOPS_H__
-
-struct xfs_inode;
-
-extern const struct file_operations xfs_file_operations;
-extern const struct file_operations xfs_dir_file_operations;
-
-extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
-
-extern void xfs_setup_inode(struct xfs_inode *);
-
-#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
deleted file mode 100644 (file)
index d42f814..0000000
+++ /dev/null
@@ -1,310 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_LINUX__
-#define __XFS_LINUX__
-
-#include <linux/types.h>
-
-/*
- * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
- * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
- */
-#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
-# define XFS_BIG_BLKNOS        1
-# define XFS_BIG_INUMS 1
-#else
-# define XFS_BIG_BLKNOS        0
-# define XFS_BIG_INUMS 0
-#endif
-
-#include <xfs_types.h>
-
-#include <kmem.h>
-#include <mrlock.h>
-#include <time.h>
-
-#include <support/uuid.h>
-
-#include <linux/semaphore.h>
-#include <linux/mm.h>
-#include <linux/kernel.h>
-#include <linux/blkdev.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/file.h>
-#include <linux/swap.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/bitops.h>
-#include <linux/major.h>
-#include <linux/pagemap.h>
-#include <linux/vfs.h>
-#include <linux/seq_file.h>
-#include <linux/init.h>
-#include <linux/list.h>
-#include <linux/proc_fs.h>
-#include <linux/sort.h>
-#include <linux/cpu.h>
-#include <linux/notifier.h>
-#include <linux/delay.h>
-#include <linux/log2.h>
-#include <linux/spinlock.h>
-#include <linux/random.h>
-#include <linux/ctype.h>
-#include <linux/writeback.h>
-#include <linux/capability.h>
-#include <linux/list_sort.h>
-
-#include <asm/page.h>
-#include <asm/div64.h>
-#include <asm/param.h>
-#include <asm/uaccess.h>
-#include <asm/byteorder.h>
-#include <asm/unaligned.h>
-
-#include <xfs_vnode.h>
-#include <xfs_stats.h>
-#include <xfs_sysctl.h>
-#include <xfs_iops.h>
-#include <xfs_aops.h>
-#include <xfs_super.h>
-#include <xfs_buf.h>
-#include <xfs_message.h>
-
-#ifdef __BIG_ENDIAN
-#define XFS_NATIVE_HOST 1
-#else
-#undef XFS_NATIVE_HOST
-#endif
-
-/*
- * Feature macros (disable/enable)
- */
-#ifdef CONFIG_SMP
-#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
-#else
-#undef  HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
-#endif
-
-#define irix_sgid_inherit      xfs_params.sgid_inherit.val
-#define irix_symlink_mode      xfs_params.symlink_mode.val
-#define xfs_panic_mask         xfs_params.panic_mask.val
-#define xfs_error_level                xfs_params.error_level.val
-#define xfs_syncd_centisecs    xfs_params.syncd_timer.val
-#define xfs_stats_clear                xfs_params.stats_clear.val
-#define xfs_inherit_sync       xfs_params.inherit_sync.val
-#define xfs_inherit_nodump     xfs_params.inherit_nodump.val
-#define xfs_inherit_noatime    xfs_params.inherit_noatim.val
-#define xfs_buf_timer_centisecs        xfs_params.xfs_buf_timer.val
-#define xfs_buf_age_centisecs  xfs_params.xfs_buf_age.val
-#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
-#define xfs_rotorstep          xfs_params.rotorstep.val
-#define xfs_inherit_nodefrag   xfs_params.inherit_nodfrg.val
-#define xfs_fstrm_centisecs    xfs_params.fstrm_timer.val
-
-#define current_cpu()          (raw_smp_processor_id())
-#define current_pid()          (current->pid)
-#define current_test_flags(f)  (current->flags & (f))
-#define current_set_flags_nested(sp, f)                \
-               (*(sp) = current->flags, current->flags |= (f))
-#define current_clear_flags_nested(sp, f)      \
-               (*(sp) = current->flags, current->flags &= ~(f))
-#define current_restore_flags_nested(sp, f)    \
-               (current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
-
-#define spinlock_destroy(lock)
-
-#define NBBY           8               /* number of bits per byte */
-
-/*
- * Size of block device i/o is parameterized here.
- * Currently the system supports page-sized i/o.
- */
-#define        BLKDEV_IOSHIFT          PAGE_CACHE_SHIFT
-#define        BLKDEV_IOSIZE           (1<<BLKDEV_IOSHIFT)
-/* number of BB's per block device block */
-#define        BLKDEV_BB               BTOBB(BLKDEV_IOSIZE)
-
-#define ENOATTR                ENODATA         /* Attribute not found */
-#define EWRONGFS       EINVAL          /* Mount with wrong filesystem type */
-#define EFSCORRUPTED   EUCLEAN         /* Filesystem is corrupted */
-
-#define SYNCHRONIZE()  barrier()
-#define __return_address __builtin_return_address(0)
-
-#define XFS_PROJID_DEFAULT     0
-#define MAXPATHLEN     1024
-
-#define MIN(a,b)       (min(a,b))
-#define MAX(a,b)       (max(a,b))
-#define howmany(x, y)  (((x)+((y)-1))/(y))
-
-/*
- * Various platform dependent calls that don't fit anywhere else
- */
-#define xfs_sort(a,n,s,fn)     sort(a,n,s,fn,NULL)
-#define xfs_stack_trace()      dump_stack()
-
-
-/* Move the kernel do_div definition off to one side */
-
-#if defined __i386__
-/* For ia32 we need to pull some tricks to get past various versions
- * of the compiler which do not like us using do_div in the middle
- * of large functions.
- */
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
-       __u32   mod;
-
-       switch (n) {
-               case 4:
-                       mod = *(__u32 *)a % b;
-                       *(__u32 *)a = *(__u32 *)a / b;
-                       return mod;
-               case 8:
-                       {
-                       unsigned long __upper, __low, __high, __mod;
-                       __u64   c = *(__u64 *)a;
-                       __upper = __high = c >> 32;
-                       __low = c;
-                       if (__high) {
-                               __upper = __high % (b);
-                               __high = __high / (b);
-                       }
-                       asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
-                       asm("":"=A" (c):"a" (__low),"d" (__high));
-                       *(__u64 *)a = c;
-                       return __mod;
-                       }
-       }
-
-       /* NOTREACHED */
-       return 0;
-}
-
-/* Side effect free 64 bit mod operation */
-static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
-{
-       switch (n) {
-               case 4:
-                       return *(__u32 *)a % b;
-               case 8:
-                       {
-                       unsigned long __upper, __low, __high, __mod;
-                       __u64   c = *(__u64 *)a;
-                       __upper = __high = c >> 32;
-                       __low = c;
-                       if (__high) {
-                               __upper = __high % (b);
-                               __high = __high / (b);
-                       }
-                       asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
-                       asm("":"=A" (c):"a" (__low),"d" (__high));
-                       return __mod;
-                       }
-       }
-
-       /* NOTREACHED */
-       return 0;
-}
-#else
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
-       __u32   mod;
-
-       switch (n) {
-               case 4:
-                       mod = *(__u32 *)a % b;
-                       *(__u32 *)a = *(__u32 *)a / b;
-                       return mod;
-               case 8:
-                       mod = do_div(*(__u64 *)a, b);
-                       return mod;
-       }
-
-       /* NOTREACHED */
-       return 0;
-}
-
-/* Side effect free 64 bit mod operation */
-static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
-{
-       switch (n) {
-               case 4:
-                       return *(__u32 *)a % b;
-               case 8:
-                       {
-                       __u64   c = *(__u64 *)a;
-                       return do_div(c, b);
-                       }
-       }
-
-       /* NOTREACHED */
-       return 0;
-}
-#endif
-
-#undef do_div
-#define do_div(a, b)   xfs_do_div(&(a), (b), sizeof(a))
-#define do_mod(a, b)   xfs_do_mod(&(a), (b), sizeof(a))
-
-static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
-{
-       x += y - 1;
-       do_div(x, y);
-       return(x * y);
-}
-
-static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
-{
-       x += y - 1;
-       do_div(x, y);
-       return x;
-}
-
-/* ARM old ABI has some weird alignment/padding */
-#if defined(__arm__) && !defined(__ARM_EABI__)
-#define __arch_pack __attribute__((packed))
-#else
-#define __arch_pack
-#endif
-
-#define ASSERT_ALWAYS(expr)    \
-       (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-
-#ifndef DEBUG
-#define ASSERT(expr)   ((void)0)
-
-#ifndef STATIC
-# define STATIC static noinline
-#endif
-
-#else /* DEBUG */
-
-#define ASSERT(expr)   \
-       (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
-
-#ifndef STATIC
-# define STATIC noinline
-#endif
-
-#endif /* DEBUG */
-
-#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c
deleted file mode 100644 (file)
index bd672de..0000000
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2011 Red Hat, Inc.  All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-
-/*
- * XFS logging functions
- */
-static void
-__xfs_printk(
-       const char              *level,
-       const struct xfs_mount  *mp,
-       struct va_format        *vaf)
-{
-       if (mp && mp->m_fsname) {
-               printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
-               return;
-       }
-       printk("%sXFS: %pV\n", level, vaf);
-}
-
-#define define_xfs_printk_level(func, kern_level)              \
-void func(const struct xfs_mount *mp, const char *fmt, ...)    \
-{                                                              \
-       struct va_format        vaf;                            \
-       va_list                 args;                           \
-                                                               \
-       va_start(args, fmt);                                    \
-                                                               \
-       vaf.fmt = fmt;                                          \
-       vaf.va = &args;                                         \
-                                                               \
-       __xfs_printk(kern_level, mp, &vaf);                     \
-       va_end(args);                                           \
-}                                                              \
-
-define_xfs_printk_level(xfs_emerg, KERN_EMERG);
-define_xfs_printk_level(xfs_alert, KERN_ALERT);
-define_xfs_printk_level(xfs_crit, KERN_CRIT);
-define_xfs_printk_level(xfs_err, KERN_ERR);
-define_xfs_printk_level(xfs_warn, KERN_WARNING);
-define_xfs_printk_level(xfs_notice, KERN_NOTICE);
-define_xfs_printk_level(xfs_info, KERN_INFO);
-#ifdef DEBUG
-define_xfs_printk_level(xfs_debug, KERN_DEBUG);
-#endif
-
-void
-xfs_alert_tag(
-       const struct xfs_mount  *mp,
-       int                     panic_tag,
-       const char              *fmt, ...)
-{
-       struct va_format        vaf;
-       va_list                 args;
-       int                     do_panic = 0;
-
-       if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
-               xfs_alert(mp, "Transforming an alert into a BUG.");
-               do_panic = 1;
-       }
-
-       va_start(args, fmt);
-
-       vaf.fmt = fmt;
-       vaf.va = &args;
-
-       __xfs_printk(KERN_ALERT, mp, &vaf);
-       va_end(args);
-
-       BUG_ON(do_panic);
-}
-
-void
-assfail(char *expr, char *file, int line)
-{
-       xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
-               expr, file, line);
-       BUG();
-}
-
-void
-xfs_hex_dump(void *p, int length)
-{
-       print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
-}
diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h
deleted file mode 100644 (file)
index 7fb7ea0..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __XFS_MESSAGE_H
-#define __XFS_MESSAGE_H 1
-
-struct xfs_mount;
-
-extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
-        __attribute__ ((format (printf, 2, 3)));
-extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
-        __attribute__ ((format (printf, 2, 3)));
-extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
-                        const char *fmt, ...)
-        __attribute__ ((format (printf, 3, 4)));
-extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
-        __attribute__ ((format (printf, 2, 3)));
-extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
-        __attribute__ ((format (printf, 2, 3)));
-extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
-        __attribute__ ((format (printf, 2, 3)));
-extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
-        __attribute__ ((format (printf, 2, 3)));
-extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
-        __attribute__ ((format (printf, 2, 3)));
-
-#ifdef DEBUG
-extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
-        __attribute__ ((format (printf, 2, 3)));
-#else
-static inline void
-__attribute__ ((format (printf, 2, 3)))
-xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
-{
-}
-#endif
-
-extern void assfail(char *expr, char *f, int l);
-
-extern void xfs_hex_dump(void *p, int length);
-
-#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
deleted file mode 100644 (file)
index 29b9d64..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2008, Christoph Hellwig
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_sb.h"
-#include "xfs_inum.h"
-#include "xfs_log.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_quota.h"
-#include "xfs_trans.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "quota/xfs_qm.h"
-#include <linux/quota.h>
-
-
-STATIC int
-xfs_quota_type(int type)
-{
-       switch (type) {
-       case USRQUOTA:
-               return XFS_DQ_USER;
-       case GRPQUOTA:
-               return XFS_DQ_GROUP;
-       default:
-               return XFS_DQ_PROJ;
-       }
-}
-
-STATIC int
-xfs_fs_get_xstate(
-       struct super_block      *sb,
-       struct fs_quota_stat    *fqs)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-
-       if (!XFS_IS_QUOTA_RUNNING(mp))
-               return -ENOSYS;
-       return -xfs_qm_scall_getqstat(mp, fqs);
-}
-
-STATIC int
-xfs_fs_set_xstate(
-       struct super_block      *sb,
-       unsigned int            uflags,
-       int                     op)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-       unsigned int            flags = 0;
-
-       if (sb->s_flags & MS_RDONLY)
-               return -EROFS;
-       if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
-               return -ENOSYS;
-
-       if (uflags & FS_QUOTA_UDQ_ACCT)
-               flags |= XFS_UQUOTA_ACCT;
-       if (uflags & FS_QUOTA_PDQ_ACCT)
-               flags |= XFS_PQUOTA_ACCT;
-       if (uflags & FS_QUOTA_GDQ_ACCT)
-               flags |= XFS_GQUOTA_ACCT;
-       if (uflags & FS_QUOTA_UDQ_ENFD)
-               flags |= XFS_UQUOTA_ENFD;
-       if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
-               flags |= XFS_OQUOTA_ENFD;
-
-       switch (op) {
-       case Q_XQUOTAON:
-               return -xfs_qm_scall_quotaon(mp, flags);
-       case Q_XQUOTAOFF:
-               if (!XFS_IS_QUOTA_ON(mp))
-                       return -EINVAL;
-               return -xfs_qm_scall_quotaoff(mp, flags);
-       case Q_XQUOTARM:
-               if (XFS_IS_QUOTA_ON(mp))
-                       return -EINVAL;
-               return -xfs_qm_scall_trunc_qfiles(mp, flags);
-       }
-
-       return -EINVAL;
-}
-
-STATIC int
-xfs_fs_get_dqblk(
-       struct super_block      *sb,
-       int                     type,
-       qid_t                   id,
-       struct fs_disk_quota    *fdq)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-
-       if (!XFS_IS_QUOTA_RUNNING(mp))
-               return -ENOSYS;
-       if (!XFS_IS_QUOTA_ON(mp))
-               return -ESRCH;
-
-       return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
-}
-
-STATIC int
-xfs_fs_set_dqblk(
-       struct super_block      *sb,
-       int                     type,
-       qid_t                   id,
-       struct fs_disk_quota    *fdq)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-
-       if (sb->s_flags & MS_RDONLY)
-               return -EROFS;
-       if (!XFS_IS_QUOTA_RUNNING(mp))
-               return -ENOSYS;
-       if (!XFS_IS_QUOTA_ON(mp))
-               return -ESRCH;
-
-       return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
-}
-
-const struct quotactl_ops xfs_quotactl_operations = {
-       .get_xstate             = xfs_fs_get_xstate,
-       .set_xstate             = xfs_fs_set_xstate,
-       .get_dqblk              = xfs_fs_get_dqblk,
-       .set_dqblk              = xfs_fs_set_dqblk,
-};
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
deleted file mode 100644 (file)
index 76fdc58..0000000
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include <linux/proc_fs.h>
-
-DEFINE_PER_CPU(struct xfsstats, xfsstats);
-
-static int xfs_stat_proc_show(struct seq_file *m, void *v)
-{
-       int             c, i, j, val;
-       __uint64_t      xs_xstrat_bytes = 0;
-       __uint64_t      xs_write_bytes = 0;
-       __uint64_t      xs_read_bytes = 0;
-
-       static const struct xstats_entry {
-               char    *desc;
-               int     endpoint;
-       } xstats[] = {
-               { "extent_alloc",       XFSSTAT_END_EXTENT_ALLOC        },
-               { "abt",                XFSSTAT_END_ALLOC_BTREE         },
-               { "blk_map",            XFSSTAT_END_BLOCK_MAPPING       },
-               { "bmbt",               XFSSTAT_END_BLOCK_MAP_BTREE     },
-               { "dir",                XFSSTAT_END_DIRECTORY_OPS       },
-               { "trans",              XFSSTAT_END_TRANSACTIONS        },
-               { "ig",                 XFSSTAT_END_INODE_OPS           },
-               { "log",                XFSSTAT_END_LOG_OPS             },
-               { "push_ail",           XFSSTAT_END_TAIL_PUSHING        },
-               { "xstrat",             XFSSTAT_END_WRITE_CONVERT       },
-               { "rw",                 XFSSTAT_END_READ_WRITE_OPS      },
-               { "attr",               XFSSTAT_END_ATTRIBUTE_OPS       },
-               { "icluster",           XFSSTAT_END_INODE_CLUSTER       },
-               { "vnodes",             XFSSTAT_END_VNODE_OPS           },
-               { "buf",                XFSSTAT_END_BUF                 },
-               { "abtb2",              XFSSTAT_END_ABTB_V2             },
-               { "abtc2",              XFSSTAT_END_ABTC_V2             },
-               { "bmbt2",              XFSSTAT_END_BMBT_V2             },
-               { "ibt2",               XFSSTAT_END_IBT_V2              },
-       };
-
-       /* Loop over all stats groups */
-       for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
-               seq_printf(m, "%s", xstats[i].desc);
-               /* inner loop does each group */
-               while (j < xstats[i].endpoint) {
-                       val = 0;
-                       /* sum over all cpus */
-                       for_each_possible_cpu(c)
-                               val += *(((__u32*)&per_cpu(xfsstats, c) + j));
-                       seq_printf(m, " %u", val);
-                       j++;
-               }
-               seq_putc(m, '\n');
-       }
-       /* extra precision counters */
-       for_each_possible_cpu(i) {
-               xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
-               xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
-               xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
-       }
-
-       seq_printf(m, "xpc %Lu %Lu %Lu\n",
-                       xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
-       seq_printf(m, "debug %u\n",
-#if defined(DEBUG)
-               1);
-#else
-               0);
-#endif
-       return 0;
-}
-
-static int xfs_stat_proc_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, xfs_stat_proc_show, NULL);
-}
-
-static const struct file_operations xfs_stat_proc_fops = {
-       .owner          = THIS_MODULE,
-       .open           = xfs_stat_proc_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-int
-xfs_init_procfs(void)
-{
-       if (!proc_mkdir("fs/xfs", NULL))
-               goto out;
-
-       if (!proc_create("fs/xfs/stat", 0, NULL,
-                        &xfs_stat_proc_fops))
-               goto out_remove_entry;
-       return 0;
-
- out_remove_entry:
-       remove_proc_entry("fs/xfs", NULL);
- out:
-       return -ENOMEM;
-}
-
-void
-xfs_cleanup_procfs(void)
-{
-       remove_proc_entry("fs/xfs/stat", NULL);
-       remove_proc_entry("fs/xfs", NULL);
-}
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
deleted file mode 100644 (file)
index 736854b..0000000
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_STATS_H__
-#define __XFS_STATS_H__
-
-
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
-#include <linux/percpu.h>
-
-/*
- * XFS global statistics
- */
-struct xfsstats {
-# define XFSSTAT_END_EXTENT_ALLOC      4
-       __uint32_t              xs_allocx;
-       __uint32_t              xs_allocb;
-       __uint32_t              xs_freex;
-       __uint32_t              xs_freeb;
-# define XFSSTAT_END_ALLOC_BTREE       (XFSSTAT_END_EXTENT_ALLOC+4)
-       __uint32_t              xs_abt_lookup;
-       __uint32_t              xs_abt_compare;
-       __uint32_t              xs_abt_insrec;
-       __uint32_t              xs_abt_delrec;
-# define XFSSTAT_END_BLOCK_MAPPING     (XFSSTAT_END_ALLOC_BTREE+7)
-       __uint32_t              xs_blk_mapr;
-       __uint32_t              xs_blk_mapw;
-       __uint32_t              xs_blk_unmap;
-       __uint32_t              xs_add_exlist;
-       __uint32_t              xs_del_exlist;
-       __uint32_t              xs_look_exlist;
-       __uint32_t              xs_cmp_exlist;
-# define XFSSTAT_END_BLOCK_MAP_BTREE   (XFSSTAT_END_BLOCK_MAPPING+4)
-       __uint32_t              xs_bmbt_lookup;
-       __uint32_t              xs_bmbt_compare;
-       __uint32_t              xs_bmbt_insrec;
-       __uint32_t              xs_bmbt_delrec;
-# define XFSSTAT_END_DIRECTORY_OPS     (XFSSTAT_END_BLOCK_MAP_BTREE+4)
-       __uint32_t              xs_dir_lookup;
-       __uint32_t              xs_dir_create;
-       __uint32_t              xs_dir_remove;
-       __uint32_t              xs_dir_getdents;
-# define XFSSTAT_END_TRANSACTIONS      (XFSSTAT_END_DIRECTORY_OPS+3)
-       __uint32_t              xs_trans_sync;
-       __uint32_t              xs_trans_async;
-       __uint32_t              xs_trans_empty;
-# define XFSSTAT_END_INODE_OPS         (XFSSTAT_END_TRANSACTIONS+7)
-       __uint32_t              xs_ig_attempts;
-       __uint32_t              xs_ig_found;
-       __uint32_t              xs_ig_frecycle;
-       __uint32_t              xs_ig_missed;
-       __uint32_t              xs_ig_dup;
-       __uint32_t              xs_ig_reclaims;
-       __uint32_t              xs_ig_attrchg;
-# define XFSSTAT_END_LOG_OPS           (XFSSTAT_END_INODE_OPS+5)
-       __uint32_t              xs_log_writes;
-       __uint32_t              xs_log_blocks;
-       __uint32_t              xs_log_noiclogs;
-       __uint32_t              xs_log_force;
-       __uint32_t              xs_log_force_sleep;
-# define XFSSTAT_END_TAIL_PUSHING      (XFSSTAT_END_LOG_OPS+10)
-       __uint32_t              xs_try_logspace;
-       __uint32_t              xs_sleep_logspace;
-       __uint32_t              xs_push_ail;
-       __uint32_t              xs_push_ail_success;
-       __uint32_t              xs_push_ail_pushbuf;
-       __uint32_t              xs_push_ail_pinned;
-       __uint32_t              xs_push_ail_locked;
-       __uint32_t              xs_push_ail_flushing;
-       __uint32_t              xs_push_ail_restarts;
-       __uint32_t              xs_push_ail_flush;
-# define XFSSTAT_END_WRITE_CONVERT     (XFSSTAT_END_TAIL_PUSHING+2)
-       __uint32_t              xs_xstrat_quick;
-       __uint32_t              xs_xstrat_split;
-# define XFSSTAT_END_READ_WRITE_OPS    (XFSSTAT_END_WRITE_CONVERT+2)
-       __uint32_t              xs_write_calls;
-       __uint32_t              xs_read_calls;
-# define XFSSTAT_END_ATTRIBUTE_OPS     (XFSSTAT_END_READ_WRITE_OPS+4)
-       __uint32_t              xs_attr_get;
-       __uint32_t              xs_attr_set;
-       __uint32_t              xs_attr_remove;
-       __uint32_t              xs_attr_list;
-# define XFSSTAT_END_INODE_CLUSTER     (XFSSTAT_END_ATTRIBUTE_OPS+3)
-       __uint32_t              xs_iflush_count;
-       __uint32_t              xs_icluster_flushcnt;
-       __uint32_t              xs_icluster_flushinode;
-# define XFSSTAT_END_VNODE_OPS         (XFSSTAT_END_INODE_CLUSTER+8)
-       __uint32_t              vn_active;      /* # vnodes not on free lists */
-       __uint32_t              vn_alloc;       /* # times vn_alloc called */
-       __uint32_t              vn_get;         /* # times vn_get called */
-       __uint32_t              vn_hold;        /* # times vn_hold called */
-       __uint32_t              vn_rele;        /* # times vn_rele called */
-       __uint32_t              vn_reclaim;     /* # times vn_reclaim called */
-       __uint32_t              vn_remove;      /* # times vn_remove called */
-       __uint32_t              vn_free;        /* # times vn_free called */
-#define XFSSTAT_END_BUF                        (XFSSTAT_END_VNODE_OPS+9)
-       __uint32_t              xb_get;
-       __uint32_t              xb_create;
-       __uint32_t              xb_get_locked;
-       __uint32_t              xb_get_locked_waited;
-       __uint32_t              xb_busy_locked;
-       __uint32_t              xb_miss_locked;
-       __uint32_t              xb_page_retries;
-       __uint32_t              xb_page_found;
-       __uint32_t              xb_get_read;
-/* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2            (XFSSTAT_END_BUF+15)
-       __uint32_t              xs_abtb_2_lookup;
-       __uint32_t              xs_abtb_2_compare;
-       __uint32_t              xs_abtb_2_insrec;
-       __uint32_t              xs_abtb_2_delrec;
-       __uint32_t              xs_abtb_2_newroot;
-       __uint32_t              xs_abtb_2_killroot;
-       __uint32_t              xs_abtb_2_increment;
-       __uint32_t              xs_abtb_2_decrement;
-       __uint32_t              xs_abtb_2_lshift;
-       __uint32_t              xs_abtb_2_rshift;
-       __uint32_t              xs_abtb_2_split;
-       __uint32_t              xs_abtb_2_join;
-       __uint32_t              xs_abtb_2_alloc;
-       __uint32_t              xs_abtb_2_free;
-       __uint32_t              xs_abtb_2_moves;
-#define XFSSTAT_END_ABTC_V2            (XFSSTAT_END_ABTB_V2+15)
-       __uint32_t              xs_abtc_2_lookup;
-       __uint32_t              xs_abtc_2_compare;
-       __uint32_t              xs_abtc_2_insrec;
-       __uint32_t              xs_abtc_2_delrec;
-       __uint32_t              xs_abtc_2_newroot;
-       __uint32_t              xs_abtc_2_killroot;
-       __uint32_t              xs_abtc_2_increment;
-       __uint32_t              xs_abtc_2_decrement;
-       __uint32_t              xs_abtc_2_lshift;
-       __uint32_t              xs_abtc_2_rshift;
-       __uint32_t              xs_abtc_2_split;
-       __uint32_t              xs_abtc_2_join;
-       __uint32_t              xs_abtc_2_alloc;
-       __uint32_t              xs_abtc_2_free;
-       __uint32_t              xs_abtc_2_moves;
-#define XFSSTAT_END_BMBT_V2            (XFSSTAT_END_ABTC_V2+15)
-       __uint32_t              xs_bmbt_2_lookup;
-       __uint32_t              xs_bmbt_2_compare;
-       __uint32_t              xs_bmbt_2_insrec;
-       __uint32_t              xs_bmbt_2_delrec;
-       __uint32_t              xs_bmbt_2_newroot;
-       __uint32_t              xs_bmbt_2_killroot;
-       __uint32_t              xs_bmbt_2_increment;
-       __uint32_t              xs_bmbt_2_decrement;
-       __uint32_t              xs_bmbt_2_lshift;
-       __uint32_t              xs_bmbt_2_rshift;
-       __uint32_t              xs_bmbt_2_split;
-       __uint32_t              xs_bmbt_2_join;
-       __uint32_t              xs_bmbt_2_alloc;
-       __uint32_t              xs_bmbt_2_free;
-       __uint32_t              xs_bmbt_2_moves;
-#define XFSSTAT_END_IBT_V2             (XFSSTAT_END_BMBT_V2+15)
-       __uint32_t              xs_ibt_2_lookup;
-       __uint32_t              xs_ibt_2_compare;
-       __uint32_t              xs_ibt_2_insrec;
-       __uint32_t              xs_ibt_2_delrec;
-       __uint32_t              xs_ibt_2_newroot;
-       __uint32_t              xs_ibt_2_killroot;
-       __uint32_t              xs_ibt_2_increment;
-       __uint32_t              xs_ibt_2_decrement;
-       __uint32_t              xs_ibt_2_lshift;
-       __uint32_t              xs_ibt_2_rshift;
-       __uint32_t              xs_ibt_2_split;
-       __uint32_t              xs_ibt_2_join;
-       __uint32_t              xs_ibt_2_alloc;
-       __uint32_t              xs_ibt_2_free;
-       __uint32_t              xs_ibt_2_moves;
-/* Extra precision counters */
-       __uint64_t              xs_xstrat_bytes;
-       __uint64_t              xs_write_bytes;
-       __uint64_t              xs_read_bytes;
-};
-
-DECLARE_PER_CPU(struct xfsstats, xfsstats);
-
-/*
- * We don't disable preempt, not too worried about poking the
- * wrong CPU's stat for now (also aggregated before reporting).
- */
-#define XFS_STATS_INC(v)       (per_cpu(xfsstats, current_cpu()).v++)
-#define XFS_STATS_DEC(v)       (per_cpu(xfsstats, current_cpu()).v--)
-#define XFS_STATS_ADD(v, inc)  (per_cpu(xfsstats, current_cpu()).v += (inc))
-
-extern int xfs_init_procfs(void);
-extern void xfs_cleanup_procfs(void);
-
-
-#else  /* !CONFIG_PROC_FS */
-
-# define XFS_STATS_INC(count)
-# define XFS_STATS_DEC(count)
-# define XFS_STATS_ADD(count, inc)
-
-static inline int xfs_init_procfs(void)
-{
-       return 0;
-}
-
-static inline void xfs_cleanup_procfs(void)
-{
-}
-
-#endif /* !CONFIG_PROC_FS */
-
-#endif /* __XFS_STATS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
deleted file mode 100644 (file)
index 9a72dda..0000000
+++ /dev/null
@@ -1,1773 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include "xfs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_fsops.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_vnodeops.h"
-#include "xfs_log_priv.h"
-#include "xfs_trans_priv.h"
-#include "xfs_filestream.h"
-#include "xfs_da_btree.h"
-#include "xfs_extfree_item.h"
-#include "xfs_mru_cache.h"
-#include "xfs_inode_item.h"
-#include "xfs_sync.h"
-#include "xfs_trace.h"
-
-#include <linux/namei.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/mount.h>
-#include <linux/mempool.h>
-#include <linux/writeback.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
-#include <linux/parser.h>
-
-static const struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_ioend_zone;
-mempool_t *xfs_ioend_pool;
-
-#define MNTOPT_LOGBUFS "logbufs"       /* number of XFS log buffers */
-#define MNTOPT_LOGBSIZE        "logbsize"      /* size of XFS log buffers */
-#define MNTOPT_LOGDEV  "logdev"        /* log device */
-#define MNTOPT_RTDEV   "rtdev"         /* realtime I/O device */
-#define MNTOPT_BIOSIZE "biosize"       /* log2 of preferred buffered io size */
-#define MNTOPT_WSYNC   "wsync"         /* safe-mode nfs compatible mount */
-#define MNTOPT_NOALIGN "noalign"       /* turn off stripe alignment */
-#define MNTOPT_SWALLOC "swalloc"       /* turn on stripe width allocation */
-#define MNTOPT_SUNIT   "sunit"         /* data volume stripe unit */
-#define MNTOPT_SWIDTH  "swidth"        /* data volume stripe width */
-#define MNTOPT_NOUUID  "nouuid"        /* ignore filesystem UUID */
-#define MNTOPT_MTPT    "mtpt"          /* filesystem mount point */
-#define MNTOPT_GRPID   "grpid"         /* group-ID from parent directory */
-#define MNTOPT_NOGRPID "nogrpid"       /* group-ID from current process */
-#define MNTOPT_BSDGROUPS    "bsdgroups"    /* group-ID from parent directory */
-#define MNTOPT_SYSVGROUPS   "sysvgroups"   /* group-ID from current process */
-#define MNTOPT_ALLOCSIZE    "allocsize"    /* preferred allocation size */
-#define MNTOPT_NORECOVERY   "norecovery"   /* don't run XFS recovery */
-#define MNTOPT_BARRIER "barrier"       /* use writer barriers for log write and
-                                        * unwritten extent conversion */
-#define MNTOPT_NOBARRIER "nobarrier"   /* .. disable */
-#define MNTOPT_64BITINODE   "inode64"  /* inodes can be allocated anywhere */
-#define MNTOPT_IKEEP   "ikeep"         /* do not free empty inode clusters */
-#define MNTOPT_NOIKEEP "noikeep"       /* free empty inode clusters */
-#define MNTOPT_LARGEIO    "largeio"    /* report large I/O sizes in stat() */
-#define MNTOPT_NOLARGEIO   "nolargeio" /* do not report large I/O sizes
-                                        * in stat(). */
-#define MNTOPT_ATTR2   "attr2"         /* do use attr2 attribute format */
-#define MNTOPT_NOATTR2 "noattr2"       /* do not use attr2 attribute format */
-#define MNTOPT_FILESTREAM  "filestreams" /* use filestreams allocator */
-#define MNTOPT_QUOTA   "quota"         /* disk quotas (user) */
-#define MNTOPT_NOQUOTA "noquota"       /* no quotas */
-#define MNTOPT_USRQUOTA        "usrquota"      /* user quota enabled */
-#define MNTOPT_GRPQUOTA        "grpquota"      /* group quota enabled */
-#define MNTOPT_PRJQUOTA        "prjquota"      /* project quota enabled */
-#define MNTOPT_UQUOTA  "uquota"        /* user quota (IRIX variant) */
-#define MNTOPT_GQUOTA  "gquota"        /* group quota (IRIX variant) */
-#define MNTOPT_PQUOTA  "pquota"        /* project quota (IRIX variant) */
-#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
-#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
-#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
-#define MNTOPT_QUOTANOENF  "qnoenforce"        /* same as uqnoenforce */
-#define MNTOPT_DELAYLOG    "delaylog"  /* Delayed logging enabled */
-#define MNTOPT_NODELAYLOG  "nodelaylog"        /* Delayed logging disabled */
-#define MNTOPT_DISCARD    "discard"    /* Discard unused blocks */
-#define MNTOPT_NODISCARD   "nodiscard" /* Do not discard unused blocks */
-
-/*
- * Table driven mount option parser.
- *
- * Currently only used for remount, but it will be used for mount
- * in the future, too.
- */
-enum {
-       Opt_barrier, Opt_nobarrier, Opt_err
-};
-
-static const match_table_t tokens = {
-       {Opt_barrier, "barrier"},
-       {Opt_nobarrier, "nobarrier"},
-       {Opt_err, NULL}
-};
-
-
-STATIC unsigned long
-suffix_strtoul(char *s, char **endp, unsigned int base)
-{
-       int     last, shift_left_factor = 0;
-       char    *value = s;
-
-       last = strlen(value) - 1;
-       if (value[last] == 'K' || value[last] == 'k') {
-               shift_left_factor = 10;
-               value[last] = '\0';
-       }
-       if (value[last] == 'M' || value[last] == 'm') {
-               shift_left_factor = 20;
-               value[last] = '\0';
-       }
-       if (value[last] == 'G' || value[last] == 'g') {
-               shift_left_factor = 30;
-               value[last] = '\0';
-       }
-
-       return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
-}
-
-/*
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock has _not_ yet been read in.
- *
- * Note that this function leaks the various device name allocations on
- * failure.  The caller takes care of them.
- */
-STATIC int
-xfs_parseargs(
-       struct xfs_mount        *mp,
-       char                    *options)
-{
-       struct super_block      *sb = mp->m_super;
-       char                    *this_char, *value, *eov;
-       int                     dsunit = 0;
-       int                     dswidth = 0;
-       int                     iosize = 0;
-       __uint8_t               iosizelog = 0;
-
-       /*
-        * set up the mount name first so all the errors will refer to the
-        * correct device.
-        */
-       mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
-       if (!mp->m_fsname)
-               return ENOMEM;
-       mp->m_fsname_len = strlen(mp->m_fsname) + 1;
-
-       /*
-        * Copy binary VFS mount flags we are interested in.
-        */
-       if (sb->s_flags & MS_RDONLY)
-               mp->m_flags |= XFS_MOUNT_RDONLY;
-       if (sb->s_flags & MS_DIRSYNC)
-               mp->m_flags |= XFS_MOUNT_DIRSYNC;
-       if (sb->s_flags & MS_SYNCHRONOUS)
-               mp->m_flags |= XFS_MOUNT_WSYNC;
-
-       /*
-        * Set some default flags that could be cleared by the mount option
-        * parsing.
-        */
-       mp->m_flags |= XFS_MOUNT_BARRIER;
-       mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-       mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
-       mp->m_flags |= XFS_MOUNT_DELAYLOG;
-
-       /*
-        * These can be overridden by the mount option parsing.
-        */
-       mp->m_logbufs = -1;
-       mp->m_logbsize = -1;
-
-       if (!options)
-               goto done;
-
-       while ((this_char = strsep(&options, ",")) != NULL) {
-               if (!*this_char)
-                       continue;
-               if ((value = strchr(this_char, '=')) != NULL)
-                       *value++ = 0;
-
-               if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
-                       if (!value || !*value) {
-                               xfs_warn(mp, "%s option requires an argument",
-                                       this_char);
-                               return EINVAL;
-                       }
-                       mp->m_logbufs = simple_strtoul(value, &eov, 10);
-               } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
-                       if (!value || !*value) {
-                               xfs_warn(mp, "%s option requires an argument",
-                                       this_char);
-                               return EINVAL;
-                       }
-                       mp->m_logbsize = suffix_strtoul(value, &eov, 10);
-               } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
-                       if (!value || !*value) {
-                               xfs_warn(mp, "%s option requires an argument",
-                                       this_char);
-                               return EINVAL;
-                       }
-                       mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
-                       if (!mp->m_logname)
-                               return ENOMEM;
-               } else if (!strcmp(this_char, MNTOPT_MTPT)) {
-                       xfs_warn(mp, "%s option not allowed on this system",
-                               this_char);
-                       return EINVAL;
-               } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
-                       if (!value || !*value) {
-                               xfs_warn(mp, "%s option requires an argument",
-                                       this_char);
-                               return EINVAL;
-                       }
-                       mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
-                       if (!mp->m_rtname)
-                               return ENOMEM;
-               } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
-                       if (!value || !*value) {
-                               xfs_warn(mp, "%s option requires an argument",
-                                       this_char);
-                               return EINVAL;
-                       }
-                       iosize = simple_strtoul(value, &eov, 10);
-                       iosizelog = ffs(iosize) - 1;
-               } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
-                       if (!value || !*value) {
-                               xfs_warn(mp, "%s option requires an argument",
-                                       this_char);
-                               return EINVAL;
-                       }
-                       iosize = suffix_strtoul(value, &eov, 10);
-                       iosizelog = ffs(iosize) - 1;
-               } else if (!strcmp(this_char, MNTOPT_GRPID) ||
-                          !strcmp(this_char, MNTOPT_BSDGROUPS)) {
-                       mp->m_flags |= XFS_MOUNT_GRPID;
-               } else if (!strcmp(this_char, MNTOPT_NOGRPID) ||
-                          !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
-                       mp->m_flags &= ~XFS_MOUNT_GRPID;
-               } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
-                       mp->m_flags |= XFS_MOUNT_WSYNC;
-               } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
-                       mp->m_flags |= XFS_MOUNT_NORECOVERY;
-               } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
-                       mp->m_flags |= XFS_MOUNT_NOALIGN;
-               } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
-                       mp->m_flags |= XFS_MOUNT_SWALLOC;
-               } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
-                       if (!value || !*value) {
-                               xfs_warn(mp, "%s option requires an argument",
-                                       this_char);
-                               return EINVAL;
-                       }
-                       dsunit = simple_strtoul(value, &eov, 10);
-               } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
-                       if (!value || !*value) {
-                               xfs_warn(mp, "%s option requires an argument",
-                                       this_char);
-                               return EINVAL;
-                       }
-                       dswidth = simple_strtoul(value, &eov, 10);
-               } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
-                       mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
-#if !XFS_BIG_INUMS
-                       xfs_warn(mp, "%s option not allowed on this system",
-                               this_char);
-                       return EINVAL;
-#endif
-               } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
-                       mp->m_flags |= XFS_MOUNT_NOUUID;
-               } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
-                       mp->m_flags |= XFS_MOUNT_BARRIER;
-               } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
-                       mp->m_flags &= ~XFS_MOUNT_BARRIER;
-               } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
-                       mp->m_flags |= XFS_MOUNT_IKEEP;
-               } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
-                       mp->m_flags &= ~XFS_MOUNT_IKEEP;
-               } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
-                       mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
-               } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
-                       mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-               } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
-                       mp->m_flags |= XFS_MOUNT_ATTR2;
-               } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
-                       mp->m_flags &= ~XFS_MOUNT_ATTR2;
-                       mp->m_flags |= XFS_MOUNT_NOATTR2;
-               } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
-                       mp->m_flags |= XFS_MOUNT_FILESTREAMS;
-               } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
-                       mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
-                                         XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
-                                         XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
-                                         XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
-               } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
-                          !strcmp(this_char, MNTOPT_UQUOTA) ||
-                          !strcmp(this_char, MNTOPT_USRQUOTA)) {
-                       mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
-                                        XFS_UQUOTA_ENFD);
-               } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
-                          !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
-                       mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
-                       mp->m_qflags &= ~XFS_UQUOTA_ENFD;
-               } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
-                          !strcmp(this_char, MNTOPT_PRJQUOTA)) {
-                       mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
-                                        XFS_OQUOTA_ENFD);
-               } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
-                       mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
-                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
-               } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
-                          !strcmp(this_char, MNTOPT_GRPQUOTA)) {
-                       mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
-                                        XFS_OQUOTA_ENFD);
-               } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
-                       mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
-                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
-               } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
-                       mp->m_flags |= XFS_MOUNT_DELAYLOG;
-               } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
-                       mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
-               } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
-                       mp->m_flags |= XFS_MOUNT_DISCARD;
-               } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
-                       mp->m_flags &= ~XFS_MOUNT_DISCARD;
-               } else if (!strcmp(this_char, "ihashsize")) {
-                       xfs_warn(mp,
-       "ihashsize no longer used, option is deprecated.");
-               } else if (!strcmp(this_char, "osyncisdsync")) {
-                       xfs_warn(mp,
-       "osyncisdsync has no effect, option is deprecated.");
-               } else if (!strcmp(this_char, "osyncisosync")) {
-                       xfs_warn(mp,
-       "osyncisosync has no effect, option is deprecated.");
-               } else if (!strcmp(this_char, "irixsgid")) {
-                       xfs_warn(mp,
-       "irixsgid is now a sysctl(2) variable, option is deprecated.");
-               } else {
-                       xfs_warn(mp, "unknown mount option [%s].", this_char);
-                       return EINVAL;
-               }
-       }
-
-       /*
-        * no recovery flag requires a read-only mount
-        */
-       if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
-           !(mp->m_flags & XFS_MOUNT_RDONLY)) {
-               xfs_warn(mp, "no-recovery mounts must be read-only.");
-               return EINVAL;
-       }
-
-       if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
-               xfs_warn(mp,
-       "sunit and swidth options incompatible with the noalign option");
-               return EINVAL;
-       }
-
-       if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
-           !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
-               xfs_warn(mp,
-       "the discard option is incompatible with the nodelaylog option");
-               return EINVAL;
-       }
-
-#ifndef CONFIG_XFS_QUOTA
-       if (XFS_IS_QUOTA_RUNNING(mp)) {
-               xfs_warn(mp, "quota support not available in this kernel.");
-               return EINVAL;
-       }
-#endif
-
-       if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
-           (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
-               xfs_warn(mp, "cannot mount with both project and group quota");
-               return EINVAL;
-       }
-
-       if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
-               xfs_warn(mp, "sunit and swidth must be specified together");
-               return EINVAL;
-       }
-
-       if (dsunit && (dswidth % dsunit != 0)) {
-               xfs_warn(mp,
-       "stripe width (%d) must be a multiple of the stripe unit (%d)",
-                       dswidth, dsunit);
-               return EINVAL;
-       }
-
-done:
-       if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
-               /*
-                * At this point the superblock has not been read
-                * in, therefore we do not know the block size.
-                * Before the mount call ends we will convert
-                * these to FSBs.
-                */
-               if (dsunit) {
-                       mp->m_dalign = dsunit;
-                       mp->m_flags |= XFS_MOUNT_RETERR;
-               }
-
-               if (dswidth)
-                       mp->m_swidth = dswidth;
-       }
-
-       if (mp->m_logbufs != -1 &&
-           mp->m_logbufs != 0 &&
-           (mp->m_logbufs < XLOG_MIN_ICLOGS ||
-            mp->m_logbufs > XLOG_MAX_ICLOGS)) {
-               xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
-                       mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
-               return XFS_ERROR(EINVAL);
-       }
-       if (mp->m_logbsize != -1 &&
-           mp->m_logbsize !=  0 &&
-           (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
-            mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
-            !is_power_of_2(mp->m_logbsize))) {
-               xfs_warn(mp,
-                       "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
-                       mp->m_logbsize);
-               return XFS_ERROR(EINVAL);
-       }
-
-       if (iosizelog) {
-               if (iosizelog > XFS_MAX_IO_LOG ||
-                   iosizelog < XFS_MIN_IO_LOG) {
-                       xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
-                               iosizelog, XFS_MIN_IO_LOG,
-                               XFS_MAX_IO_LOG);
-                       return XFS_ERROR(EINVAL);
-               }
-
-               mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
-               mp->m_readio_log = iosizelog;
-               mp->m_writeio_log = iosizelog;
-       }
-
-       return 0;
-}
-
-struct proc_xfs_info {
-       int     flag;
-       char    *str;
-};
-
-STATIC int
-xfs_showargs(
-       struct xfs_mount        *mp,
-       struct seq_file         *m)
-{
-       static struct proc_xfs_info xfs_info_set[] = {
-               /* the few simple ones we can get from the mount struct */
-               { XFS_MOUNT_IKEEP,              "," MNTOPT_IKEEP },
-               { XFS_MOUNT_WSYNC,              "," MNTOPT_WSYNC },
-               { XFS_MOUNT_NOALIGN,            "," MNTOPT_NOALIGN },
-               { XFS_MOUNT_SWALLOC,            "," MNTOPT_SWALLOC },
-               { XFS_MOUNT_NOUUID,             "," MNTOPT_NOUUID },
-               { XFS_MOUNT_NORECOVERY,         "," MNTOPT_NORECOVERY },
-               { XFS_MOUNT_ATTR2,              "," MNTOPT_ATTR2 },
-               { XFS_MOUNT_FILESTREAMS,        "," MNTOPT_FILESTREAM },
-               { XFS_MOUNT_GRPID,              "," MNTOPT_GRPID },
-               { XFS_MOUNT_DELAYLOG,           "," MNTOPT_DELAYLOG },
-               { XFS_MOUNT_DISCARD,            "," MNTOPT_DISCARD },
-               { 0, NULL }
-       };
-       static struct proc_xfs_info xfs_info_unset[] = {
-               /* the few simple ones we can get from the mount struct */
-               { XFS_MOUNT_COMPAT_IOSIZE,      "," MNTOPT_LARGEIO },
-               { XFS_MOUNT_BARRIER,            "," MNTOPT_NOBARRIER },
-               { XFS_MOUNT_SMALL_INUMS,        "," MNTOPT_64BITINODE },
-               { 0, NULL }
-       };
-       struct proc_xfs_info    *xfs_infop;
-
-       for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
-               if (mp->m_flags & xfs_infop->flag)
-                       seq_puts(m, xfs_infop->str);
-       }
-       for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) {
-               if (!(mp->m_flags & xfs_infop->flag))
-                       seq_puts(m, xfs_infop->str);
-       }
-
-       if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
-               seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
-                               (int)(1 << mp->m_writeio_log) >> 10);
-
-       if (mp->m_logbufs > 0)
-               seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
-       if (mp->m_logbsize > 0)
-               seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
-
-       if (mp->m_logname)
-               seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
-       if (mp->m_rtname)
-               seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
-
-       if (mp->m_dalign > 0)
-               seq_printf(m, "," MNTOPT_SUNIT "=%d",
-                               (int)XFS_FSB_TO_BB(mp, mp->m_dalign));
-       if (mp->m_swidth > 0)
-               seq_printf(m, "," MNTOPT_SWIDTH "=%d",
-                               (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
-
-       if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
-               seq_puts(m, "," MNTOPT_USRQUOTA);
-       else if (mp->m_qflags & XFS_UQUOTA_ACCT)
-               seq_puts(m, "," MNTOPT_UQUOTANOENF);
-
-       /* Either project or group quotas can be active, not both */
-
-       if (mp->m_qflags & XFS_PQUOTA_ACCT) {
-               if (mp->m_qflags & XFS_OQUOTA_ENFD)
-                       seq_puts(m, "," MNTOPT_PRJQUOTA);
-               else
-                       seq_puts(m, "," MNTOPT_PQUOTANOENF);
-       } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
-               if (mp->m_qflags & XFS_OQUOTA_ENFD)
-                       seq_puts(m, "," MNTOPT_GRPQUOTA);
-               else
-                       seq_puts(m, "," MNTOPT_GQUOTANOENF);
-       }
-
-       if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
-               seq_puts(m, "," MNTOPT_NOQUOTA);
-
-       return 0;
-}
-__uint64_t
-xfs_max_file_offset(
-       unsigned int            blockshift)
-{
-       unsigned int            pagefactor = 1;
-       unsigned int            bitshift = BITS_PER_LONG - 1;
-
-       /* Figure out maximum filesize, on Linux this can depend on
-        * the filesystem blocksize (on 32 bit platforms).
-        * __block_write_begin does this in an [unsigned] long...
-        *      page->index << (PAGE_CACHE_SHIFT - bbits)
-        * So, for page sized blocks (4K on 32 bit platforms),
-        * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
-        *      (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
-        * but for smaller blocksizes it is less (bbits = log2 bsize).
-        * Note1: get_block_t takes a long (implicit cast from above)
-        * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
-        * can optionally convert the [unsigned] long from above into
-        * an [unsigned] long long.
-        */
-
-#if BITS_PER_LONG == 32
-# if defined(CONFIG_LBDAF)
-       ASSERT(sizeof(sector_t) == 8);
-       pagefactor = PAGE_CACHE_SIZE;
-       bitshift = BITS_PER_LONG;
-# else
-       pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
-# endif
-#endif
-
-       return (((__uint64_t)pagefactor) << bitshift) - 1;
-}
-
-STATIC int
-xfs_blkdev_get(
-       xfs_mount_t             *mp,
-       const char              *name,
-       struct block_device     **bdevp)
-{
-       int                     error = 0;
-
-       *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
-                                   mp);
-       if (IS_ERR(*bdevp)) {
-               error = PTR_ERR(*bdevp);
-               xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
-       }
-
-       return -error;
-}
-
-STATIC void
-xfs_blkdev_put(
-       struct block_device     *bdev)
-{
-       if (bdev)
-               blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-}
-
-void
-xfs_blkdev_issue_flush(
-       xfs_buftarg_t           *buftarg)
-{
-       blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
-}
-
-STATIC void
-xfs_close_devices(
-       struct xfs_mount        *mp)
-{
-       if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
-               struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
-               xfs_free_buftarg(mp, mp->m_logdev_targp);
-               xfs_blkdev_put(logdev);
-       }
-       if (mp->m_rtdev_targp) {
-               struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
-               xfs_free_buftarg(mp, mp->m_rtdev_targp);
-               xfs_blkdev_put(rtdev);
-       }
-       xfs_free_buftarg(mp, mp->m_ddev_targp);
-}
-
-/*
- * The file system configurations are:
- *     (1) device (partition) with data and internal log
- *     (2) logical volume with data and log subvolumes.
- *     (3) logical volume with data, log, and realtime subvolumes.
- *
- * We only have to handle opening the log and realtime volumes here if
- * they are present.  The data subvolume has already been opened by
- * get_sb_bdev() and is stored in sb->s_bdev.
- */
-STATIC int
-xfs_open_devices(
-       struct xfs_mount        *mp)
-{
-       struct block_device     *ddev = mp->m_super->s_bdev;
-       struct block_device     *logdev = NULL, *rtdev = NULL;
-       int                     error;
-
-       /*
-        * Open real time and log devices - order is important.
-        */
-       if (mp->m_logname) {
-               error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
-               if (error)
-                       goto out;
-       }
-
-       if (mp->m_rtname) {
-               error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
-               if (error)
-                       goto out_close_logdev;
-
-               if (rtdev == ddev || rtdev == logdev) {
-                       xfs_warn(mp,
-       "Cannot mount filesystem with identical rtdev and ddev/logdev.");
-                       error = EINVAL;
-                       goto out_close_rtdev;
-               }
-       }
-
-       /*
-        * Setup xfs_mount buffer target pointers
-        */
-       error = ENOMEM;
-       mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
-       if (!mp->m_ddev_targp)
-               goto out_close_rtdev;
-
-       if (rtdev) {
-               mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
-                                                       mp->m_fsname);
-               if (!mp->m_rtdev_targp)
-                       goto out_free_ddev_targ;
-       }
-
-       if (logdev && logdev != ddev) {
-               mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
-                                                       mp->m_fsname);
-               if (!mp->m_logdev_targp)
-                       goto out_free_rtdev_targ;
-       } else {
-               mp->m_logdev_targp = mp->m_ddev_targp;
-       }
-
-       return 0;
-
- out_free_rtdev_targ:
-       if (mp->m_rtdev_targp)
-               xfs_free_buftarg(mp, mp->m_rtdev_targp);
- out_free_ddev_targ:
-       xfs_free_buftarg(mp, mp->m_ddev_targp);
- out_close_rtdev:
-       if (rtdev)
-               xfs_blkdev_put(rtdev);
- out_close_logdev:
-       if (logdev && logdev != ddev)
-               xfs_blkdev_put(logdev);
- out:
-       return error;
-}
-
-/*
- * Setup xfs_mount buffer target pointers based on superblock
- */
-STATIC int
-xfs_setup_devices(
-       struct xfs_mount        *mp)
-{
-       int                     error;
-
-       error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
-                                   mp->m_sb.sb_sectsize);
-       if (error)
-               return error;
-
-       if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
-               unsigned int    log_sector_size = BBSIZE;
-
-               if (xfs_sb_version_hassector(&mp->m_sb))
-                       log_sector_size = mp->m_sb.sb_logsectsize;
-               error = xfs_setsize_buftarg(mp->m_logdev_targp,
-                                           mp->m_sb.sb_blocksize,
-                                           log_sector_size);
-               if (error)
-                       return error;
-       }
-       if (mp->m_rtdev_targp) {
-               error = xfs_setsize_buftarg(mp->m_rtdev_targp,
-                                           mp->m_sb.sb_blocksize,
-                                           mp->m_sb.sb_sectsize);
-               if (error)
-                       return error;
-       }
-
-       return 0;
-}
-
-/* Catch misguided souls that try to use this interface on XFS */
-STATIC struct inode *
-xfs_fs_alloc_inode(
-       struct super_block      *sb)
-{
-       BUG();
-       return NULL;
-}
-
-/*
- * Now that the generic code is guaranteed not to be accessing
- * the linux inode, we can reclaim the inode.
- */
-STATIC void
-xfs_fs_destroy_inode(
-       struct inode            *inode)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-
-       trace_xfs_destroy_inode(ip);
-
-       XFS_STATS_INC(vn_reclaim);
-
-       /* bad inode, get out here ASAP */
-       if (is_bad_inode(inode))
-               goto out_reclaim;
-
-       xfs_ioend_wait(ip);
-
-       ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
-
-       /*
-        * We should never get here with one of the reclaim flags already set.
-        */
-       ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-       ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
-
-       /*
-        * We always use background reclaim here because even if the
-        * inode is clean, it still may be under IO and hence we have
-        * to take the flush lock. The background reclaim path handles
-        * this more efficiently than we can here, so simply let background
-        * reclaim tear down all inodes.
-        */
-out_reclaim:
-       xfs_inode_set_reclaim_tag(ip);
-}
-
-/*
- * Slab object creation initialisation for the XFS inode.
- * This covers only the idempotent fields in the XFS inode;
- * all other fields need to be initialised on allocation
- * from the slab. This avoids the need to repeatedly initialise
- * fields in the xfs inode that left in the initialise state
- * when freeing the inode.
- */
-STATIC void
-xfs_fs_inode_init_once(
-       void                    *inode)
-{
-       struct xfs_inode        *ip = inode;
-
-       memset(ip, 0, sizeof(struct xfs_inode));
-
-       /* vfs inode */
-       inode_init_once(VFS_I(ip));
-
-       /* xfs inode */
-       atomic_set(&ip->i_iocount, 0);
-       atomic_set(&ip->i_pincount, 0);
-       spin_lock_init(&ip->i_flags_lock);
-       init_waitqueue_head(&ip->i_ipin_wait);
-       /*
-        * Because we want to use a counting completion, complete
-        * the flush completion once to allow a single access to
-        * the flush completion without blocking.
-        */
-       init_completion(&ip->i_flush);
-       complete(&ip->i_flush);
-
-       mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-                    "xfsino", ip->i_ino);
-}
-
-/*
- * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
- * we catch unlogged VFS level updates to the inode.
- *
- * We need the barrier() to maintain correct ordering between unlogged
- * updates and the transaction commit code that clears the i_update_core
- * field. This requires all updates to be completed before marking the
- * inode dirty.
- */
-STATIC void
-xfs_fs_dirty_inode(
-       struct inode    *inode,
-       int             flags)
-{
-       barrier();
-       XFS_I(inode)->i_update_core = 1;
-}
-
-STATIC int
-xfs_log_inode(
-       struct xfs_inode        *ip)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_trans        *tp;
-       int                     error;
-
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-       tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
-       error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
-
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               /* we need to return with the lock hold shared */
-               xfs_ilock(ip, XFS_ILOCK_SHARED);
-               return error;
-       }
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-       /*
-        * Note - it's possible that we might have pushed ourselves out of the
-        * way during trans_reserve which would flush the inode.  But there's
-        * no guarantee that the inode buffer has actually gone out yet (it's
-        * delwri).  Plus the buffer could be pinned anyway if it's part of
-        * an inode in another recent transaction.  So we play it safe and
-        * fire off the transaction anyway.
-        */
-       xfs_trans_ijoin(tp, ip);
-       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-       error = xfs_trans_commit(tp, 0);
-       xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
-
-       return error;
-}
-
-STATIC int
-xfs_fs_write_inode(
-       struct inode            *inode,
-       struct writeback_control *wbc)
-{
-       struct xfs_inode        *ip = XFS_I(inode);
-       struct xfs_mount        *mp = ip->i_mount;
-       int                     error = EAGAIN;
-
-       trace_xfs_write_inode(ip);
-
-       if (XFS_FORCED_SHUTDOWN(mp))
-               return XFS_ERROR(EIO);
-
-       if (wbc->sync_mode == WB_SYNC_ALL) {
-               /*
-                * Make sure the inode has made it it into the log.  Instead
-                * of forcing it all the way to stable storage using a
-                * synchronous transaction we let the log force inside the
-                * ->sync_fs call do that for thus, which reduces the number
-                * of synchronous log foces dramatically.
-                */
-               xfs_ioend_wait(ip);
-               xfs_ilock(ip, XFS_ILOCK_SHARED);
-               if (ip->i_update_core) {
-                       error = xfs_log_inode(ip);
-                       if (error)
-                               goto out_unlock;
-               }
-       } else {
-               /*
-                * We make this non-blocking if the inode is contended, return
-                * EAGAIN to indicate to the caller that they did not succeed.
-                * This prevents the flush path from blocking on inodes inside
-                * another operation right now, they get caught later by
-                * xfs_sync.
-                */
-               if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
-                       goto out;
-
-               if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
-                       goto out_unlock;
-
-               /*
-                * Now we have the flush lock and the inode is not pinned, we
-                * can check if the inode is really clean as we know that
-                * there are no pending transaction completions, it is not
-                * waiting on the delayed write queue and there is no IO in
-                * progress.
-                */
-               if (xfs_inode_clean(ip)) {
-                       xfs_ifunlock(ip);
-                       error = 0;
-                       goto out_unlock;
-               }
-               error = xfs_iflush(ip, SYNC_TRYLOCK);
-       }
-
- out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
- out:
-       /*
-        * if we failed to write out the inode then mark
-        * it dirty again so we'll try again later.
-        */
-       if (error)
-               xfs_mark_inode_dirty_sync(ip);
-       return -error;
-}
-
-STATIC void
-xfs_fs_evict_inode(
-       struct inode            *inode)
-{
-       xfs_inode_t             *ip = XFS_I(inode);
-
-       trace_xfs_evict_inode(ip);
-
-       truncate_inode_pages(&inode->i_data, 0);
-       end_writeback(inode);
-       XFS_STATS_INC(vn_rele);
-       XFS_STATS_INC(vn_remove);
-       XFS_STATS_DEC(vn_active);
-
-       /*
-        * The iolock is used by the file system to coordinate reads,
-        * writes, and block truncates.  Up to this point the lock
-        * protected concurrent accesses by users of the inode.  But
-        * from here forward we're doing some final processing of the
-        * inode because we're done with it, and although we reuse the
-        * iolock for protection it is really a distinct lock class
-        * (in the lockdep sense) from before.  To keep lockdep happy
-        * (and basically indicate what we are doing), we explicitly
-        * re-init the iolock here.
-        */
-       ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-       mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-       lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
-                       &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
-
-       xfs_inactive(ip);
-}
-
-STATIC void
-xfs_free_fsname(
-       struct xfs_mount        *mp)
-{
-       kfree(mp->m_fsname);
-       kfree(mp->m_rtname);
-       kfree(mp->m_logname);
-}
-
-STATIC void
-xfs_fs_put_super(
-       struct super_block      *sb)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-
-       xfs_syncd_stop(mp);
-
-       /*
-        * Blow away any referenced inode in the filestreams cache.
-        * This can and will cause log traffic as inodes go inactive
-        * here.
-        */
-       xfs_filestream_unmount(mp);
-
-       XFS_bflush(mp->m_ddev_targp);
-
-       xfs_unmountfs(mp);
-       xfs_freesb(mp);
-       xfs_icsb_destroy_counters(mp);
-       xfs_close_devices(mp);
-       xfs_free_fsname(mp);
-       kfree(mp);
-}
-
-STATIC int
-xfs_fs_sync_fs(
-       struct super_block      *sb,
-       int                     wait)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-       int                     error;
-
-       /*
-        * Not much we can do for the first async pass.  Writing out the
-        * superblock would be counter-productive as we are going to redirty
-        * when writing out other data and metadata (and writing out a single
-        * block is quite fast anyway).
-        *
-        * Try to asynchronously kick off quota syncing at least.
-        */
-       if (!wait) {
-               xfs_qm_sync(mp, SYNC_TRYLOCK);
-               return 0;
-       }
-
-       error = xfs_quiesce_data(mp);
-       if (error)
-               return -error;
-
-       if (laptop_mode) {
-               /*
-                * The disk must be active because we're syncing.
-                * We schedule xfssyncd now (now that the disk is
-                * active) instead of later (when it might not be).
-                */
-               flush_delayed_work_sync(&mp->m_sync_work);
-       }
-
-       return 0;
-}
-
-STATIC int
-xfs_fs_statfs(
-       struct dentry           *dentry,
-       struct kstatfs          *statp)
-{
-       struct xfs_mount        *mp = XFS_M(dentry->d_sb);
-       xfs_sb_t                *sbp = &mp->m_sb;
-       struct xfs_inode        *ip = XFS_I(dentry->d_inode);
-       __uint64_t              fakeinos, id;
-       xfs_extlen_t            lsize;
-       __int64_t               ffree;
-
-       statp->f_type = XFS_SB_MAGIC;
-       statp->f_namelen = MAXNAMELEN - 1;
-
-       id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
-       statp->f_fsid.val[0] = (u32)id;
-       statp->f_fsid.val[1] = (u32)(id >> 32);
-
-       xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
-
-       spin_lock(&mp->m_sb_lock);
-       statp->f_bsize = sbp->sb_blocksize;
-       lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
-       statp->f_blocks = sbp->sb_dblocks - lsize;
-       statp->f_bfree = statp->f_bavail =
-                               sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
-       fakeinos = statp->f_bfree << sbp->sb_inopblog;
-       statp->f_files =
-           MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
-       if (mp->m_maxicount)
-               statp->f_files = min_t(typeof(statp->f_files),
-                                       statp->f_files,
-                                       mp->m_maxicount);
-
-       /* make sure statp->f_ffree does not underflow */
-       ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
-       statp->f_ffree = max_t(__int64_t, ffree, 0);
-
-       spin_unlock(&mp->m_sb_lock);
-
-       if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
-           ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
-                             (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
-               xfs_qm_statvfs(ip, statp);
-       return 0;
-}
-
-STATIC void
-xfs_save_resvblks(struct xfs_mount *mp)
-{
-       __uint64_t resblks = 0;
-
-       mp->m_resblks_save = mp->m_resblks;
-       xfs_reserve_blocks(mp, &resblks, NULL);
-}
-
-STATIC void
-xfs_restore_resvblks(struct xfs_mount *mp)
-{
-       __uint64_t resblks;
-
-       if (mp->m_resblks_save) {
-               resblks = mp->m_resblks_save;
-               mp->m_resblks_save = 0;
-       } else
-               resblks = xfs_default_resblks(mp);
-
-       xfs_reserve_blocks(mp, &resblks, NULL);
-}
-
-STATIC int
-xfs_fs_remount(
-       struct super_block      *sb,
-       int                     *flags,
-       char                    *options)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-       substring_t             args[MAX_OPT_ARGS];
-       char                    *p;
-       int                     error;
-
-       while ((p = strsep(&options, ",")) != NULL) {
-               int token;
-
-               if (!*p)
-                       continue;
-
-               token = match_token(p, tokens, args);
-               switch (token) {
-               case Opt_barrier:
-                       mp->m_flags |= XFS_MOUNT_BARRIER;
-                       break;
-               case Opt_nobarrier:
-                       mp->m_flags &= ~XFS_MOUNT_BARRIER;
-                       break;
-               default:
-                       /*
-                        * Logically we would return an error here to prevent
-                        * users from believing they might have changed
-                        * mount options using remount which can't be changed.
-                        *
-                        * But unfortunately mount(8) adds all options from
-                        * mtab and fstab to the mount arguments in some cases
-                        * so we can't blindly reject options, but have to
-                        * check for each specified option if it actually
-                        * differs from the currently set option and only
-                        * reject it if that's the case.
-                        *
-                        * Until that is implemented we return success for
-                        * every remount request, and silently ignore all
-                        * options that we can't actually change.
-                        */
-#if 0
-                       xfs_info(mp,
-               "mount option \"%s\" not supported for remount\n", p);
-                       return -EINVAL;
-#else
-                       break;
-#endif
-               }
-       }
-
-       /* ro -> rw */
-       if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
-               mp->m_flags &= ~XFS_MOUNT_RDONLY;
-
-               /*
-                * If this is the first remount to writeable state we
-                * might have some superblock changes to update.
-                */
-               if (mp->m_update_flags) {
-                       error = xfs_mount_log_sb(mp, mp->m_update_flags);
-                       if (error) {
-                               xfs_warn(mp, "failed to write sb changes");
-                               return error;
-                       }
-                       mp->m_update_flags = 0;
-               }
-
-               /*
-                * Fill out the reserve pool if it is empty. Use the stashed
-                * value if it is non-zero, otherwise go with the default.
-                */
-               xfs_restore_resvblks(mp);
-       }
-
-       /* rw -> ro */
-       if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
-               /*
-                * After we have synced the data but before we sync the
-                * metadata, we need to free up the reserve block pool so that
-                * the used block count in the superblock on disk is correct at
-                * the end of the remount. Stash the current reserve pool size
-                * so that if we get remounted rw, we can return it to the same
-                * size.
-                */
-
-               xfs_quiesce_data(mp);
-               xfs_save_resvblks(mp);
-               xfs_quiesce_attr(mp);
-               mp->m_flags |= XFS_MOUNT_RDONLY;
-       }
-
-       return 0;
-}
-
-/*
- * Second stage of a freeze. The data is already frozen so we only
- * need to take care of the metadata. Once that's done write a dummy
- * record to dirty the log in case of a crash while frozen.
- */
-STATIC int
-xfs_fs_freeze(
-       struct super_block      *sb)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-
-       xfs_save_resvblks(mp);
-       xfs_quiesce_attr(mp);
-       return -xfs_fs_log_dummy(mp);
-}
-
-STATIC int
-xfs_fs_unfreeze(
-       struct super_block      *sb)
-{
-       struct xfs_mount        *mp = XFS_M(sb);
-
-       xfs_restore_resvblks(mp);
-       return 0;
-}
-
-STATIC int
-xfs_fs_show_options(
-       struct seq_file         *m,
-       struct vfsmount         *mnt)
-{
-       return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
-}
-
-/*
- * This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock _has_ now been read in.
- */
-STATIC int
-xfs_finish_flags(
-       struct xfs_mount        *mp)
-{
-       int                     ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
-
-       /* Fail a mount where the logbuf is smaller than the log stripe */
-       if (xfs_sb_version_haslogv2(&mp->m_sb)) {
-               if (mp->m_logbsize <= 0 &&
-                   mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
-                       mp->m_logbsize = mp->m_sb.sb_logsunit;
-               } else if (mp->m_logbsize > 0 &&
-                          mp->m_logbsize < mp->m_sb.sb_logsunit) {
-                       xfs_warn(mp,
-               "logbuf size must be greater than or equal to log stripe size");
-                       return XFS_ERROR(EINVAL);
-               }
-       } else {
-               /* Fail a mount if the logbuf is larger than 32K */
-               if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
-                       xfs_warn(mp,
-               "logbuf size for version 1 logs must be 16K or 32K");
-                       return XFS_ERROR(EINVAL);
-               }
-       }
-
-       /*
-        * mkfs'ed attr2 will turn on attr2 mount unless explicitly
-        * told by noattr2 to turn it off
-        */
-       if (xfs_sb_version_hasattr2(&mp->m_sb) &&
-           !(mp->m_flags & XFS_MOUNT_NOATTR2))
-               mp->m_flags |= XFS_MOUNT_ATTR2;
-
-       /*
-        * prohibit r/w mounts of read-only filesystems
-        */
-       if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
-               xfs_warn(mp,
-                       "cannot mount a read-only filesystem as read-write");
-               return XFS_ERROR(EROFS);
-       }
-
-       return 0;
-}
-
-STATIC int
-xfs_fs_fill_super(
-       struct super_block      *sb,
-       void                    *data,
-       int                     silent)
-{
-       struct inode            *root;
-       struct xfs_mount        *mp = NULL;
-       int                     flags = 0, error = ENOMEM;
-
-       mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
-       if (!mp)
-               goto out;
-
-       spin_lock_init(&mp->m_sb_lock);
-       mutex_init(&mp->m_growlock);
-       atomic_set(&mp->m_active_trans, 0);
-
-       mp->m_super = sb;
-       sb->s_fs_info = mp;
-
-       error = xfs_parseargs(mp, (char *)data);
-       if (error)
-               goto out_free_fsname;
-
-       sb_min_blocksize(sb, BBSIZE);
-       sb->s_xattr = xfs_xattr_handlers;
-       sb->s_export_op = &xfs_export_operations;
-#ifdef CONFIG_XFS_QUOTA
-       sb->s_qcop = &xfs_quotactl_operations;
-#endif
-       sb->s_op = &xfs_super_operations;
-
-       if (silent)
-               flags |= XFS_MFSI_QUIET;
-
-       error = xfs_open_devices(mp);
-       if (error)
-               goto out_free_fsname;
-
-       error = xfs_icsb_init_counters(mp);
-       if (error)
-               goto out_close_devices;
-
-       error = xfs_readsb(mp, flags);
-       if (error)
-               goto out_destroy_counters;
-
-       error = xfs_finish_flags(mp);
-       if (error)
-               goto out_free_sb;
-
-       error = xfs_setup_devices(mp);
-       if (error)
-               goto out_free_sb;
-
-       error = xfs_filestream_mount(mp);
-       if (error)
-               goto out_free_sb;
-
-       /*
-        * we must configure the block size in the superblock before we run the
-        * full mount process as the mount process can lookup and cache inodes.
-        * For the same reason we must also initialise the syncd and register
-        * the inode cache shrinker so that inodes can be reclaimed during
-        * operations like a quotacheck that iterate all inodes in the
-        * filesystem.
-        */
-       sb->s_magic = XFS_SB_MAGIC;
-       sb->s_blocksize = mp->m_sb.sb_blocksize;
-       sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
-       sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
-       sb->s_time_gran = 1;
-       set_posix_acl_flag(sb);
-
-       error = xfs_mountfs(mp);
-       if (error)
-               goto out_filestream_unmount;
-
-       error = xfs_syncd_init(mp);
-       if (error)
-               goto out_unmount;
-
-       root = igrab(VFS_I(mp->m_rootip));
-       if (!root) {
-               error = ENOENT;
-               goto out_syncd_stop;
-       }
-       if (is_bad_inode(root)) {
-               error = EINVAL;
-               goto out_syncd_stop;
-       }
-       sb->s_root = d_alloc_root(root);
-       if (!sb->s_root) {
-               error = ENOMEM;
-               goto out_iput;
-       }
-
-       return 0;
-
- out_filestream_unmount:
-       xfs_filestream_unmount(mp);
- out_free_sb:
-       xfs_freesb(mp);
- out_destroy_counters:
-       xfs_icsb_destroy_counters(mp);
- out_close_devices:
-       xfs_close_devices(mp);
- out_free_fsname:
-       xfs_free_fsname(mp);
-       kfree(mp);
- out:
-       return -error;
-
- out_iput:
-       iput(root);
- out_syncd_stop:
-       xfs_syncd_stop(mp);
- out_unmount:
-       /*
-        * Blow away any referenced inode in the filestreams cache.
-        * This can and will cause log traffic as inodes go inactive
-        * here.
-        */
-       xfs_filestream_unmount(mp);
-
-       XFS_bflush(mp->m_ddev_targp);
-
-       xfs_unmountfs(mp);
-       goto out_free_sb;
-}
-
-STATIC struct dentry *
-xfs_fs_mount(
-       struct file_system_type *fs_type,
-       int                     flags,
-       const char              *dev_name,
-       void                    *data)
-{
-       return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
-}
-
-static int
-xfs_fs_nr_cached_objects(
-       struct super_block      *sb)
-{
-       return xfs_reclaim_inodes_count(XFS_M(sb));
-}
-
-static void
-xfs_fs_free_cached_objects(
-       struct super_block      *sb,
-       int                     nr_to_scan)
-{
-       xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
-}
-
-static const struct super_operations xfs_super_operations = {
-       .alloc_inode            = xfs_fs_alloc_inode,
-       .destroy_inode          = xfs_fs_destroy_inode,
-       .dirty_inode            = xfs_fs_dirty_inode,
-       .write_inode            = xfs_fs_write_inode,
-       .evict_inode            = xfs_fs_evict_inode,
-       .put_super              = xfs_fs_put_super,
-       .sync_fs                = xfs_fs_sync_fs,
-       .freeze_fs              = xfs_fs_freeze,
-       .unfreeze_fs            = xfs_fs_unfreeze,
-       .statfs                 = xfs_fs_statfs,
-       .remount_fs             = xfs_fs_remount,
-       .show_options           = xfs_fs_show_options,
-       .nr_cached_objects      = xfs_fs_nr_cached_objects,
-       .free_cached_objects    = xfs_fs_free_cached_objects,
-};
-
-static struct file_system_type xfs_fs_type = {
-       .owner                  = THIS_MODULE,
-       .name                   = "xfs",
-       .mount                  = xfs_fs_mount,
-       .kill_sb                = kill_block_super,
-       .fs_flags               = FS_REQUIRES_DEV,
-};
-
-STATIC int __init
-xfs_init_zones(void)
-{
-
-       xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
-       if (!xfs_ioend_zone)
-               goto out;
-
-       xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
-                                                 xfs_ioend_zone);
-       if (!xfs_ioend_pool)
-               goto out_destroy_ioend_zone;
-
-       xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
-                                               "xfs_log_ticket");
-       if (!xfs_log_ticket_zone)
-               goto out_destroy_ioend_pool;
-
-       xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
-                                               "xfs_bmap_free_item");
-       if (!xfs_bmap_free_item_zone)
-               goto out_destroy_log_ticket_zone;
-
-       xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
-                                               "xfs_btree_cur");
-       if (!xfs_btree_cur_zone)
-               goto out_destroy_bmap_free_item_zone;
-
-       xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
-                                               "xfs_da_state");
-       if (!xfs_da_state_zone)
-               goto out_destroy_btree_cur_zone;
-
-       xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
-       if (!xfs_dabuf_zone)
-               goto out_destroy_da_state_zone;
-
-       xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
-       if (!xfs_ifork_zone)
-               goto out_destroy_dabuf_zone;
-
-       xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
-       if (!xfs_trans_zone)
-               goto out_destroy_ifork_zone;
-
-       xfs_log_item_desc_zone =
-               kmem_zone_init(sizeof(struct xfs_log_item_desc),
-                              "xfs_log_item_desc");
-       if (!xfs_log_item_desc_zone)
-               goto out_destroy_trans_zone;
-
-       /*
-        * The size of the zone allocated buf log item is the maximum
-        * size possible under XFS.  This wastes a little bit of memory,
-        * but it is much faster.
-        */
-       xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
-                               (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
-                                 NBWORD) * sizeof(int))), "xfs_buf_item");
-       if (!xfs_buf_item_zone)
-               goto out_destroy_log_item_desc_zone;
-
-       xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
-                       ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
-                                sizeof(xfs_extent_t))), "xfs_efd_item");
-       if (!xfs_efd_zone)
-               goto out_destroy_buf_item_zone;
-
-       xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
-                       ((XFS_EFI_MAX_FAST_EXTENTS - 1) *
-                               sizeof(xfs_extent_t))), "xfs_efi_item");
-       if (!xfs_efi_zone)
-               goto out_destroy_efd_zone;
-
-       xfs_inode_zone =
-               kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
-                       KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
-                       xfs_fs_inode_init_once);
-       if (!xfs_inode_zone)
-               goto out_destroy_efi_zone;
-
-       xfs_ili_zone =
-               kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
-                                       KM_ZONE_SPREAD, NULL);
-       if (!xfs_ili_zone)
-               goto out_destroy_inode_zone;
-
-       return 0;
-
- out_destroy_inode_zone:
-       kmem_zone_destroy(xfs_inode_zone);
- out_destroy_efi_zone:
-       kmem_zone_destroy(xfs_efi_zone);
- out_destroy_efd_zone:
-       kmem_zone_destroy(xfs_efd_zone);
- out_destroy_buf_item_zone:
-       kmem_zone_destroy(xfs_buf_item_zone);
- out_destroy_log_item_desc_zone:
-       kmem_zone_destroy(xfs_log_item_desc_zone);
- out_destroy_trans_zone:
-       kmem_zone_destroy(xfs_trans_zone);
- out_destroy_ifork_zone:
-       kmem_zone_destroy(xfs_ifork_zone);
- out_destroy_dabuf_zone:
-       kmem_zone_destroy(xfs_dabuf_zone);
- out_destroy_da_state_zone:
-       kmem_zone_destroy(xfs_da_state_zone);
- out_destroy_btree_cur_zone:
-       kmem_zone_destroy(xfs_btree_cur_zone);
- out_destroy_bmap_free_item_zone:
-       kmem_zone_destroy(xfs_bmap_free_item_zone);
- out_destroy_log_ticket_zone:
-       kmem_zone_destroy(xfs_log_ticket_zone);
- out_destroy_ioend_pool:
-       mempool_destroy(xfs_ioend_pool);
- out_destroy_ioend_zone:
-       kmem_zone_destroy(xfs_ioend_zone);
- out:
-       return -ENOMEM;
-}
-
-STATIC void
-xfs_destroy_zones(void)
-{
-       kmem_zone_destroy(xfs_ili_zone);
-       kmem_zone_destroy(xfs_inode_zone);
-       kmem_zone_destroy(xfs_efi_zone);
-       kmem_zone_destroy(xfs_efd_zone);
-       kmem_zone_destroy(xfs_buf_item_zone);
-       kmem_zone_destroy(xfs_log_item_desc_zone);
-       kmem_zone_destroy(xfs_trans_zone);
-       kmem_zone_destroy(xfs_ifork_zone);
-       kmem_zone_destroy(xfs_dabuf_zone);
-       kmem_zone_destroy(xfs_da_state_zone);
-       kmem_zone_destroy(xfs_btree_cur_zone);
-       kmem_zone_destroy(xfs_bmap_free_item_zone);
-       kmem_zone_destroy(xfs_log_ticket_zone);
-       mempool_destroy(xfs_ioend_pool);
-       kmem_zone_destroy(xfs_ioend_zone);
-
-}
-
-STATIC int __init
-xfs_init_workqueues(void)
-{
-       /*
-        * max_active is set to 8 to give enough concurency to allow
-        * multiple work operations on each CPU to run. This allows multiple
-        * filesystems to be running sync work concurrently, and scales with
-        * the number of CPUs in the system.
-        */
-       xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
-       if (!xfs_syncd_wq)
-               goto out;
-
-       xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
-       if (!xfs_ail_wq)
-               goto out_destroy_syncd;
-
-       return 0;
-
-out_destroy_syncd:
-       destroy_workqueue(xfs_syncd_wq);
-out:
-       return -ENOMEM;
-}
-
-STATIC void
-xfs_destroy_workqueues(void)
-{
-       destroy_workqueue(xfs_ail_wq);
-       destroy_workqueue(xfs_syncd_wq);
-}
-
-STATIC int __init
-init_xfs_fs(void)
-{
-       int                     error;
-
-       printk(KERN_INFO XFS_VERSION_STRING " with "
-                        XFS_BUILD_OPTIONS " enabled\n");
-
-       xfs_ioend_init();
-       xfs_dir_startup();
-
-       error = xfs_init_zones();
-       if (error)
-               goto out;
-
-       error = xfs_init_workqueues();
-       if (error)
-               goto out_destroy_zones;
-
-       error = xfs_mru_cache_init();
-       if (error)
-               goto out_destroy_wq;
-
-       error = xfs_filestream_init();
-       if (error)
-               goto out_mru_cache_uninit;
-
-       error = xfs_buf_init();
-       if (error)
-               goto out_filestream_uninit;
-
-       error = xfs_init_procfs();
-       if (error)
-               goto out_buf_terminate;
-
-       error = xfs_sysctl_register();
-       if (error)
-               goto out_cleanup_procfs;
-
-       vfs_initquota();
-
-       error = register_filesystem(&xfs_fs_type);
-       if (error)
-               goto out_sysctl_unregister;
-       return 0;
-
- out_sysctl_unregister:
-       xfs_sysctl_unregister();
- out_cleanup_procfs:
-       xfs_cleanup_procfs();
- out_buf_terminate:
-       xfs_buf_terminate();
- out_filestream_uninit:
-       xfs_filestream_uninit();
- out_mru_cache_uninit:
-       xfs_mru_cache_uninit();
- out_destroy_wq:
-       xfs_destroy_workqueues();
- out_destroy_zones:
-       xfs_destroy_zones();
- out:
-       return error;
-}
-
-STATIC void __exit
-exit_xfs_fs(void)
-{
-       vfs_exitquota();
-       unregister_filesystem(&xfs_fs_type);
-       xfs_sysctl_unregister();
-       xfs_cleanup_procfs();
-       xfs_buf_terminate();
-       xfs_filestream_uninit();
-       xfs_mru_cache_uninit();
-       xfs_destroy_workqueues();
-       xfs_destroy_zones();
-}
-
-module_init(init_xfs_fs);
-module_exit(exit_xfs_fs);
-
-MODULE_AUTHOR("Silicon Graphics, Inc.");
-MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
-MODULE_LICENSE("GPL");
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
deleted file mode 100644 (file)
index 50a3266..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPER_H__
-#define __XFS_SUPER_H__
-
-#include <linux/exportfs.h>
-
-#ifdef CONFIG_XFS_QUOTA
-extern void xfs_qm_init(void);
-extern void xfs_qm_exit(void);
-# define vfs_initquota()       xfs_qm_init()
-# define vfs_exitquota()       xfs_qm_exit()
-#else
-# define vfs_initquota()       do { } while (0)
-# define vfs_exitquota()       do { } while (0)
-#endif
-
-#ifdef CONFIG_XFS_POSIX_ACL
-# define XFS_ACL_STRING                "ACLs, "
-# define set_posix_acl_flag(sb)        ((sb)->s_flags |= MS_POSIXACL)
-#else
-# define XFS_ACL_STRING
-# define set_posix_acl_flag(sb)        do { } while (0)
-#endif
-
-#define XFS_SECURITY_STRING    "security attributes, "
-
-#ifdef CONFIG_XFS_RT
-# define XFS_REALTIME_STRING   "realtime, "
-#else
-# define XFS_REALTIME_STRING
-#endif
-
-#if XFS_BIG_BLKNOS
-# if XFS_BIG_INUMS
-#  define XFS_BIGFS_STRING     "large block/inode numbers, "
-# else
-#  define XFS_BIGFS_STRING     "large block numbers, "
-# endif
-#else
-# define XFS_BIGFS_STRING
-#endif
-
-#ifdef DEBUG
-# define XFS_DBG_STRING                "debug"
-#else
-# define XFS_DBG_STRING                "no debug"
-#endif
-
-#define XFS_VERSION_STRING     "SGI XFS"
-#define XFS_BUILD_OPTIONS      XFS_ACL_STRING \
-                               XFS_SECURITY_STRING \
-                               XFS_REALTIME_STRING \
-                               XFS_BIGFS_STRING \
-                               XFS_DBG_STRING /* DBG must be last */
-
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_buftarg;
-struct block_device;
-
-extern __uint64_t xfs_max_file_offset(unsigned int);
-
-extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
-
-extern const struct export_operations xfs_export_operations;
-extern const struct xattr_handler *xfs_xattr_handlers[];
-extern const struct quotactl_ops xfs_quotactl_operations;
-
-#define XFS_M(sb)              ((struct xfs_mount *)((sb)->s_fs_info))
-
-#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
deleted file mode 100644 (file)
index 4604f90..0000000
+++ /dev/null
@@ -1,1065 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_dinode.h"
-#include "xfs_error.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
-#include "xfs_inode_item.h"
-#include "xfs_quota.h"
-#include "xfs_trace.h"
-#include "xfs_fsops.h"
-
-#include <linux/kthread.h>
-#include <linux/freezer.h>
-
-struct workqueue_struct        *xfs_syncd_wq;  /* sync workqueue */
-
-/*
- * The inode lookup is done in batches to keep the amount of lock traffic and
- * radix tree lookups to a minimum. The batch size is a trade off between
- * lookup reduction and stack usage. This is in the reclaim path, so we can't
- * be too greedy.
- */
-#define XFS_LOOKUP_BATCH       32
-
-STATIC int
-xfs_inode_ag_walk_grab(
-       struct xfs_inode        *ip)
-{
-       struct inode            *inode = VFS_I(ip);
-
-       ASSERT(rcu_read_lock_held());
-
-       /*
-        * check for stale RCU freed inode
-        *
-        * If the inode has been reallocated, it doesn't matter if it's not in
-        * the AG we are walking - we are walking for writeback, so if it
-        * passes all the "valid inode" checks and is dirty, then we'll write
-        * it back anyway.  If it has been reallocated and still being
-        * initialised, the XFS_INEW check below will catch it.
-        */
-       spin_lock(&ip->i_flags_lock);
-       if (!ip->i_ino)
-               goto out_unlock_noent;
-
-       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-               goto out_unlock_noent;
-       spin_unlock(&ip->i_flags_lock);
-
-       /* nothing to sync during shutdown */
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return EFSCORRUPTED;
-
-       /* If we can't grab the inode, it must on it's way to reclaim. */
-       if (!igrab(inode))
-               return ENOENT;
-
-       if (is_bad_inode(inode)) {
-               IRELE(ip);
-               return ENOENT;
-       }
-
-       /* inode is valid */
-       return 0;
-
-out_unlock_noent:
-       spin_unlock(&ip->i_flags_lock);
-       return ENOENT;
-}
-
-STATIC int
-xfs_inode_ag_walk(
-       struct xfs_mount        *mp,
-       struct xfs_perag        *pag,
-       int                     (*execute)(struct xfs_inode *ip,
-                                          struct xfs_perag *pag, int flags),
-       int                     flags)
-{
-       uint32_t                first_index;
-       int                     last_error = 0;
-       int                     skipped;
-       int                     done;
-       int                     nr_found;
-
-restart:
-       done = 0;
-       skipped = 0;
-       first_index = 0;
-       nr_found = 0;
-       do {
-               struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-               int             error = 0;
-               int             i;
-
-               rcu_read_lock();
-               nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-                                       (void **)batch, first_index,
-                                       XFS_LOOKUP_BATCH);
-               if (!nr_found) {
-                       rcu_read_unlock();
-                       break;
-               }
-
-               /*
-                * Grab the inodes before we drop the lock. if we found
-                * nothing, nr == 0 and the loop will be skipped.
-                */
-               for (i = 0; i < nr_found; i++) {
-                       struct xfs_inode *ip = batch[i];
-
-                       if (done || xfs_inode_ag_walk_grab(ip))
-                               batch[i] = NULL;
-
-                       /*
-                        * Update the index for the next lookup. Catch
-                        * overflows into the next AG range which can occur if
-                        * we have inodes in the last block of the AG and we
-                        * are currently pointing to the last inode.
-                        *
-                        * Because we may see inodes that are from the wrong AG
-                        * due to RCU freeing and reallocation, only update the
-                        * index if it lies in this AG. It was a race that lead
-                        * us to see this inode, so another lookup from the
-                        * same index will not find it again.
-                        */
-                       if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
-                               continue;
-                       first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-                       if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-                               done = 1;
-               }
-
-               /* unlock now we've grabbed the inodes. */
-               rcu_read_unlock();
-
-               for (i = 0; i < nr_found; i++) {
-                       if (!batch[i])
-                               continue;
-                       error = execute(batch[i], pag, flags);
-                       IRELE(batch[i]);
-                       if (error == EAGAIN) {
-                               skipped++;
-                               continue;
-                       }
-                       if (error && last_error != EFSCORRUPTED)
-                               last_error = error;
-               }
-
-               /* bail out if the filesystem is corrupted.  */
-               if (error == EFSCORRUPTED)
-                       break;
-
-               cond_resched();
-
-       } while (nr_found && !done);
-
-       if (skipped) {
-               delay(1);
-               goto restart;
-       }
-       return last_error;
-}
-
-int
-xfs_inode_ag_iterator(
-       struct xfs_mount        *mp,
-       int                     (*execute)(struct xfs_inode *ip,
-                                          struct xfs_perag *pag, int flags),
-       int                     flags)
-{
-       struct xfs_perag        *pag;
-       int                     error = 0;
-       int                     last_error = 0;
-       xfs_agnumber_t          ag;
-
-       ag = 0;
-       while ((pag = xfs_perag_get(mp, ag))) {
-               ag = pag->pag_agno + 1;
-               error = xfs_inode_ag_walk(mp, pag, execute, flags);
-               xfs_perag_put(pag);
-               if (error) {
-                       last_error = error;
-                       if (error == EFSCORRUPTED)
-                               break;
-               }
-       }
-       return XFS_ERROR(last_error);
-}
-
-STATIC int
-xfs_sync_inode_data(
-       struct xfs_inode        *ip,
-       struct xfs_perag        *pag,
-       int                     flags)
-{
-       struct inode            *inode = VFS_I(ip);
-       struct address_space *mapping = inode->i_mapping;
-       int                     error = 0;
-
-       if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-               goto out_wait;
-
-       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
-               if (flags & SYNC_TRYLOCK)
-                       goto out_wait;
-               xfs_ilock(ip, XFS_IOLOCK_SHARED);
-       }
-
-       error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-                               0 : XBF_ASYNC, FI_NONE);
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-
- out_wait:
-       if (flags & SYNC_WAIT)
-               xfs_ioend_wait(ip);
-       return error;
-}
-
-STATIC int
-xfs_sync_inode_attr(
-       struct xfs_inode        *ip,
-       struct xfs_perag        *pag,
-       int                     flags)
-{
-       int                     error = 0;
-
-       xfs_ilock(ip, XFS_ILOCK_SHARED);
-       if (xfs_inode_clean(ip))
-               goto out_unlock;
-       if (!xfs_iflock_nowait(ip)) {
-               if (!(flags & SYNC_WAIT))
-                       goto out_unlock;
-               xfs_iflock(ip);
-       }
-
-       if (xfs_inode_clean(ip)) {
-               xfs_ifunlock(ip);
-               goto out_unlock;
-       }
-
-       error = xfs_iflush(ip, flags);
-
-       /*
-        * We don't want to try again on non-blocking flushes that can't run
-        * again immediately. If an inode really must be written, then that's
-        * what the SYNC_WAIT flag is for.
-        */
-       if (error == EAGAIN) {
-               ASSERT(!(flags & SYNC_WAIT));
-               error = 0;
-       }
-
- out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
-       return error;
-}
-
-/*
- * Write out pagecache data for the whole filesystem.
- */
-STATIC int
-xfs_sync_data(
-       struct xfs_mount        *mp,
-       int                     flags)
-{
-       int                     error;
-
-       ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-
-       error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
-       if (error)
-               return XFS_ERROR(error);
-
-       xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-       return 0;
-}
-
-/*
- * Write out inode metadata (attributes) for the whole filesystem.
- */
-STATIC int
-xfs_sync_attr(
-       struct xfs_mount        *mp,
-       int                     flags)
-{
-       ASSERT((flags & ~SYNC_WAIT) == 0);
-
-       return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
-}
-
-STATIC int
-xfs_sync_fsdata(
-       struct xfs_mount        *mp)
-{
-       struct xfs_buf          *bp;
-
-       /*
-        * If the buffer is pinned then push on the log so we won't get stuck
-        * waiting in the write for someone, maybe ourselves, to flush the log.
-        *
-        * Even though we just pushed the log above, we did not have the
-        * superblock buffer locked at that point so it can become pinned in
-        * between there and here.
-        */
-       bp = xfs_getsb(mp, 0);
-       if (xfs_buf_ispinned(bp))
-               xfs_log_force(mp, 0);
-
-       return xfs_bwrite(mp, bp);
-}
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem, we have
- * two phases to execute. This first phase is syncing the data before we
- * quiesce the filesystem, and the second is flushing all the inodes out after
- * we've waited for all the transactions created by the first phase to
- * complete. The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- */
-/*
- * First stage of freeze - no writers will make progress now we are here,
- * so we flush delwri and delalloc buffers here, then wait for all I/O to
- * complete.  Data is frozen at that point. Metadata is not frozen,
- * transactions can still occur here so don't bother flushing the buftarg
- * because it'll just get dirty again.
- */
-int
-xfs_quiesce_data(
-       struct xfs_mount        *mp)
-{
-       int                     error, error2 = 0;
-
-       xfs_qm_sync(mp, SYNC_TRYLOCK);
-       xfs_qm_sync(mp, SYNC_WAIT);
-
-       /* force out the newly dirtied log buffers */
-       xfs_log_force(mp, XFS_LOG_SYNC);
-
-       /* write superblock and hoover up shutdown errors */
-       error = xfs_sync_fsdata(mp);
-
-       /* make sure all delwri buffers are written out */
-       xfs_flush_buftarg(mp->m_ddev_targp, 1);
-
-       /* mark the log as covered if needed */
-       if (xfs_log_need_covered(mp))
-               error2 = xfs_fs_log_dummy(mp);
-
-       /* flush data-only devices */
-       if (mp->m_rtdev_targp)
-               XFS_bflush(mp->m_rtdev_targp);
-
-       return error ? error : error2;
-}
-
-STATIC void
-xfs_quiesce_fs(
-       struct xfs_mount        *mp)
-{
-       int     count = 0, pincount;
-
-       xfs_reclaim_inodes(mp, 0);
-       xfs_flush_buftarg(mp->m_ddev_targp, 0);
-
-       /*
-        * This loop must run at least twice.  The first instance of the loop
-        * will flush most meta data but that will generate more meta data
-        * (typically directory updates).  Which then must be flushed and
-        * logged before we can write the unmount record. We also so sync
-        * reclaim of inodes to catch any that the above delwri flush skipped.
-        */
-       do {
-               xfs_reclaim_inodes(mp, SYNC_WAIT);
-               xfs_sync_attr(mp, SYNC_WAIT);
-               pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
-               if (!pincount) {
-                       delay(50);
-                       count++;
-               }
-       } while (count < 2);
-}
-
-/*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceeding.
- */
-void
-xfs_quiesce_attr(
-       struct xfs_mount        *mp)
-{
-       int     error = 0;
-
-       /* wait for all modifications to complete */
-       while (atomic_read(&mp->m_active_trans) > 0)
-               delay(100);
-
-       /* flush inodes and push all remaining buffers out to disk */
-       xfs_quiesce_fs(mp);
-
-       /*
-        * Just warn here till VFS can correctly support
-        * read-only remount without racing.
-        */
-       WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
-       /* Push the superblock and write an unmount record */
-       error = xfs_log_sbcount(mp);
-       if (error)
-               xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
-                               "Frozen image may not be consistent.");
-       xfs_log_unmount_write(mp);
-       xfs_unmountfs_writesb(mp);
-}
-
-static void
-xfs_syncd_queue_sync(
-       struct xfs_mount        *mp)
-{
-       queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
-                               msecs_to_jiffies(xfs_syncd_centisecs * 10));
-}
-
-/*
- * Every sync period we need to unpin all items, reclaim inodes and sync
- * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle and not frozen.
- */
-STATIC void
-xfs_sync_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(to_delayed_work(work),
-                                       struct xfs_mount, m_sync_work);
-       int             error;
-
-       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
-               /* dgc: errors ignored here */
-               if (mp->m_super->s_frozen == SB_UNFROZEN &&
-                   xfs_log_need_covered(mp))
-                       error = xfs_fs_log_dummy(mp);
-               else
-                       xfs_log_force(mp, 0);
-               error = xfs_qm_sync(mp, SYNC_TRYLOCK);
-
-               /* start pushing all the metadata that is currently dirty */
-               xfs_ail_push_all(mp->m_ail);
-       }
-
-       /* queue us up again */
-       xfs_syncd_queue_sync(mp);
-}
-
-/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs syncd work default of 30s. Perhaps this should have it's own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_syncd_queue_reclaim(
-       struct xfs_mount        *mp)
-{
-
-       /*
-        * We can have inodes enter reclaim after we've shut down the syncd
-        * workqueue during unmount, so don't allow reclaim work to be queued
-        * during unmount.
-        */
-       if (!(mp->m_super->s_flags & MS_ACTIVE))
-               return;
-
-       rcu_read_lock();
-       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-               queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
-                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-       }
-       rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-STATIC void
-xfs_reclaim_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(to_delayed_work(work),
-                                       struct xfs_mount, m_reclaim_work);
-
-       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-       xfs_syncd_queue_reclaim(mp);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations.  At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room.
- *
- * Queue a new data flush if there isn't one already in progress and
- * wait for completion of the flush. This means that we only ever have one
- * inode flush in progress no matter how many ENOSPC events are occurring and
- * so will prevent the system from bogging down due to every concurrent
- * ENOSPC event scanning all the active inodes in the system for writeback.
- */
-void
-xfs_flush_inodes(
-       struct xfs_inode        *ip)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-
-       queue_work(xfs_syncd_wq, &mp->m_flush_work);
-       flush_work_sync(&mp->m_flush_work);
-}
-
-STATIC void
-xfs_flush_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(work,
-                                       struct xfs_mount, m_flush_work);
-
-       xfs_sync_data(mp, SYNC_TRYLOCK);
-       xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-}
-
-int
-xfs_syncd_init(
-       struct xfs_mount        *mp)
-{
-       INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-       INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-       INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-       xfs_syncd_queue_sync(mp);
-       xfs_syncd_queue_reclaim(mp);
-
-       return 0;
-}
-
-void
-xfs_syncd_stop(
-       struct xfs_mount        *mp)
-{
-       cancel_delayed_work_sync(&mp->m_sync_work);
-       cancel_delayed_work_sync(&mp->m_reclaim_work);
-       cancel_work_sync(&mp->m_flush_work);
-}
-
-void
-__xfs_inode_set_reclaim_tag(
-       struct xfs_perag        *pag,
-       struct xfs_inode        *ip)
-{
-       radix_tree_tag_set(&pag->pag_ici_root,
-                          XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-                          XFS_ICI_RECLAIM_TAG);
-
-       if (!pag->pag_ici_reclaimable) {
-               /* propagate the reclaim tag up into the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-
-               /* schedule periodic background inode reclaim */
-               xfs_syncd_queue_reclaim(ip->i_mount);
-
-               trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-       pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-       xfs_inode_t     *ip)
-{
-       struct xfs_mount *mp = ip->i_mount;
-       struct xfs_perag *pag;
-
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-       spin_lock(&pag->pag_ici_lock);
-       spin_lock(&ip->i_flags_lock);
-       __xfs_inode_set_reclaim_tag(pag, ip);
-       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-       spin_unlock(&ip->i_flags_lock);
-       spin_unlock(&pag->pag_ici_lock);
-       xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       pag->pag_ici_reclaimable--;
-       if (!pag->pag_ici_reclaimable) {
-               /* clear the reclaim tag from the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-               trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-}
-
-void
-__xfs_inode_clear_reclaim_tag(
-       xfs_mount_t     *mp,
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       radix_tree_tag_clear(&pag->pag_ici_root,
-                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-       __xfs_inode_clear_reclaim(pag, ip);
-}
-
-/*
- * Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
- */
-STATIC int
-xfs_reclaim_inode_grab(
-       struct xfs_inode        *ip,
-       int                     flags)
-{
-       ASSERT(rcu_read_lock_held());
-
-       /* quick check for stale RCU freed inode */
-       if (!ip->i_ino)
-               return 1;
-
-       /*
-        * do some unlocked checks first to avoid unnecessary lock traffic.
-        * The first is a flush lock check, the second is a already in reclaim
-        * check. Only do these checks if we are not going to block on locks.
-        */
-       if ((flags & SYNC_TRYLOCK) &&
-           (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
-               return 1;
-       }
-
-       /*
-        * The radix tree lock here protects a thread in xfs_iget from racing
-        * with us starting reclaim on the inode.  Once we have the
-        * XFS_IRECLAIM flag set it will not touch us.
-        *
-        * Due to RCU lookup, we may find inodes that have been freed and only
-        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
-        * aren't candidates for reclaim at all, so we must check the
-        * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
-        */
-       spin_lock(&ip->i_flags_lock);
-       if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
-           __xfs_iflags_test(ip, XFS_IRECLAIM)) {
-               /* not a reclaim candidate. */
-               spin_unlock(&ip->i_flags_lock);
-               return 1;
-       }
-       __xfs_iflags_set(ip, XFS_IRECLAIM);
-       spin_unlock(&ip->i_flags_lock);
-       return 0;
-}
-
-/*
- * Inodes in different states need to be treated differently, and the return
- * value of xfs_iflush is not sufficient to get this right. The following table
- * lists the inode states and the reclaim actions necessary for non-blocking
- * reclaim:
- *
- *
- *     inode state          iflush ret         required action
- *      ---------------      ----------         ---------------
- *     bad                     -               reclaim
- *     shutdown                EIO             unpin and reclaim
- *     clean, unpinned         0               reclaim
- *     stale, unpinned         0               reclaim
- *     clean, pinned(*)        0               requeue
- *     stale, pinned           EAGAIN          requeue
- *     dirty, delwri ok        0               requeue
- *     dirty, delwri blocked   EAGAIN          requeue
- *     dirty, sync flush       0               reclaim
- *
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * As can be seen from the table, the return value of xfs_iflush() is not
- * sufficient to correctly decide the reclaim action here. The checks in
- * xfs_iflush() might look like duplicates, but they are not.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean. The clean inode check needs to be done before flushing
- * the inode delwri otherwise we would loop forever requeuing clean inodes as
- * we cannot tell apart a successful delwri flush and a clean inode from the
- * return value of xfs_iflush().
- *
- * Note that because the inode is flushed delayed write by background
- * writeback, the flush lock may already be held here and waiting on it can
- * result in very long latencies. Hence for sync reclaims, where we wait on the
- * flush lock, the caller should push out delayed write inodes first before
- * trying to reclaim them to minimise the amount of time spent waiting. For
- * background relaim, we just requeue the inode for the next pass.
- *
- * Hence the order of actions after gaining the locks should be:
- *     bad             => reclaim
- *     shutdown        => unpin and reclaim
- *     pinned, delwri  => requeue
- *     pinned, sync    => unpin
- *     stale           => reclaim
- *     clean           => reclaim
- *     dirty, delwri   => flush and requeue
- *     dirty, sync     => flush, wait and reclaim
- */
-STATIC int
-xfs_reclaim_inode(
-       struct xfs_inode        *ip,
-       struct xfs_perag        *pag,
-       int                     sync_mode)
-{
-       int     error;
-
-restart:
-       error = 0;
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       if (!xfs_iflock_nowait(ip)) {
-               if (!(sync_mode & SYNC_WAIT))
-                       goto out;
-               xfs_iflock(ip);
-       }
-
-       if (is_bad_inode(VFS_I(ip)))
-               goto reclaim;
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-               xfs_iunpin_wait(ip);
-               goto reclaim;
-       }
-       if (xfs_ipincount(ip)) {
-               if (!(sync_mode & SYNC_WAIT)) {
-                       xfs_ifunlock(ip);
-                       goto out;
-               }
-               xfs_iunpin_wait(ip);
-       }
-       if (xfs_iflags_test(ip, XFS_ISTALE))
-               goto reclaim;
-       if (xfs_inode_clean(ip))
-               goto reclaim;
-
-       /*
-        * Now we have an inode that needs flushing.
-        *
-        * We do a nonblocking flush here even if we are doing a SYNC_WAIT
-        * reclaim as we can deadlock with inode cluster removal.
-        * xfs_ifree_cluster() can lock the inode buffer before it locks the
-        * ip->i_lock, and we are doing the exact opposite here. As a result,
-        * doing a blocking xfs_itobp() to get the cluster buffer will result
-        * in an ABBA deadlock with xfs_ifree_cluster().
-        *
-        * As xfs_ifree_cluser() must gather all inodes that are active in the
-        * cache to mark them stale, if we hit this case we don't actually want
-        * to do IO here - we want the inode marked stale so we can simply
-        * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
-        * just unlock the inode, back off and try again. Hopefully the next
-        * pass through will see the stale flag set on the inode.
-        */
-       error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
-       if (sync_mode & SYNC_WAIT) {
-               if (error == EAGAIN) {
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                       /* backoff longer than in xfs_ifree_cluster */
-                       delay(2);
-                       goto restart;
-               }
-               xfs_iflock(ip);
-               goto reclaim;
-       }
-
-       /*
-        * When we have to flush an inode but don't have SYNC_WAIT set, we
-        * flush the inode out using a delwri buffer and wait for the next
-        * call into reclaim to find it in a clean state instead of waiting for
-        * it now. We also don't return errors here - if the error is transient
-        * then the next reclaim pass will flush the inode, and if the error
-        * is permanent then the next sync reclaim will reclaim the inode and
-        * pass on the error.
-        */
-       if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-               xfs_warn(ip->i_mount,
-                       "inode 0x%llx background reclaim flush failed with %d",
-                       (long long)ip->i_ino, error);
-       }
-out:
-       xfs_iflags_clear(ip, XFS_IRECLAIM);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       /*
-        * We could return EAGAIN here to make reclaim rescan the inode tree in
-        * a short while. However, this just burns CPU time scanning the tree
-        * waiting for IO to complete and xfssyncd never goes back to the idle
-        * state. Instead, return 0 to let the next scheduled background reclaim
-        * attempt to reclaim the inode again.
-        */
-       return 0;
-
-reclaim:
-       xfs_ifunlock(ip);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-       XFS_STATS_INC(xs_ig_reclaims);
-       /*
-        * Remove the inode from the per-AG radix tree.
-        *
-        * Because radix_tree_delete won't complain even if the item was never
-        * added to the tree assert that it's been there before to catch
-        * problems with the inode life time early on.
-        */
-       spin_lock(&pag->pag_ici_lock);
-       if (!radix_tree_delete(&pag->pag_ici_root,
-                               XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
-               ASSERT(0);
-       __xfs_inode_clear_reclaim(pag, ip);
-       spin_unlock(&pag->pag_ici_lock);
-
-       /*
-        * Here we do an (almost) spurious inode lock in order to coordinate
-        * with inode cache radix tree lookups.  This is because the lookup
-        * can reference the inodes in the cache without taking references.
-        *
-        * We make that OK here by ensuring that we wait until the inode is
-        * unlocked after the lookup before we go ahead and free it.  We get
-        * both the ilock and the iolock because the code may need to drop the
-        * ilock one but will still hold the iolock.
-        */
-       xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-       xfs_qm_dqdetach(ip);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
-       xfs_inode_free(ip);
-       return error;
-
-}
-
-/*
- * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
- * corrupted, we still want to try to reclaim all the inodes. If we don't,
- * then a shut down during filesystem unmount reclaim walk leak all the
- * unreclaimed inodes.
- */
-int
-xfs_reclaim_inodes_ag(
-       struct xfs_mount        *mp,
-       int                     flags,
-       int                     *nr_to_scan)
-{
-       struct xfs_perag        *pag;
-       int                     error = 0;
-       int                     last_error = 0;
-       xfs_agnumber_t          ag;
-       int                     trylock = flags & SYNC_TRYLOCK;
-       int                     skipped;
-
-restart:
-       ag = 0;
-       skipped = 0;
-       while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-               unsigned long   first_index = 0;
-               int             done = 0;
-               int             nr_found = 0;
-
-               ag = pag->pag_agno + 1;
-
-               if (trylock) {
-                       if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
-                               skipped++;
-                               xfs_perag_put(pag);
-                               continue;
-                       }
-                       first_index = pag->pag_ici_reclaim_cursor;
-               } else
-                       mutex_lock(&pag->pag_ici_reclaim_lock);
-
-               do {
-                       struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-                       int     i;
-
-                       rcu_read_lock();
-                       nr_found = radix_tree_gang_lookup_tag(
-                                       &pag->pag_ici_root,
-                                       (void **)batch, first_index,
-                                       XFS_LOOKUP_BATCH,
-                                       XFS_ICI_RECLAIM_TAG);
-                       if (!nr_found) {
-                               done = 1;
-                               rcu_read_unlock();
-                               break;
-                       }
-
-                       /*
-                        * Grab the inodes before we drop the lock. if we found
-                        * nothing, nr == 0 and the loop will be skipped.
-                        */
-                       for (i = 0; i < nr_found; i++) {
-                               struct xfs_inode *ip = batch[i];
-
-                               if (done || xfs_reclaim_inode_grab(ip, flags))
-                                       batch[i] = NULL;
-
-                               /*
-                                * Update the index for the next lookup. Catch
-                                * overflows into the next AG range which can
-                                * occur if we have inodes in the last block of
-                                * the AG and we are currently pointing to the
-                                * last inode.
-                                *
-                                * Because we may see inodes that are from the
-                                * wrong AG due to RCU freeing and
-                                * reallocation, only update the index if it
-                                * lies in this AG. It was a race that lead us
-                                * to see this inode, so another lookup from
-                                * the same index will not find it again.
-                                */
-                               if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
-                                                               pag->pag_agno)
-                                       continue;
-                               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-                               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-                                       done = 1;
-                       }
-
-                       /* unlock now we've grabbed the inodes. */
-                       rcu_read_unlock();
-
-                       for (i = 0; i < nr_found; i++) {
-                               if (!batch[i])
-                                       continue;
-                               error = xfs_reclaim_inode(batch[i], pag, flags);
-                               if (error && last_error != EFSCORRUPTED)
-                                       last_error = error;
-                       }
-
-                       *nr_to_scan -= XFS_LOOKUP_BATCH;
-
-                       cond_resched();
-
-               } while (nr_found && !done && *nr_to_scan > 0);
-
-               if (trylock && !done)
-                       pag->pag_ici_reclaim_cursor = first_index;
-               else
-                       pag->pag_ici_reclaim_cursor = 0;
-               mutex_unlock(&pag->pag_ici_reclaim_lock);
-               xfs_perag_put(pag);
-       }
-
-       /*
-        * if we skipped any AG, and we still have scan count remaining, do
-        * another pass this time using blocking reclaim semantics (i.e
-        * waiting on the reclaim locks and ignoring the reclaim cursors). This
-        * ensure that when we get more reclaimers than AGs we block rather
-        * than spin trying to execute reclaim.
-        */
-       if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
-               trylock = 0;
-               goto restart;
-       }
-       return XFS_ERROR(last_error);
-}
-
-int
-xfs_reclaim_inodes(
-       xfs_mount_t     *mp,
-       int             mode)
-{
-       int             nr_to_scan = INT_MAX;
-
-       return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
-}
-
-/*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
- */
-void
-xfs_reclaim_inodes_nr(
-       struct xfs_mount        *mp,
-       int                     nr_to_scan)
-{
-       /* kick background reclaimer and push the AIL */
-       xfs_syncd_queue_reclaim(mp);
-       xfs_ail_push_all(mp->m_ail);
-
-       xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
-}
-
-/*
- * Return the number of reclaimable inodes in the filesystem for
- * the shrinker to determine how much to reclaim.
- */
-int
-xfs_reclaim_inodes_count(
-       struct xfs_mount        *mp)
-{
-       struct xfs_perag        *pag;
-       xfs_agnumber_t          ag = 0;
-       int                     reclaimable = 0;
-
-       while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-               ag = pag->pag_agno + 1;
-               reclaimable += pag->pag_ici_reclaimable;
-               xfs_perag_put(pag);
-       }
-       return reclaimable;
-}
-
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
deleted file mode 100644 (file)
index 941202e..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef XFS_SYNC_H
-#define XFS_SYNC_H 1
-
-struct xfs_mount;
-struct xfs_perag;
-
-#define SYNC_WAIT              0x0001  /* wait for i/o to complete */
-#define SYNC_TRYLOCK           0x0002  /* only try to lock inodes */
-
-extern struct workqueue_struct *xfs_syncd_wq;  /* sync workqueue */
-
-int xfs_syncd_init(struct xfs_mount *mp);
-void xfs_syncd_stop(struct xfs_mount *mp);
-
-int xfs_quiesce_data(struct xfs_mount *mp);
-void xfs_quiesce_attr(struct xfs_mount *mp);
-
-void xfs_flush_inodes(struct xfs_inode *ip);
-
-int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
-int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-
-void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
-                               struct xfs_inode *ip);
-
-int xfs_sync_inode_grab(struct xfs_inode *ip);
-int xfs_inode_ag_iterator(struct xfs_mount *mp,
-       int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-       int flags);
-
-#endif
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
deleted file mode 100644 (file)
index ee2d2ad..0000000
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2001-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-#include "xfs_error.h"
-
-static struct ctl_table_header *xfs_table_header;
-
-#ifdef CONFIG_PROC_FS
-STATIC int
-xfs_stats_clear_proc_handler(
-       ctl_table       *ctl,
-       int             write,
-       void            __user *buffer,
-       size_t          *lenp,
-       loff_t          *ppos)
-{
-       int             c, ret, *valp = ctl->data;
-       __uint32_t      vn_active;
-
-       ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
-
-       if (!ret && write && *valp) {
-               xfs_notice(NULL, "Clearing xfsstats");
-               for_each_possible_cpu(c) {
-                       preempt_disable();
-                       /* save vn_active, it's a universal truth! */
-                       vn_active = per_cpu(xfsstats, c).vn_active;
-                       memset(&per_cpu(xfsstats, c), 0,
-                              sizeof(struct xfsstats));
-                       per_cpu(xfsstats, c).vn_active = vn_active;
-                       preempt_enable();
-               }
-               xfs_stats_clear = 0;
-       }
-
-       return ret;
-}
-
-STATIC int
-xfs_panic_mask_proc_handler(
-       ctl_table       *ctl,
-       int             write,
-       void            __user *buffer,
-       size_t          *lenp,
-       loff_t          *ppos)
-{
-       int             ret, *valp = ctl->data;
-
-       ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
-       if (!ret && write) {
-               xfs_panic_mask = *valp;
-#ifdef DEBUG
-               xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
-#endif
-       }
-       return ret;
-}
-#endif /* CONFIG_PROC_FS */
-
-static ctl_table xfs_table[] = {
-       {
-               .procname       = "irix_sgid_inherit",
-               .data           = &xfs_params.sgid_inherit.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.sgid_inherit.min,
-               .extra2         = &xfs_params.sgid_inherit.max
-       },
-       {
-               .procname       = "irix_symlink_mode",
-               .data           = &xfs_params.symlink_mode.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.symlink_mode.min,
-               .extra2         = &xfs_params.symlink_mode.max
-       },
-       {
-               .procname       = "panic_mask",
-               .data           = &xfs_params.panic_mask.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = xfs_panic_mask_proc_handler,
-               .extra1         = &xfs_params.panic_mask.min,
-               .extra2         = &xfs_params.panic_mask.max
-       },
-
-       {
-               .procname       = "error_level",
-               .data           = &xfs_params.error_level.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.error_level.min,
-               .extra2         = &xfs_params.error_level.max
-       },
-       {
-               .procname       = "xfssyncd_centisecs",
-               .data           = &xfs_params.syncd_timer.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.syncd_timer.min,
-               .extra2         = &xfs_params.syncd_timer.max
-       },
-       {
-               .procname       = "inherit_sync",
-               .data           = &xfs_params.inherit_sync.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.inherit_sync.min,
-               .extra2         = &xfs_params.inherit_sync.max
-       },
-       {
-               .procname       = "inherit_nodump",
-               .data           = &xfs_params.inherit_nodump.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.inherit_nodump.min,
-               .extra2         = &xfs_params.inherit_nodump.max
-       },
-       {
-               .procname       = "inherit_noatime",
-               .data           = &xfs_params.inherit_noatim.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.inherit_noatim.min,
-               .extra2         = &xfs_params.inherit_noatim.max
-       },
-       {
-               .procname       = "xfsbufd_centisecs",
-               .data           = &xfs_params.xfs_buf_timer.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.xfs_buf_timer.min,
-               .extra2         = &xfs_params.xfs_buf_timer.max
-       },
-       {
-               .procname       = "age_buffer_centisecs",
-               .data           = &xfs_params.xfs_buf_age.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.xfs_buf_age.min,
-               .extra2         = &xfs_params.xfs_buf_age.max
-       },
-       {
-               .procname       = "inherit_nosymlinks",
-               .data           = &xfs_params.inherit_nosym.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.inherit_nosym.min,
-               .extra2         = &xfs_params.inherit_nosym.max
-       },
-       {
-               .procname       = "rotorstep",
-               .data           = &xfs_params.rotorstep.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.rotorstep.min,
-               .extra2         = &xfs_params.rotorstep.max
-       },
-       {
-               .procname       = "inherit_nodefrag",
-               .data           = &xfs_params.inherit_nodfrg.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.inherit_nodfrg.min,
-               .extra2         = &xfs_params.inherit_nodfrg.max
-       },
-       {
-               .procname       = "filestream_centisecs",
-               .data           = &xfs_params.fstrm_timer.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &xfs_params.fstrm_timer.min,
-               .extra2         = &xfs_params.fstrm_timer.max,
-       },
-       /* please keep this the last entry */
-#ifdef CONFIG_PROC_FS
-       {
-               .procname       = "stats_clear",
-               .data           = &xfs_params.stats_clear.val,
-               .maxlen         = sizeof(int),
-               .mode           = 0644,
-               .proc_handler   = xfs_stats_clear_proc_handler,
-               .extra1         = &xfs_params.stats_clear.min,
-               .extra2         = &xfs_params.stats_clear.max
-       },
-#endif /* CONFIG_PROC_FS */
-
-       {}
-};
-
-static ctl_table xfs_dir_table[] = {
-       {
-               .procname       = "xfs",
-               .mode           = 0555,
-               .child          = xfs_table
-       },
-       {}
-};
-
-static ctl_table xfs_root_table[] = {
-       {
-               .procname       = "fs",
-               .mode           = 0555,
-               .child          = xfs_dir_table
-       },
-       {}
-};
-
-int
-xfs_sysctl_register(void)
-{
-       xfs_table_header = register_sysctl_table(xfs_root_table);
-       if (!xfs_table_header)
-               return -ENOMEM;
-       return 0;
-}
-
-void
-xfs_sysctl_unregister(void)
-{
-       unregister_sysctl_table(xfs_table_header);
-}
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
deleted file mode 100644 (file)
index b9937d4..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2001-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SYSCTL_H__
-#define __XFS_SYSCTL_H__
-
-#include <linux/sysctl.h>
-
-/*
- * Tunable xfs parameters
- */
-
-typedef struct xfs_sysctl_val {
-       int min;
-       int val;
-       int max;
-} xfs_sysctl_val_t;
-
-typedef struct xfs_param {
-       xfs_sysctl_val_t sgid_inherit;  /* Inherit S_ISGID if process' GID is
-                                        * not a member of parent dir GID. */
-       xfs_sysctl_val_t symlink_mode;  /* Link creat mode affected by umask */
-       xfs_sysctl_val_t panic_mask;    /* bitmask to cause panic on errors. */
-       xfs_sysctl_val_t error_level;   /* Degree of reporting for problems  */
-       xfs_sysctl_val_t syncd_timer;   /* Interval between xfssyncd wakeups */
-       xfs_sysctl_val_t stats_clear;   /* Reset all XFS statistics to zero. */
-       xfs_sysctl_val_t inherit_sync;  /* Inherit the "sync" inode flag. */
-       xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
-       xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
-       xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */
-       xfs_sysctl_val_t xfs_buf_age;   /* Metadata buffer age before flush. */
-       xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
-       xfs_sysctl_val_t rotorstep;     /* inode32 AG rotoring control knob */
-       xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
-       xfs_sysctl_val_t fstrm_timer;   /* Filestream dir-AG assoc'n timeout. */
-} xfs_param_t;
-
-/*
- * xfs_error_level:
- *
- * How much error reporting will be done when internal problems are
- * encountered.  These problems normally return an EFSCORRUPTED to their
- * caller, with no other information reported.
- *
- * 0   No error reports
- * 1   Report EFSCORRUPTED errors that will cause a filesystem shutdown
- * 5   Report all EFSCORRUPTED errors (all of the above errors, plus any
- *     additional errors that are known to not cause shutdowns)
- *
- * xfs_panic_mask bit 0x8 turns the error reports into panics
- */
-
-enum {
-       /* XFS_REFCACHE_SIZE = 1 */
-       /* XFS_REFCACHE_PURGE = 2 */
-       /* XFS_RESTRICT_CHOWN = 3 */
-       XFS_SGID_INHERIT = 4,
-       XFS_SYMLINK_MODE = 5,
-       XFS_PANIC_MASK = 6,
-       XFS_ERRLEVEL = 7,
-       XFS_SYNCD_TIMER = 8,
-       /* XFS_PROBE_DMAPI = 9 */
-       /* XFS_PROBE_IOOPS = 10 */
-       /* XFS_PROBE_QUOTA = 11 */
-       XFS_STATS_CLEAR = 12,
-       XFS_INHERIT_SYNC = 13,
-       XFS_INHERIT_NODUMP = 14,
-       XFS_INHERIT_NOATIME = 15,
-       XFS_BUF_TIMER = 16,
-       XFS_BUF_AGE = 17,
-       /* XFS_IO_BYPASS = 18 */
-       XFS_INHERIT_NOSYM = 19,
-       XFS_ROTORSTEP = 20,
-       XFS_INHERIT_NODFRG = 21,
-       XFS_FILESTREAM_TIMER = 22,
-};
-
-extern xfs_param_t     xfs_params;
-
-#ifdef CONFIG_SYSCTL
-extern int xfs_sysctl_register(void);
-extern void xfs_sysctl_unregister(void);
-#else
-# define xfs_sysctl_register()         (0)
-# define xfs_sysctl_unregister()       do { } while (0)
-#endif /* CONFIG_SYSCTL */
-
-#endif /* __XFS_SYSCTL_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
deleted file mode 100644 (file)
index 88d25d4..0000000
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2009, Christoph Hellwig
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_mount.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_alloc.h"
-#include "xfs_bmap.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
-#include "xfs_quota.h"
-#include "xfs_iomap.h"
-#include "xfs_aops.h"
-#include "quota/xfs_dquot_item.h"
-#include "quota/xfs_dquot.h"
-#include "xfs_log_recover.h"
-#include "xfs_inode_item.h"
-
-/*
- * We include this last to have the helpers above available for the trace
- * event implementations.
- */
-#define CREATE_TRACE_POINTS
-#include "xfs_trace.h"
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
deleted file mode 100644 (file)
index 690fc7a..0000000
+++ /dev/null
@@ -1,1746 +0,0 @@
-/*
- * Copyright (c) 2009, Christoph Hellwig
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM xfs
-
-#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ)
-#define _TRACE_XFS_H
-
-#include <linux/tracepoint.h>
-
-struct xfs_agf;
-struct xfs_alloc_arg;
-struct xfs_attr_list_context;
-struct xfs_buf_log_item;
-struct xfs_da_args;
-struct xfs_da_node_entry;
-struct xfs_dquot;
-struct xlog_ticket;
-struct log;
-struct xlog_recover;
-struct xlog_recover_item;
-struct xfs_buf_log_format;
-struct xfs_inode_log_format;
-
-DECLARE_EVENT_CLASS(xfs_attr_list_class,
-       TP_PROTO(struct xfs_attr_list_context *ctx),
-       TP_ARGS(ctx),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(u32, hashval)
-               __field(u32, blkno)
-               __field(u32, offset)
-               __field(void *, alist)
-               __field(int, bufsize)
-               __field(int, count)
-               __field(int, firstu)
-               __field(int, dupcnt)
-               __field(int, flags)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
-               __entry->ino = ctx->dp->i_ino;
-               __entry->hashval = ctx->cursor->hashval;
-               __entry->blkno = ctx->cursor->blkno;
-               __entry->offset = ctx->cursor->offset;
-               __entry->alist = ctx->alist;
-               __entry->bufsize = ctx->bufsize;
-               __entry->count = ctx->count;
-               __entry->firstu = ctx->firstu;
-               __entry->flags = ctx->flags;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
-                 "alist 0x%p size %u count %u firstu %u flags %d %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                  __entry->ino,
-                  __entry->hashval,
-                  __entry->blkno,
-                  __entry->offset,
-                  __entry->dupcnt,
-                  __entry->alist,
-                  __entry->bufsize,
-                  __entry->count,
-                  __entry->firstu,
-                  __entry->flags,
-                  __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
-       )
-)
-
-#define DEFINE_ATTR_LIST_EVENT(name) \
-DEFINE_EVENT(xfs_attr_list_class, name, \
-       TP_PROTO(struct xfs_attr_list_context *ctx), \
-       TP_ARGS(ctx))
-DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
-DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
-DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
-DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
-DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
-DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
-DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
-DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
-
-DECLARE_EVENT_CLASS(xfs_perag_class,
-       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
-                unsigned long caller_ip),
-       TP_ARGS(mp, agno, refcount, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_agnumber_t, agno)
-               __field(int, refcount)
-               __field(unsigned long, caller_ip)
-       ),
-       TP_fast_assign(
-               __entry->dev = mp->m_super->s_dev;
-               __entry->agno = agno;
-               __entry->refcount = refcount;
-               __entry->caller_ip = caller_ip;
-       ),
-       TP_printk("dev %d:%d agno %u refcount %d caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->agno,
-                 __entry->refcount,
-                 (char *)__entry->caller_ip)
-);
-
-#define DEFINE_PERAG_REF_EVENT(name)   \
-DEFINE_EVENT(xfs_perag_class, name,    \
-       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,       \
-                unsigned long caller_ip),                                      \
-       TP_ARGS(mp, agno, refcount, caller_ip))
-DEFINE_PERAG_REF_EVENT(xfs_perag_get);
-DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
-DEFINE_PERAG_REF_EVENT(xfs_perag_put);
-DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
-DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
-
-TRACE_EVENT(xfs_attr_list_node_descend,
-       TP_PROTO(struct xfs_attr_list_context *ctx,
-                struct xfs_da_node_entry *btree),
-       TP_ARGS(ctx, btree),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(u32, hashval)
-               __field(u32, blkno)
-               __field(u32, offset)
-               __field(void *, alist)
-               __field(int, bufsize)
-               __field(int, count)
-               __field(int, firstu)
-               __field(int, dupcnt)
-               __field(int, flags)
-               __field(u32, bt_hashval)
-               __field(u32, bt_before)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
-               __entry->ino = ctx->dp->i_ino;
-               __entry->hashval = ctx->cursor->hashval;
-               __entry->blkno = ctx->cursor->blkno;
-               __entry->offset = ctx->cursor->offset;
-               __entry->alist = ctx->alist;
-               __entry->bufsize = ctx->bufsize;
-               __entry->count = ctx->count;
-               __entry->firstu = ctx->firstu;
-               __entry->flags = ctx->flags;
-               __entry->bt_hashval = be32_to_cpu(btree->hashval);
-               __entry->bt_before = be32_to_cpu(btree->before);
-       ),
-       TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
-                 "alist 0x%p size %u count %u firstu %u flags %d %s "
-                 "node hashval %u, node before %u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                  __entry->ino,
-                  __entry->hashval,
-                  __entry->blkno,
-                  __entry->offset,
-                  __entry->dupcnt,
-                  __entry->alist,
-                  __entry->bufsize,
-                  __entry->count,
-                  __entry->firstu,
-                  __entry->flags,
-                  __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
-                  __entry->bt_hashval,
-                  __entry->bt_before)
-);
-
-TRACE_EVENT(xfs_iext_insert,
-       TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
-                struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
-       TP_ARGS(ip, idx, r, state, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_extnum_t, idx)
-               __field(xfs_fileoff_t, startoff)
-               __field(xfs_fsblock_t, startblock)
-               __field(xfs_filblks_t, blockcount)
-               __field(xfs_exntst_t, state)
-               __field(int, bmap_state)
-               __field(unsigned long, caller_ip)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->idx = idx;
-               __entry->startoff = r->br_startoff;
-               __entry->startblock = r->br_startblock;
-               __entry->blockcount = r->br_blockcount;
-               __entry->state = r->br_state;
-               __entry->bmap_state = state;
-               __entry->caller_ip = caller_ip;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
-                 "offset %lld block %lld count %lld flag %d caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
-                 (long)__entry->idx,
-                 __entry->startoff,
-                 (__int64_t)__entry->startblock,
-                 __entry->blockcount,
-                 __entry->state,
-                 (char *)__entry->caller_ip)
-);
-
-DECLARE_EVENT_CLASS(xfs_bmap_class,
-       TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
-                unsigned long caller_ip),
-       TP_ARGS(ip, idx, state, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_extnum_t, idx)
-               __field(xfs_fileoff_t, startoff)
-               __field(xfs_fsblock_t, startblock)
-               __field(xfs_filblks_t, blockcount)
-               __field(xfs_exntst_t, state)
-               __field(int, bmap_state)
-               __field(unsigned long, caller_ip)
-       ),
-       TP_fast_assign(
-               struct xfs_ifork        *ifp = (state & BMAP_ATTRFORK) ?
-                                               ip->i_afp : &ip->i_df;
-               struct xfs_bmbt_irec    r;
-
-               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->idx = idx;
-               __entry->startoff = r.br_startoff;
-               __entry->startblock = r.br_startblock;
-               __entry->blockcount = r.br_blockcount;
-               __entry->state = r.br_state;
-               __entry->bmap_state = state;
-               __entry->caller_ip = caller_ip;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
-                 "offset %lld block %lld count %lld flag %d caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
-                 (long)__entry->idx,
-                 __entry->startoff,
-                 (__int64_t)__entry->startblock,
-                 __entry->blockcount,
-                 __entry->state,
-                 (char *)__entry->caller_ip)
-)
-
-#define DEFINE_BMAP_EVENT(name) \
-DEFINE_EVENT(xfs_bmap_class, name, \
-       TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
-                unsigned long caller_ip), \
-       TP_ARGS(ip, idx, state, caller_ip))
-DEFINE_BMAP_EVENT(xfs_iext_remove);
-DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
-DEFINE_BMAP_EVENT(xfs_bmap_post_update);
-DEFINE_BMAP_EVENT(xfs_extlist);
-
-DECLARE_EVENT_CLASS(xfs_buf_class,
-       TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
-       TP_ARGS(bp, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_daddr_t, bno)
-               __field(size_t, buffer_length)
-               __field(int, hold)
-               __field(int, pincount)
-               __field(unsigned, lockval)
-               __field(unsigned, flags)
-               __field(unsigned long, caller_ip)
-       ),
-       TP_fast_assign(
-               __entry->dev = bp->b_target->bt_dev;
-               __entry->bno = bp->b_bn;
-               __entry->buffer_length = bp->b_buffer_length;
-               __entry->hold = atomic_read(&bp->b_hold);
-               __entry->pincount = atomic_read(&bp->b_pin_count);
-               __entry->lockval = bp->b_sema.count;
-               __entry->flags = bp->b_flags;
-               __entry->caller_ip = caller_ip;
-       ),
-       TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
-                 "lock %d flags %s caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long long)__entry->bno,
-                 __entry->buffer_length,
-                 __entry->hold,
-                 __entry->pincount,
-                 __entry->lockval,
-                 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
-                 (void *)__entry->caller_ip)
-)
-
-#define DEFINE_BUF_EVENT(name) \
-DEFINE_EVENT(xfs_buf_class, name, \
-       TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
-       TP_ARGS(bp, caller_ip))
-DEFINE_BUF_EVENT(xfs_buf_init);
-DEFINE_BUF_EVENT(xfs_buf_free);
-DEFINE_BUF_EVENT(xfs_buf_hold);
-DEFINE_BUF_EVENT(xfs_buf_rele);
-DEFINE_BUF_EVENT(xfs_buf_iodone);
-DEFINE_BUF_EVENT(xfs_buf_iorequest);
-DEFINE_BUF_EVENT(xfs_buf_bawrite);
-DEFINE_BUF_EVENT(xfs_buf_bdwrite);
-DEFINE_BUF_EVENT(xfs_buf_lock);
-DEFINE_BUF_EVENT(xfs_buf_lock_done);
-DEFINE_BUF_EVENT(xfs_buf_trylock);
-DEFINE_BUF_EVENT(xfs_buf_unlock);
-DEFINE_BUF_EVENT(xfs_buf_iowait);
-DEFINE_BUF_EVENT(xfs_buf_iowait_done);
-DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
-DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
-DEFINE_BUF_EVENT(xfs_buf_delwri_split);
-DEFINE_BUF_EVENT(xfs_buf_get_uncached);
-DEFINE_BUF_EVENT(xfs_bdstrat_shut);
-DEFINE_BUF_EVENT(xfs_buf_item_relse);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone);
-DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
-DEFINE_BUF_EVENT(xfs_buf_error_relse);
-DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
-DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
-
-/* not really buffer traces, but the buf provides useful information */
-DEFINE_BUF_EVENT(xfs_btree_corrupt);
-DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
-DEFINE_BUF_EVENT(xfs_reset_dqcounts);
-DEFINE_BUF_EVENT(xfs_inode_item_push);
-
-/* pass flags explicitly */
-DECLARE_EVENT_CLASS(xfs_buf_flags_class,
-       TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip),
-       TP_ARGS(bp, flags, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_daddr_t, bno)
-               __field(size_t, buffer_length)
-               __field(int, hold)
-               __field(int, pincount)
-               __field(unsigned, lockval)
-               __field(unsigned, flags)
-               __field(unsigned long, caller_ip)
-       ),
-       TP_fast_assign(
-               __entry->dev = bp->b_target->bt_dev;
-               __entry->bno = bp->b_bn;
-               __entry->buffer_length = bp->b_buffer_length;
-               __entry->flags = flags;
-               __entry->hold = atomic_read(&bp->b_hold);
-               __entry->pincount = atomic_read(&bp->b_pin_count);
-               __entry->lockval = bp->b_sema.count;
-               __entry->caller_ip = caller_ip;
-       ),
-       TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
-                 "lock %d flags %s caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long long)__entry->bno,
-                 __entry->buffer_length,
-                 __entry->hold,
-                 __entry->pincount,
-                 __entry->lockval,
-                 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
-                 (void *)__entry->caller_ip)
-)
-
-#define DEFINE_BUF_FLAGS_EVENT(name) \
-DEFINE_EVENT(xfs_buf_flags_class, name, \
-       TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
-       TP_ARGS(bp, flags, caller_ip))
-DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
-DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
-DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
-
-TRACE_EVENT(xfs_buf_ioerror,
-       TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip),
-       TP_ARGS(bp, error, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_daddr_t, bno)
-               __field(size_t, buffer_length)
-               __field(unsigned, flags)
-               __field(int, hold)
-               __field(int, pincount)
-               __field(unsigned, lockval)
-               __field(int, error)
-               __field(unsigned long, caller_ip)
-       ),
-       TP_fast_assign(
-               __entry->dev = bp->b_target->bt_dev;
-               __entry->bno = bp->b_bn;
-               __entry->buffer_length = bp->b_buffer_length;
-               __entry->hold = atomic_read(&bp->b_hold);
-               __entry->pincount = atomic_read(&bp->b_pin_count);
-               __entry->lockval = bp->b_sema.count;
-               __entry->error = error;
-               __entry->flags = bp->b_flags;
-               __entry->caller_ip = caller_ip;
-       ),
-       TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
-                 "lock %d error %d flags %s caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long long)__entry->bno,
-                 __entry->buffer_length,
-                 __entry->hold,
-                 __entry->pincount,
-                 __entry->lockval,
-                 __entry->error,
-                 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
-                 (void *)__entry->caller_ip)
-);
-
-DECLARE_EVENT_CLASS(xfs_buf_item_class,
-       TP_PROTO(struct xfs_buf_log_item *bip),
-       TP_ARGS(bip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_daddr_t, buf_bno)
-               __field(size_t, buf_len)
-               __field(int, buf_hold)
-               __field(int, buf_pincount)
-               __field(int, buf_lockval)
-               __field(unsigned, buf_flags)
-               __field(unsigned, bli_recur)
-               __field(int, bli_refcount)
-               __field(unsigned, bli_flags)
-               __field(void *, li_desc)
-               __field(unsigned, li_flags)
-       ),
-       TP_fast_assign(
-               __entry->dev = bip->bli_buf->b_target->bt_dev;
-               __entry->bli_flags = bip->bli_flags;
-               __entry->bli_recur = bip->bli_recur;
-               __entry->bli_refcount = atomic_read(&bip->bli_refcount);
-               __entry->buf_bno = bip->bli_buf->b_bn;
-               __entry->buf_len = bip->bli_buf->b_buffer_length;
-               __entry->buf_flags = bip->bli_buf->b_flags;
-               __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
-               __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
-               __entry->buf_lockval = bip->bli_buf->b_sema.count;
-               __entry->li_desc = bip->bli_item.li_desc;
-               __entry->li_flags = bip->bli_item.li_flags;
-       ),
-       TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
-                 "lock %d flags %s recur %d refcount %d bliflags %s "
-                 "lidesc 0x%p liflags %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 (unsigned long long)__entry->buf_bno,
-                 __entry->buf_len,
-                 __entry->buf_hold,
-                 __entry->buf_pincount,
-                 __entry->buf_lockval,
-                 __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS),
-                 __entry->bli_recur,
-                 __entry->bli_refcount,
-                 __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
-                 __entry->li_desc,
-                 __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
-)
-
-#define DEFINE_BUF_ITEM_EVENT(name) \
-DEFINE_EVENT(xfs_buf_item_class, name, \
-       TP_PROTO(struct xfs_buf_log_item *bip), \
-       TP_ARGS(bip))
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
-
-DECLARE_EVENT_CLASS(xfs_lock_class,
-       TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
-                unsigned long caller_ip),
-       TP_ARGS(ip,  lock_flags, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(int, lock_flags)
-               __field(unsigned long, caller_ip)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->lock_flags = lock_flags;
-               __entry->caller_ip = caller_ip;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
-                 (void *)__entry->caller_ip)
-)
-
-#define DEFINE_LOCK_EVENT(name) \
-DEFINE_EVENT(xfs_lock_class, name, \
-       TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
-                unsigned long caller_ip), \
-       TP_ARGS(ip,  lock_flags, caller_ip))
-DEFINE_LOCK_EVENT(xfs_ilock);
-DEFINE_LOCK_EVENT(xfs_ilock_nowait);
-DEFINE_LOCK_EVENT(xfs_ilock_demote);
-DEFINE_LOCK_EVENT(xfs_iunlock);
-
-DECLARE_EVENT_CLASS(xfs_inode_class,
-       TP_PROTO(struct xfs_inode *ip),
-       TP_ARGS(ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino)
-)
-
-#define DEFINE_INODE_EVENT(name) \
-DEFINE_EVENT(xfs_inode_class, name, \
-       TP_PROTO(struct xfs_inode *ip), \
-       TP_ARGS(ip))
-DEFINE_INODE_EVENT(xfs_iget_skip);
-DEFINE_INODE_EVENT(xfs_iget_reclaim);
-DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
-DEFINE_INODE_EVENT(xfs_iget_hit);
-DEFINE_INODE_EVENT(xfs_iget_miss);
-
-DEFINE_INODE_EVENT(xfs_getattr);
-DEFINE_INODE_EVENT(xfs_setattr);
-DEFINE_INODE_EVENT(xfs_readlink);
-DEFINE_INODE_EVENT(xfs_alloc_file_space);
-DEFINE_INODE_EVENT(xfs_free_file_space);
-DEFINE_INODE_EVENT(xfs_readdir);
-#ifdef CONFIG_XFS_POSIX_ACL
-DEFINE_INODE_EVENT(xfs_get_acl);
-#endif
-DEFINE_INODE_EVENT(xfs_vm_bmap);
-DEFINE_INODE_EVENT(xfs_file_ioctl);
-DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
-DEFINE_INODE_EVENT(xfs_ioctl_setattr);
-DEFINE_INODE_EVENT(xfs_file_fsync);
-DEFINE_INODE_EVENT(xfs_destroy_inode);
-DEFINE_INODE_EVENT(xfs_write_inode);
-DEFINE_INODE_EVENT(xfs_evict_inode);
-
-DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
-DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
-
-DECLARE_EVENT_CLASS(xfs_iref_class,
-       TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
-       TP_ARGS(ip, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(int, count)
-               __field(int, pincount)
-               __field(unsigned long, caller_ip)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->count = atomic_read(&VFS_I(ip)->i_count);
-               __entry->pincount = atomic_read(&ip->i_pincount);
-               __entry->caller_ip = caller_ip;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->count,
-                 __entry->pincount,
-                 (char *)__entry->caller_ip)
-)
-
-#define DEFINE_IREF_EVENT(name) \
-DEFINE_EVENT(xfs_iref_class, name, \
-       TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
-       TP_ARGS(ip, caller_ip))
-DEFINE_IREF_EVENT(xfs_ihold);
-DEFINE_IREF_EVENT(xfs_irele);
-DEFINE_IREF_EVENT(xfs_inode_pin);
-DEFINE_IREF_EVENT(xfs_inode_unpin);
-DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
-
-DECLARE_EVENT_CLASS(xfs_namespace_class,
-       TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
-       TP_ARGS(dp, name),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, dp_ino)
-               __dynamic_array(char, name, name->len)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(dp)->i_sb->s_dev;
-               __entry->dp_ino = dp->i_ino;
-               memcpy(__get_str(name), name->name, name->len);
-       ),
-       TP_printk("dev %d:%d dp ino 0x%llx name %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->dp_ino,
-                 __get_str(name))
-)
-
-#define DEFINE_NAMESPACE_EVENT(name) \
-DEFINE_EVENT(xfs_namespace_class, name, \
-       TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
-       TP_ARGS(dp, name))
-DEFINE_NAMESPACE_EVENT(xfs_remove);
-DEFINE_NAMESPACE_EVENT(xfs_link);
-DEFINE_NAMESPACE_EVENT(xfs_lookup);
-DEFINE_NAMESPACE_EVENT(xfs_create);
-DEFINE_NAMESPACE_EVENT(xfs_symlink);
-
-TRACE_EVENT(xfs_rename,
-       TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
-                struct xfs_name *src_name, struct xfs_name *target_name),
-       TP_ARGS(src_dp, target_dp, src_name, target_name),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, src_dp_ino)
-               __field(xfs_ino_t, target_dp_ino)
-               __dynamic_array(char, src_name, src_name->len)
-               __dynamic_array(char, target_name, target_name->len)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(src_dp)->i_sb->s_dev;
-               __entry->src_dp_ino = src_dp->i_ino;
-               __entry->target_dp_ino = target_dp->i_ino;
-               memcpy(__get_str(src_name), src_name->name, src_name->len);
-               memcpy(__get_str(target_name), target_name->name, target_name->len);
-       ),
-       TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
-                 " src name %s target name %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->src_dp_ino,
-                 __entry->target_dp_ino,
-                 __get_str(src_name),
-                 __get_str(target_name))
-)
-
-DECLARE_EVENT_CLASS(xfs_dquot_class,
-       TP_PROTO(struct xfs_dquot *dqp),
-       TP_ARGS(dqp),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(u32, id)
-               __field(unsigned, flags)
-               __field(unsigned, nrefs)
-               __field(unsigned long long, res_bcount)
-               __field(unsigned long long, bcount)
-               __field(unsigned long long, icount)
-               __field(unsigned long long, blk_hardlimit)
-               __field(unsigned long long, blk_softlimit)
-               __field(unsigned long long, ino_hardlimit)
-               __field(unsigned long long, ino_softlimit)
-       ), \
-       TP_fast_assign(
-               __entry->dev = dqp->q_mount->m_super->s_dev;
-               __entry->id = be32_to_cpu(dqp->q_core.d_id);
-               __entry->flags = dqp->dq_flags;
-               __entry->nrefs = dqp->q_nrefs;
-               __entry->res_bcount = dqp->q_res_bcount;
-               __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
-               __entry->icount = be64_to_cpu(dqp->q_core.d_icount);
-               __entry->blk_hardlimit =
-                       be64_to_cpu(dqp->q_core.d_blk_hardlimit);
-               __entry->blk_softlimit =
-                       be64_to_cpu(dqp->q_core.d_blk_softlimit);
-               __entry->ino_hardlimit =
-                       be64_to_cpu(dqp->q_core.d_ino_hardlimit);
-               __entry->ino_softlimit =
-                       be64_to_cpu(dqp->q_core.d_ino_softlimit);
-       ),
-       TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
-                 "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
-                 "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->id,
-                 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
-                 __entry->nrefs,
-                 __entry->res_bcount,
-                 __entry->bcount,
-                 __entry->blk_hardlimit,
-                 __entry->blk_softlimit,
-                 __entry->icount,
-                 __entry->ino_hardlimit,
-                 __entry->ino_softlimit)
-)
-
-#define DEFINE_DQUOT_EVENT(name) \
-DEFINE_EVENT(xfs_dquot_class, name, \
-       TP_PROTO(struct xfs_dquot *dqp), \
-       TP_ARGS(dqp))
-DEFINE_DQUOT_EVENT(xfs_dqadjust);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
-DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
-DEFINE_DQUOT_EVENT(xfs_dqattach_found);
-DEFINE_DQUOT_EVENT(xfs_dqattach_get);
-DEFINE_DQUOT_EVENT(xfs_dqinit);
-DEFINE_DQUOT_EVENT(xfs_dqreuse);
-DEFINE_DQUOT_EVENT(xfs_dqalloc);
-DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
-DEFINE_DQUOT_EVENT(xfs_dqread);
-DEFINE_DQUOT_EVENT(xfs_dqread_fail);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
-DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
-DEFINE_DQUOT_EVENT(xfs_dqget_hit);
-DEFINE_DQUOT_EVENT(xfs_dqget_miss);
-DEFINE_DQUOT_EVENT(xfs_dqput);
-DEFINE_DQUOT_EVENT(xfs_dqput_wait);
-DEFINE_DQUOT_EVENT(xfs_dqput_free);
-DEFINE_DQUOT_EVENT(xfs_dqrele);
-DEFINE_DQUOT_EVENT(xfs_dqflush);
-DEFINE_DQUOT_EVENT(xfs_dqflush_force);
-DEFINE_DQUOT_EVENT(xfs_dqflush_done);
-
-DECLARE_EVENT_CLASS(xfs_loggrant_class,
-       TP_PROTO(struct log *log, struct xlog_ticket *tic),
-       TP_ARGS(log, tic),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(unsigned, trans_type)
-               __field(char, ocnt)
-               __field(char, cnt)
-               __field(int, curr_res)
-               __field(int, unit_res)
-               __field(unsigned int, flags)
-               __field(int, reserveq)
-               __field(int, writeq)
-               __field(int, grant_reserve_cycle)
-               __field(int, grant_reserve_bytes)
-               __field(int, grant_write_cycle)
-               __field(int, grant_write_bytes)
-               __field(int, curr_cycle)
-               __field(int, curr_block)
-               __field(xfs_lsn_t, tail_lsn)
-       ),
-       TP_fast_assign(
-               __entry->dev = log->l_mp->m_super->s_dev;
-               __entry->trans_type = tic->t_trans_type;
-               __entry->ocnt = tic->t_ocnt;
-               __entry->cnt = tic->t_cnt;
-               __entry->curr_res = tic->t_curr_res;
-               __entry->unit_res = tic->t_unit_res;
-               __entry->flags = tic->t_flags;
-               __entry->reserveq = list_empty(&log->l_reserveq);
-               __entry->writeq = list_empty(&log->l_writeq);
-               xlog_crack_grant_head(&log->l_grant_reserve_head,
-                               &__entry->grant_reserve_cycle,
-                               &__entry->grant_reserve_bytes);
-               xlog_crack_grant_head(&log->l_grant_write_head,
-                               &__entry->grant_write_cycle,
-                               &__entry->grant_write_bytes);
-               __entry->curr_cycle = log->l_curr_cycle;
-               __entry->curr_block = log->l_curr_block;
-               __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
-       ),
-       TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
-                 "t_unit_res %u t_flags %s reserveq %s "
-                 "writeq %s grant_reserve_cycle %d "
-                 "grant_reserve_bytes %d grant_write_cycle %d "
-                 "grant_write_bytes %d curr_cycle %d curr_block %d "
-                 "tail_cycle %d tail_block %d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
-                 __entry->ocnt,
-                 __entry->cnt,
-                 __entry->curr_res,
-                 __entry->unit_res,
-                 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
-                 __entry->reserveq ? "empty" : "active",
-                 __entry->writeq ? "empty" : "active",
-                 __entry->grant_reserve_cycle,
-                 __entry->grant_reserve_bytes,
-                 __entry->grant_write_cycle,
-                 __entry->grant_write_bytes,
-                 __entry->curr_cycle,
-                 __entry->curr_block,
-                 CYCLE_LSN(__entry->tail_lsn),
-                 BLOCK_LSN(__entry->tail_lsn)
-       )
-)
-
-#define DEFINE_LOGGRANT_EVENT(name) \
-DEFINE_EVENT(xfs_loggrant_class, name, \
-       TP_PROTO(struct log *log, struct xlog_ticket *tic), \
-       TP_ARGS(log, tic))
-DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
-DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
-DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
-DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
-DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
-
-DECLARE_EVENT_CLASS(xfs_file_class,
-       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
-       TP_ARGS(ip, count, offset, flags),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_fsize_t, size)
-               __field(xfs_fsize_t, new_size)
-               __field(loff_t, offset)
-               __field(size_t, count)
-               __field(int, flags)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->size = ip->i_d.di_size;
-               __entry->new_size = ip->i_new_size;
-               __entry->offset = offset;
-               __entry->count = count;
-               __entry->flags = flags;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-                 "offset 0x%llx count 0x%zx ioflags %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->size,
-                 __entry->new_size,
-                 __entry->offset,
-                 __entry->count,
-                 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
-)
-
-#define DEFINE_RW_EVENT(name)          \
-DEFINE_EVENT(xfs_file_class, name,     \
-       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
-       TP_ARGS(ip, count, offset, flags))
-DEFINE_RW_EVENT(xfs_file_read);
-DEFINE_RW_EVENT(xfs_file_buffered_write);
-DEFINE_RW_EVENT(xfs_file_direct_write);
-DEFINE_RW_EVENT(xfs_file_splice_read);
-DEFINE_RW_EVENT(xfs_file_splice_write);
-
-DECLARE_EVENT_CLASS(xfs_page_class,
-       TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
-       TP_ARGS(inode, page, off),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(pgoff_t, pgoff)
-               __field(loff_t, size)
-               __field(unsigned long, offset)
-               __field(int, delalloc)
-               __field(int, unwritten)
-       ),
-       TP_fast_assign(
-               int delalloc = -1, unwritten = -1;
-
-               if (page_has_buffers(page))
-                       xfs_count_page_state(page, &delalloc, &unwritten);
-               __entry->dev = inode->i_sb->s_dev;
-               __entry->ino = XFS_I(inode)->i_ino;
-               __entry->pgoff = page_offset(page);
-               __entry->size = i_size_read(inode);
-               __entry->offset = off;
-               __entry->delalloc = delalloc;
-               __entry->unwritten = unwritten;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
-                 "delalloc %d unwritten %d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->pgoff,
-                 __entry->size,
-                 __entry->offset,
-                 __entry->delalloc,
-                 __entry->unwritten)
-)
-
-#define DEFINE_PAGE_EVENT(name)                \
-DEFINE_EVENT(xfs_page_class, name,     \
-       TP_PROTO(struct inode *inode, struct page *page, unsigned long off),    \
-       TP_ARGS(inode, page, off))
-DEFINE_PAGE_EVENT(xfs_writepage);
-DEFINE_PAGE_EVENT(xfs_releasepage);
-DEFINE_PAGE_EVENT(xfs_invalidatepage);
-
-DECLARE_EVENT_CLASS(xfs_imap_class,
-       TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
-                int type, struct xfs_bmbt_irec *irec),
-       TP_ARGS(ip, offset, count, type, irec),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(loff_t, size)
-               __field(loff_t, new_size)
-               __field(loff_t, offset)
-               __field(size_t, count)
-               __field(int, type)
-               __field(xfs_fileoff_t, startoff)
-               __field(xfs_fsblock_t, startblock)
-               __field(xfs_filblks_t, blockcount)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->size = ip->i_d.di_size;
-               __entry->new_size = ip->i_new_size;
-               __entry->offset = offset;
-               __entry->count = count;
-               __entry->type = type;
-               __entry->startoff = irec ? irec->br_startoff : 0;
-               __entry->startblock = irec ? irec->br_startblock : 0;
-               __entry->blockcount = irec ? irec->br_blockcount : 0;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
-                 "offset 0x%llx count %zd type %s "
-                 "startoff 0x%llx startblock %lld blockcount 0x%llx",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->size,
-                 __entry->new_size,
-                 __entry->offset,
-                 __entry->count,
-                 __print_symbolic(__entry->type, XFS_IO_TYPES),
-                 __entry->startoff,
-                 (__int64_t)__entry->startblock,
-                 __entry->blockcount)
-)
-
-#define DEFINE_IOMAP_EVENT(name)       \
-DEFINE_EVENT(xfs_imap_class, name,     \
-       TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
-                int type, struct xfs_bmbt_irec *irec),         \
-       TP_ARGS(ip, offset, count, type, irec))
-DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
-DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
-DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
-
-DECLARE_EVENT_CLASS(xfs_simple_io_class,
-       TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
-       TP_ARGS(ip, offset, count),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(loff_t, isize)
-               __field(loff_t, disize)
-               __field(loff_t, new_size)
-               __field(loff_t, offset)
-               __field(size_t, count)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->isize = ip->i_size;
-               __entry->disize = ip->i_d.di_size;
-               __entry->new_size = ip->i_new_size;
-               __entry->offset = offset;
-               __entry->count = count;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
-                 "offset 0x%llx count %zd",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->isize,
-                 __entry->disize,
-                 __entry->new_size,
-                 __entry->offset,
-                 __entry->count)
-);
-
-#define DEFINE_SIMPLE_IO_EVENT(name)   \
-DEFINE_EVENT(xfs_simple_io_class, name,        \
-       TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),        \
-       TP_ARGS(ip, offset, count))
-DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
-DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
-DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
-DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
-
-DECLARE_EVENT_CLASS(xfs_itrunc_class,
-       TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
-       TP_ARGS(ip, new_size),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_fsize_t, size)
-               __field(xfs_fsize_t, new_size)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->size = ip->i_d.di_size;
-               __entry->new_size = new_size;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->size,
-                 __entry->new_size)
-)
-
-#define DEFINE_ITRUNC_EVENT(name) \
-DEFINE_EVENT(xfs_itrunc_class, name, \
-       TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
-       TP_ARGS(ip, new_size))
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
-DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
-
-TRACE_EVENT(xfs_pagecache_inval,
-       TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
-       TP_ARGS(ip, start, finish),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_fsize_t, size)
-               __field(xfs_off_t, start)
-               __field(xfs_off_t, finish)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->size = ip->i_d.di_size;
-               __entry->start = start;
-               __entry->finish = finish;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->size,
-                 __entry->start,
-                 __entry->finish)
-);
-
-TRACE_EVENT(xfs_bunmap,
-       TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len,
-                int flags, unsigned long caller_ip),
-       TP_ARGS(ip, bno, len, flags, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(xfs_fsize_t, size)
-               __field(xfs_fileoff_t, bno)
-               __field(xfs_filblks_t, len)
-               __field(unsigned long, caller_ip)
-               __field(int, flags)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->ino = ip->i_ino;
-               __entry->size = ip->i_d.di_size;
-               __entry->bno = bno;
-               __entry->len = len;
-               __entry->caller_ip = caller_ip;
-               __entry->flags = flags;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
-                 "flags %s caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->size,
-                 __entry->bno,
-                 __entry->len,
-                 __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
-                 (void *)__entry->caller_ip)
-
-);
-
-DECLARE_EVENT_CLASS(xfs_busy_class,
-       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-                xfs_agblock_t agbno, xfs_extlen_t len),
-       TP_ARGS(mp, agno, agbno, len),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_agnumber_t, agno)
-               __field(xfs_agblock_t, agbno)
-               __field(xfs_extlen_t, len)
-       ),
-       TP_fast_assign(
-               __entry->dev = mp->m_super->s_dev;
-               __entry->agno = agno;
-               __entry->agbno = agbno;
-               __entry->len = len;
-       ),
-       TP_printk("dev %d:%d agno %u agbno %u len %u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->agno,
-                 __entry->agbno,
-                 __entry->len)
-);
-#define DEFINE_BUSY_EVENT(name) \
-DEFINE_EVENT(xfs_busy_class, name, \
-       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-                xfs_agblock_t agbno, xfs_extlen_t len), \
-       TP_ARGS(mp, agno, agbno, len))
-DEFINE_BUSY_EVENT(xfs_alloc_busy);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
-DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
-
-TRACE_EVENT(xfs_alloc_busy_trim,
-       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-                xfs_agblock_t agbno, xfs_extlen_t len,
-                xfs_agblock_t tbno, xfs_extlen_t tlen),
-       TP_ARGS(mp, agno, agbno, len, tbno, tlen),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_agnumber_t, agno)
-               __field(xfs_agblock_t, agbno)
-               __field(xfs_extlen_t, len)
-               __field(xfs_agblock_t, tbno)
-               __field(xfs_extlen_t, tlen)
-       ),
-       TP_fast_assign(
-               __entry->dev = mp->m_super->s_dev;
-               __entry->agno = agno;
-               __entry->agbno = agbno;
-               __entry->len = len;
-               __entry->tbno = tbno;
-               __entry->tlen = tlen;
-       ),
-       TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->agno,
-                 __entry->agbno,
-                 __entry->len,
-                 __entry->tbno,
-                 __entry->tlen)
-);
-
-TRACE_EVENT(xfs_trans_commit_lsn,
-       TP_PROTO(struct xfs_trans *trans),
-       TP_ARGS(trans),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(struct xfs_trans *, tp)
-               __field(xfs_lsn_t, lsn)
-       ),
-       TP_fast_assign(
-               __entry->dev = trans->t_mountp->m_super->s_dev;
-               __entry->tp = trans;
-               __entry->lsn = trans->t_commit_lsn;
-       ),
-       TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->tp,
-                 __entry->lsn)
-);
-
-TRACE_EVENT(xfs_agf,
-       TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
-                unsigned long caller_ip),
-       TP_ARGS(mp, agf, flags, caller_ip),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_agnumber_t, agno)
-               __field(int, flags)
-               __field(__u32, length)
-               __field(__u32, bno_root)
-               __field(__u32, cnt_root)
-               __field(__u32, bno_level)
-               __field(__u32, cnt_level)
-               __field(__u32, flfirst)
-               __field(__u32, fllast)
-               __field(__u32, flcount)
-               __field(__u32, freeblks)
-               __field(__u32, longest)
-               __field(unsigned long, caller_ip)
-       ),
-       TP_fast_assign(
-               __entry->dev = mp->m_super->s_dev;
-               __entry->agno = be32_to_cpu(agf->agf_seqno),
-               __entry->flags = flags;
-               __entry->length = be32_to_cpu(agf->agf_length),
-               __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
-               __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
-               __entry->bno_level =
-                               be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
-               __entry->cnt_level =
-                               be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
-               __entry->flfirst = be32_to_cpu(agf->agf_flfirst),
-               __entry->fllast = be32_to_cpu(agf->agf_fllast),
-               __entry->flcount = be32_to_cpu(agf->agf_flcount),
-               __entry->freeblks = be32_to_cpu(agf->agf_freeblks),
-               __entry->longest = be32_to_cpu(agf->agf_longest);
-               __entry->caller_ip = caller_ip;
-       ),
-       TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
-                 "levels b %u c %u flfirst %u fllast %u flcount %u "
-                 "freeblks %u longest %u caller %pf",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->agno,
-                 __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
-                 __entry->length,
-                 __entry->bno_root,
-                 __entry->cnt_root,
-                 __entry->bno_level,
-                 __entry->cnt_level,
-                 __entry->flfirst,
-                 __entry->fllast,
-                 __entry->flcount,
-                 __entry->freeblks,
-                 __entry->longest,
-                 (void *)__entry->caller_ip)
-);
-
-TRACE_EVENT(xfs_free_extent,
-       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
-                xfs_extlen_t len, bool isfl, int haveleft, int haveright),
-       TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_agnumber_t, agno)
-               __field(xfs_agblock_t, agbno)
-               __field(xfs_extlen_t, len)
-               __field(int, isfl)
-               __field(int, haveleft)
-               __field(int, haveright)
-       ),
-       TP_fast_assign(
-               __entry->dev = mp->m_super->s_dev;
-               __entry->agno = agno;
-               __entry->agbno = agbno;
-               __entry->len = len;
-               __entry->isfl = isfl;
-               __entry->haveleft = haveleft;
-               __entry->haveright = haveright;
-       ),
-       TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->agno,
-                 __entry->agbno,
-                 __entry->len,
-                 __entry->isfl,
-                 __entry->haveleft ?
-                       (__entry->haveright ? "both" : "left") :
-                       (__entry->haveright ? "right" : "none"))
-
-);
-
-DECLARE_EVENT_CLASS(xfs_alloc_class,
-       TP_PROTO(struct xfs_alloc_arg *args),
-       TP_ARGS(args),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_agnumber_t, agno)
-               __field(xfs_agblock_t, agbno)
-               __field(xfs_extlen_t, minlen)
-               __field(xfs_extlen_t, maxlen)
-               __field(xfs_extlen_t, mod)
-               __field(xfs_extlen_t, prod)
-               __field(xfs_extlen_t, minleft)
-               __field(xfs_extlen_t, total)
-               __field(xfs_extlen_t, alignment)
-               __field(xfs_extlen_t, minalignslop)
-               __field(xfs_extlen_t, len)
-               __field(short, type)
-               __field(short, otype)
-               __field(char, wasdel)
-               __field(char, wasfromfl)
-               __field(char, isfl)
-               __field(char, userdata)
-               __field(xfs_fsblock_t, firstblock)
-       ),
-       TP_fast_assign(
-               __entry->dev = args->mp->m_super->s_dev;
-               __entry->agno = args->agno;
-               __entry->agbno = args->agbno;
-               __entry->minlen = args->minlen;
-               __entry->maxlen = args->maxlen;
-               __entry->mod = args->mod;
-               __entry->prod = args->prod;
-               __entry->minleft = args->minleft;
-               __entry->total = args->total;
-               __entry->alignment = args->alignment;
-               __entry->minalignslop = args->minalignslop;
-               __entry->len = args->len;
-               __entry->type = args->type;
-               __entry->otype = args->otype;
-               __entry->wasdel = args->wasdel;
-               __entry->wasfromfl = args->wasfromfl;
-               __entry->isfl = args->isfl;
-               __entry->userdata = args->userdata;
-               __entry->firstblock = args->firstblock;
-       ),
-       TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
-                 "prod %u minleft %u total %u alignment %u minalignslop %u "
-                 "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
-                 "userdata %d firstblock 0x%llx",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->agno,
-                 __entry->agbno,
-                 __entry->minlen,
-                 __entry->maxlen,
-                 __entry->mod,
-                 __entry->prod,
-                 __entry->minleft,
-                 __entry->total,
-                 __entry->alignment,
-                 __entry->minalignslop,
-                 __entry->len,
-                 __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
-                 __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
-                 __entry->wasdel,
-                 __entry->wasfromfl,
-                 __entry->isfl,
-                 __entry->userdata,
-                 (unsigned long long)__entry->firstblock)
-)
-
-#define DEFINE_ALLOC_EVENT(name) \
-DEFINE_EVENT(xfs_alloc_class, name, \
-       TP_PROTO(struct xfs_alloc_arg *args), \
-       TP_ARGS(args))
-DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
-DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
-DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
-DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
-DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
-DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
-DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
-DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
-DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
-DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
-DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
-DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
-DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
-
-DECLARE_EVENT_CLASS(xfs_dir2_class,
-       TP_PROTO(struct xfs_da_args *args),
-       TP_ARGS(args),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __dynamic_array(char, name, args->namelen)
-               __field(int, namelen)
-               __field(xfs_dahash_t, hashval)
-               __field(xfs_ino_t, inumber)
-               __field(int, op_flags)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
-               __entry->ino = args->dp->i_ino;
-               if (args->namelen)
-                       memcpy(__get_str(name), args->name, args->namelen);
-               __entry->namelen = args->namelen;
-               __entry->hashval = args->hashval;
-               __entry->inumber = args->inumber;
-               __entry->op_flags = args->op_flags;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
-                 "inumber 0x%llx op_flags %s",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->namelen,
-                 __entry->namelen ? __get_str(name) : NULL,
-                 __entry->namelen,
-                 __entry->hashval,
-                 __entry->inumber,
-                 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
-)
-
-#define DEFINE_DIR2_EVENT(name) \
-DEFINE_EVENT(xfs_dir2_class, name, \
-       TP_PROTO(struct xfs_da_args *args), \
-       TP_ARGS(args))
-DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
-DEFINE_DIR2_EVENT(xfs_dir2_sf_create);
-DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup);
-DEFINE_DIR2_EVENT(xfs_dir2_sf_replace);
-DEFINE_DIR2_EVENT(xfs_dir2_sf_removename);
-DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4);
-DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8);
-DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block);
-DEFINE_DIR2_EVENT(xfs_dir2_block_addname);
-DEFINE_DIR2_EVENT(xfs_dir2_block_lookup);
-DEFINE_DIR2_EVENT(xfs_dir2_block_replace);
-DEFINE_DIR2_EVENT(xfs_dir2_block_removename);
-DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf);
-DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf);
-DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname);
-DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup);
-DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace);
-DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename);
-DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block);
-DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node);
-DEFINE_DIR2_EVENT(xfs_dir2_node_addname);
-DEFINE_DIR2_EVENT(xfs_dir2_node_lookup);
-DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
-DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
-DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
-
-DECLARE_EVENT_CLASS(xfs_dir2_space_class,
-       TP_PROTO(struct xfs_da_args *args, int idx),
-       TP_ARGS(args, idx),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(int, op_flags)
-               __field(int, idx)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
-               __entry->ino = args->dp->i_ino;
-               __entry->op_flags = args->op_flags;
-               __entry->idx = idx;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
-                 __entry->idx)
-)
-
-#define DEFINE_DIR2_SPACE_EVENT(name) \
-DEFINE_EVENT(xfs_dir2_space_class, name, \
-       TP_PROTO(struct xfs_da_args *args, int idx), \
-       TP_ARGS(args, idx))
-DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add);
-DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove);
-DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode);
-DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode);
-
-TRACE_EVENT(xfs_dir2_leafn_moveents,
-       TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
-       TP_ARGS(args, src_idx, dst_idx, count),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(int, op_flags)
-               __field(int, src_idx)
-               __field(int, dst_idx)
-               __field(int, count)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
-               __entry->ino = args->dp->i_ino;
-               __entry->op_flags = args->op_flags;
-               __entry->src_idx = src_idx;
-               __entry->dst_idx = dst_idx;
-               __entry->count = count;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx op_flags %s "
-                 "src_idx %d dst_idx %d count %d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
-                 __entry->src_idx,
-                 __entry->dst_idx,
-                 __entry->count)
-);
-
-#define XFS_SWAPEXT_INODES \
-       { 0,    "target" }, \
-       { 1,    "temp" }
-
-#define XFS_INODE_FORMAT_STR \
-       { 0,    "invalid" }, \
-       { 1,    "local" }, \
-       { 2,    "extent" }, \
-       { 3,    "btree" }
-
-DECLARE_EVENT_CLASS(xfs_swap_extent_class,
-       TP_PROTO(struct xfs_inode *ip, int which),
-       TP_ARGS(ip, which),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(int, which)
-               __field(xfs_ino_t, ino)
-               __field(int, format)
-               __field(int, nex)
-               __field(int, max_nex)
-               __field(int, broot_size)
-               __field(int, fork_off)
-       ),
-       TP_fast_assign(
-               __entry->dev = VFS_I(ip)->i_sb->s_dev;
-               __entry->which = which;
-               __entry->ino = ip->i_ino;
-               __entry->format = ip->i_d.di_format;
-               __entry->nex = ip->i_d.di_nextents;
-               __entry->max_nex = ip->i_df.if_ext_max;
-               __entry->broot_size = ip->i_df.if_broot_bytes;
-               __entry->fork_off = XFS_IFORK_BOFF(ip);
-       ),
-       TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
-                 "Max in-fork extents %d, broot size %d, fork offset %d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
-                 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
-                 __entry->nex,
-                 __entry->max_nex,
-                 __entry->broot_size,
-                 __entry->fork_off)
-)
-
-#define DEFINE_SWAPEXT_EVENT(name) \
-DEFINE_EVENT(xfs_swap_extent_class, name, \
-       TP_PROTO(struct xfs_inode *ip, int which), \
-       TP_ARGS(ip, which))
-
-DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
-DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
-
-DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
-       TP_PROTO(struct log *log, struct xlog_recover *trans,
-               struct xlog_recover_item *item, int pass),
-       TP_ARGS(log, trans, item, pass),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(unsigned long, item)
-               __field(xlog_tid_t, tid)
-               __field(int, type)
-               __field(int, pass)
-               __field(int, count)
-               __field(int, total)
-       ),
-       TP_fast_assign(
-               __entry->dev = log->l_mp->m_super->s_dev;
-               __entry->item = (unsigned long)item;
-               __entry->tid = trans->r_log_tid;
-               __entry->type = ITEM_TYPE(item);
-               __entry->pass = pass;
-               __entry->count = item->ri_cnt;
-               __entry->total = item->ri_total;
-       ),
-       TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
-                 "item region count/total %d/%d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->tid,
-                 __entry->pass,
-                 (void *)__entry->item,
-                 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
-                 __entry->count,
-                 __entry->total)
-)
-
-#define DEFINE_LOG_RECOVER_ITEM(name) \
-DEFINE_EVENT(xfs_log_recover_item_class, name, \
-       TP_PROTO(struct log *log, struct xlog_recover *trans, \
-               struct xlog_recover_item *item, int pass), \
-       TP_ARGS(log, trans, item, pass))
-
-DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
-DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
-DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
-DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
-DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
-
-DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
-       TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
-       TP_ARGS(log, buf_f),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(__int64_t, blkno)
-               __field(unsigned short, len)
-               __field(unsigned short, flags)
-               __field(unsigned short, size)
-               __field(unsigned int, map_size)
-       ),
-       TP_fast_assign(
-               __entry->dev = log->l_mp->m_super->s_dev;
-               __entry->blkno = buf_f->blf_blkno;
-               __entry->len = buf_f->blf_len;
-               __entry->flags = buf_f->blf_flags;
-               __entry->size = buf_f->blf_size;
-               __entry->map_size = buf_f->blf_map_size;
-       ),
-       TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
-                       "map_size %d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->blkno,
-                 __entry->len,
-                 __entry->flags,
-                 __entry->size,
-                 __entry->map_size)
-)
-
-#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
-DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
-       TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
-       TP_ARGS(log, buf_f))
-
-DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
-DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
-DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
-DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
-DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
-DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
-DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
-DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
-
-DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
-       TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
-       TP_ARGS(log, in_f),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_ino_t, ino)
-               __field(unsigned short, size)
-               __field(int, fields)
-               __field(unsigned short, asize)
-               __field(unsigned short, dsize)
-               __field(__int64_t, blkno)
-               __field(int, len)
-               __field(int, boffset)
-       ),
-       TP_fast_assign(
-               __entry->dev = log->l_mp->m_super->s_dev;
-               __entry->ino = in_f->ilf_ino;
-               __entry->size = in_f->ilf_size;
-               __entry->fields = in_f->ilf_fields;
-               __entry->asize = in_f->ilf_asize;
-               __entry->dsize = in_f->ilf_dsize;
-               __entry->blkno = in_f->ilf_blkno;
-               __entry->len = in_f->ilf_len;
-               __entry->boffset = in_f->ilf_boffset;
-       ),
-       TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
-                       "dsize %d, blkno 0x%llx, len %d, boffset %d",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->ino,
-                 __entry->size,
-                 __entry->fields,
-                 __entry->asize,
-                 __entry->dsize,
-                 __entry->blkno,
-                 __entry->len,
-                 __entry->boffset)
-)
-#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
-DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
-       TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
-       TP_ARGS(log, in_f))
-
-DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
-DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
-DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
-
-DECLARE_EVENT_CLASS(xfs_discard_class,
-       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-                xfs_agblock_t agbno, xfs_extlen_t len),
-       TP_ARGS(mp, agno, agbno, len),
-       TP_STRUCT__entry(
-               __field(dev_t, dev)
-               __field(xfs_agnumber_t, agno)
-               __field(xfs_agblock_t, agbno)
-               __field(xfs_extlen_t, len)
-       ),
-       TP_fast_assign(
-               __entry->dev = mp->m_super->s_dev;
-               __entry->agno = agno;
-               __entry->agbno = agbno;
-               __entry->len = len;
-       ),
-       TP_printk("dev %d:%d agno %u agbno %u len %u\n",
-                 MAJOR(__entry->dev), MINOR(__entry->dev),
-                 __entry->agno,
-                 __entry->agbno,
-                 __entry->len)
-)
-
-#define DEFINE_DISCARD_EVENT(name) \
-DEFINE_EVENT(xfs_discard_class, name, \
-       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-                xfs_agblock_t agbno, xfs_extlen_t len), \
-       TP_ARGS(mp, agno, agbno, len))
-DEFINE_DISCARD_EVENT(xfs_discard_extent);
-DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
-DEFINE_DISCARD_EVENT(xfs_discard_exclude);
-DEFINE_DISCARD_EVENT(xfs_discard_busy);
-
-#endif /* _TRACE_XFS_H */
-
-#undef TRACE_INCLUDE_PATH
-#define TRACE_INCLUDE_PATH .
-#define TRACE_INCLUDE_FILE xfs_trace
-#include <trace/define_trace.h>
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
deleted file mode 100644 (file)
index 7c220b4..0000000
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_VNODE_H__
-#define __XFS_VNODE_H__
-
-#include "xfs_fs.h"
-
-struct file;
-struct xfs_inode;
-struct xfs_iomap;
-struct attrlist_cursor_kern;
-
-/*
- * Return values for xfs_inactive.  A return value of
- * VN_INACTIVE_NOCACHE implies that the file system behavior
- * has disassociated its state and bhv_desc_t from the vnode.
- */
-#define        VN_INACTIVE_CACHE       0
-#define        VN_INACTIVE_NOCACHE     1
-
-/*
- * Flags for read/write calls - same values as IRIX
- */
-#define IO_ISDIRECT    0x00004         /* bypass page cache */
-#define IO_INVIS       0x00020         /* don't update inode timestamps */
-
-#define XFS_IO_FLAGS \
-       { IO_ISDIRECT,  "DIRECT" }, \
-       { IO_INVIS,     "INVIS"}
-
-/*
- * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
- */
-#define FI_NONE                        0       /* none */
-#define FI_REMAPF              1       /* Do a remapf prior to the operation */
-#define FI_REMAPF_LOCKED       2       /* Do a remapf prior to the operation.
-                                          Prevent VM access to the pages until
-                                          the operation completes. */
-
-/*
- * Some useful predicates.
- */
-#define VN_MAPPED(vp)  mapping_mapped(vp->i_mapping)
-#define VN_CACHED(vp)  (vp->i_mapping->nrpages)
-#define VN_DIRTY(vp)   mapping_tagged(vp->i_mapping, \
-                                       PAGECACHE_TAG_DIRTY)
-
-
-#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_xattr.c b/fs/xfs/linux-2.6/xfs_xattr.c
deleted file mode 100644 (file)
index 87d3e03..0000000
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (C) 2008 Christoph Hellwig.
- * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include "xfs.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_attr.h"
-#include "xfs_attr_leaf.h"
-#include "xfs_acl.h"
-#include "xfs_vnodeops.h"
-
-#include <linux/posix_acl_xattr.h>
-#include <linux/xattr.h>
-
-
-static int
-xfs_xattr_get(struct dentry *dentry, const char *name,
-               void *value, size_t size, int xflags)
-{
-       struct xfs_inode *ip = XFS_I(dentry->d_inode);
-       int error, asize = size;
-
-       if (strcmp(name, "") == 0)
-               return -EINVAL;
-
-       /* Convert Linux syscall to XFS internal ATTR flags */
-       if (!size) {
-               xflags |= ATTR_KERNOVAL;
-               value = NULL;
-       }
-
-       error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
-       if (error)
-               return error;
-       return asize;
-}
-
-static int
-xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
-               size_t size, int flags, int xflags)
-{
-       struct xfs_inode *ip = XFS_I(dentry->d_inode);
-
-       if (strcmp(name, "") == 0)
-               return -EINVAL;
-
-       /* Convert Linux syscall to XFS internal ATTR flags */
-       if (flags & XATTR_CREATE)
-               xflags |= ATTR_CREATE;
-       if (flags & XATTR_REPLACE)
-               xflags |= ATTR_REPLACE;
-
-       if (!value)
-               return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
-       return -xfs_attr_set(ip, (unsigned char *)name,
-                               (void *)value, size, xflags);
-}
-
-static const struct xattr_handler xfs_xattr_user_handler = {
-       .prefix = XATTR_USER_PREFIX,
-       .flags  = 0, /* no flags implies user namespace */
-       .get    = xfs_xattr_get,
-       .set    = xfs_xattr_set,
-};
-
-static const struct xattr_handler xfs_xattr_trusted_handler = {
-       .prefix = XATTR_TRUSTED_PREFIX,
-       .flags  = ATTR_ROOT,
-       .get    = xfs_xattr_get,
-       .set    = xfs_xattr_set,
-};
-
-static const struct xattr_handler xfs_xattr_security_handler = {
-       .prefix = XATTR_SECURITY_PREFIX,
-       .flags  = ATTR_SECURE,
-       .get    = xfs_xattr_get,
-       .set    = xfs_xattr_set,
-};
-
-const struct xattr_handler *xfs_xattr_handlers[] = {
-       &xfs_xattr_user_handler,
-       &xfs_xattr_trusted_handler,
-       &xfs_xattr_security_handler,
-#ifdef CONFIG_XFS_POSIX_ACL
-       &xfs_xattr_acl_access_handler,
-       &xfs_xattr_acl_default_handler,
-#endif
-       NULL
-};
-
-static unsigned int xfs_xattr_prefix_len(int flags)
-{
-       if (flags & XFS_ATTR_SECURE)
-               return sizeof("security");
-       else if (flags & XFS_ATTR_ROOT)
-               return sizeof("trusted");
-       else
-               return sizeof("user");
-}
-
-static const char *xfs_xattr_prefix(int flags)
-{
-       if (flags & XFS_ATTR_SECURE)
-               return xfs_xattr_security_handler.prefix;
-       else if (flags & XFS_ATTR_ROOT)
-               return xfs_xattr_trusted_handler.prefix;
-       else
-               return xfs_xattr_user_handler.prefix;
-}
-
-static int
-xfs_xattr_put_listent(
-       struct xfs_attr_list_context *context,
-       int             flags,
-       unsigned char   *name,
-       int             namelen,
-       int             valuelen,
-       unsigned char   *value)
-{
-       unsigned int prefix_len = xfs_xattr_prefix_len(flags);
-       char *offset;
-       int arraytop;
-
-       ASSERT(context->count >= 0);
-
-       /*
-        * Only show root namespace entries if we are actually allowed to
-        * see them.
-        */
-       if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
-               return 0;
-
-       arraytop = context->count + prefix_len + namelen + 1;
-       if (arraytop > context->firstu) {
-               context->count = -1;    /* insufficient space */
-               return 1;
-       }
-       offset = (char *)context->alist + context->count;
-       strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
-       offset += prefix_len;
-       strncpy(offset, (char *)name, namelen);                 /* real name */
-       offset += namelen;
-       *offset = '\0';
-       context->count += prefix_len + namelen + 1;
-       return 0;
-}
-
-static int
-xfs_xattr_put_listent_sizes(
-       struct xfs_attr_list_context *context,
-       int             flags,
-       unsigned char   *name,
-       int             namelen,
-       int             valuelen,
-       unsigned char   *value)
-{
-       context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
-       return 0;
-}
-
-static int
-list_one_attr(const char *name, const size_t len, void *data,
-               size_t size, ssize_t *result)
-{
-       char *p = data + *result;
-
-       *result += len;
-       if (!size)
-               return 0;
-       if (*result > size)
-               return -ERANGE;
-
-       strcpy(p, name);
-       return 0;
-}
-
-ssize_t
-xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
-{
-       struct xfs_attr_list_context context;
-       struct attrlist_cursor_kern cursor = { 0 };
-       struct inode            *inode = dentry->d_inode;
-       int                     error;
-
-       /*
-        * First read the regular on-disk attributes.
-        */
-       memset(&context, 0, sizeof(context));
-       context.dp = XFS_I(inode);
-       context.cursor = &cursor;
-       context.resynch = 1;
-       context.alist = data;
-       context.bufsize = size;
-       context.firstu = context.bufsize;
-
-       if (size)
-               context.put_listent = xfs_xattr_put_listent;
-       else
-               context.put_listent = xfs_xattr_put_listent_sizes;
-
-       xfs_attr_list_int(&context);
-       if (context.count < 0)
-               return -ERANGE;
-
-       /*
-        * Then add the two synthetic ACL attributes.
-        */
-       if (posix_acl_access_exists(inode)) {
-               error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
-                               strlen(POSIX_ACL_XATTR_ACCESS) + 1,
-                               data, size, &context.count);
-               if (error)
-                       return error;
-       }
-
-       if (posix_acl_default_exists(inode)) {
-               error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
-                               strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
-                               data, size, &context.count);
-               if (error)
-                       return error;
-       }
-
-       return context.count;
-}
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
new file mode 100644 (file)
index 0000000..ff6a198
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPPORT_MRLOCK_H__
+#define __XFS_SUPPORT_MRLOCK_H__
+
+#include <linux/rwsem.h>
+
+typedef struct {
+       struct rw_semaphore     mr_lock;
+#ifdef DEBUG
+       int                     mr_writer;
+#endif
+} mrlock_t;
+
+#ifdef DEBUG
+#define mrinit(mrp, name)      \
+       do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
+#else
+#define mrinit(mrp, name)      \
+       do { init_rwsem(&(mrp)->mr_lock); } while (0)
+#endif
+
+#define mrlock_init(mrp, t,n,s)        mrinit(mrp, n)
+#define mrfree(mrp)            do { } while (0)
+
+static inline void mraccess_nested(mrlock_t *mrp, int subclass)
+{
+       down_read_nested(&mrp->mr_lock, subclass);
+}
+
+static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
+{
+       down_write_nested(&mrp->mr_lock, subclass);
+#ifdef DEBUG
+       mrp->mr_writer = 1;
+#endif
+}
+
+static inline int mrtryaccess(mrlock_t *mrp)
+{
+       return down_read_trylock(&mrp->mr_lock);
+}
+
+static inline int mrtryupdate(mrlock_t *mrp)
+{
+       if (!down_write_trylock(&mrp->mr_lock))
+               return 0;
+#ifdef DEBUG
+       mrp->mr_writer = 1;
+#endif
+       return 1;
+}
+
+static inline void mrunlock_excl(mrlock_t *mrp)
+{
+#ifdef DEBUG
+       mrp->mr_writer = 0;
+#endif
+       up_write(&mrp->mr_lock);
+}
+
+static inline void mrunlock_shared(mrlock_t *mrp)
+{
+       up_read(&mrp->mr_lock);
+}
+
+static inline void mrdemote(mrlock_t *mrp)
+{
+#ifdef DEBUG
+       mrp->mr_writer = 0;
+#endif
+       downgrade_write(&mrp->mr_lock);
+}
+
+#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
deleted file mode 100644 (file)
index db62959..0000000
+++ /dev/null
@@ -1,1454 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_space.h"
-#include "xfs_trans_priv.h"
-#include "xfs_qm.h"
-#include "xfs_trace.h"
-
-
-/*
-   LOCK ORDER
-
-   inode lock              (ilock)
-   dquot hash-chain lock    (hashlock)
-   xqm dquot freelist lock  (freelistlock
-   mount's dquot list lock  (mplistlock)
-   user dquot lock - lock ordering among dquots is based on the uid or gid
-   group dquot lock - similar to udquots. Between the two dquots, the udquot
-                     has to be locked first.
-   pin lock - the dquot lock must be held to take this lock.
-   flush lock - ditto.
-*/
-
-#ifdef DEBUG
-xfs_buftarg_t *xfs_dqerror_target;
-int xfs_do_dqerror;
-int xfs_dqreq_num;
-int xfs_dqerror_mod = 33;
-#endif
-
-static struct lock_class_key xfs_dquot_other_class;
-
-/*
- * Allocate and initialize a dquot. We don't always allocate fresh memory;
- * we try to reclaim a free dquot if the number of incore dquots are above
- * a threshold.
- * The only field inside the core that gets initialized at this point
- * is the d_id field. The idea is to fill in the entire q_core
- * when we read in the on disk dquot.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqinit(
-       xfs_mount_t  *mp,
-       xfs_dqid_t   id,
-       uint         type)
-{
-       xfs_dquot_t     *dqp;
-       boolean_t       brandnewdquot;
-
-       brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
-       dqp->dq_flags = type;
-       dqp->q_core.d_id = cpu_to_be32(id);
-       dqp->q_mount = mp;
-
-       /*
-        * No need to re-initialize these if this is a reclaimed dquot.
-        */
-       if (brandnewdquot) {
-               INIT_LIST_HEAD(&dqp->q_freelist);
-               mutex_init(&dqp->q_qlock);
-               init_waitqueue_head(&dqp->q_pinwait);
-
-               /*
-                * Because we want to use a counting completion, complete
-                * the flush completion once to allow a single access to
-                * the flush completion without blocking.
-                */
-               init_completion(&dqp->q_flush);
-               complete(&dqp->q_flush);
-
-               trace_xfs_dqinit(dqp);
-       } else {
-               /*
-                * Only the q_core portion was zeroed in dqreclaim_one().
-                * So, we need to reset others.
-                */
-               dqp->q_nrefs = 0;
-               dqp->q_blkno = 0;
-               INIT_LIST_HEAD(&dqp->q_mplist);
-               INIT_LIST_HEAD(&dqp->q_hashlist);
-               dqp->q_bufoffset = 0;
-               dqp->q_fileoffset = 0;
-               dqp->q_transp = NULL;
-               dqp->q_gdquot = NULL;
-               dqp->q_res_bcount = 0;
-               dqp->q_res_icount = 0;
-               dqp->q_res_rtbcount = 0;
-               atomic_set(&dqp->q_pincount, 0);
-               dqp->q_hash = NULL;
-               ASSERT(list_empty(&dqp->q_freelist));
-
-               trace_xfs_dqreuse(dqp);
-       }
-
-       /*
-        * In either case we need to make sure group quotas have a different
-        * lock class than user quotas, to make sure lockdep knows we can
-        * locks of one of each at the same time.
-        */
-       if (!(type & XFS_DQ_USER))
-               lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
-
-       /*
-        * log item gets initialized later
-        */
-       return (dqp);
-}
-
-/*
- * This is called to free all the memory associated with a dquot
- */
-void
-xfs_qm_dqdestroy(
-       xfs_dquot_t     *dqp)
-{
-       ASSERT(list_empty(&dqp->q_freelist));
-
-       mutex_destroy(&dqp->q_qlock);
-       kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
-
-       atomic_dec(&xfs_Gqm->qm_totaldquots);
-}
-
-/*
- * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
- */
-STATIC void
-xfs_qm_dqinit_core(
-       xfs_dqid_t      id,
-       uint            type,
-       xfs_dqblk_t     *d)
-{
-       /*
-        * Caller has zero'd the entire dquot 'chunk' already.
-        */
-       d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
-       d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
-       d->dd_diskdq.d_id = cpu_to_be32(id);
-       d->dd_diskdq.d_flags = type;
-}
-
-/*
- * If default limits are in force, push them into the dquot now.
- * We overwrite the dquot limits only if they are zero and this
- * is not the root dquot.
- */
-void
-xfs_qm_adjust_dqlimits(
-       xfs_mount_t             *mp,
-       xfs_disk_dquot_t        *d)
-{
-       xfs_quotainfo_t         *q = mp->m_quotainfo;
-
-       ASSERT(d->d_id);
-
-       if (q->qi_bsoftlimit && !d->d_blk_softlimit)
-               d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
-       if (q->qi_bhardlimit && !d->d_blk_hardlimit)
-               d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
-       if (q->qi_isoftlimit && !d->d_ino_softlimit)
-               d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
-       if (q->qi_ihardlimit && !d->d_ino_hardlimit)
-               d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
-       if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
-               d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
-       if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
-               d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
-}
-
-/*
- * Check the limits and timers of a dquot and start or reset timers
- * if necessary.
- * This gets called even when quota enforcement is OFF, which makes our
- * life a little less complicated. (We just don't reject any quota
- * reservations in that case, when enforcement is off).
- * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
- * enforcement's off.
- * In contrast, warnings are a little different in that they don't
- * 'automatically' get started when limits get exceeded.  They do
- * get reset to zero, however, when we find the count to be under
- * the soft limit (they are only ever set non-zero via userspace).
- */
-void
-xfs_qm_adjust_dqtimers(
-       xfs_mount_t             *mp,
-       xfs_disk_dquot_t        *d)
-{
-       ASSERT(d->d_id);
-
-#ifdef DEBUG
-       if (d->d_blk_hardlimit)
-               ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
-                      be64_to_cpu(d->d_blk_hardlimit));
-       if (d->d_ino_hardlimit)
-               ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
-                      be64_to_cpu(d->d_ino_hardlimit));
-       if (d->d_rtb_hardlimit)
-               ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
-                      be64_to_cpu(d->d_rtb_hardlimit));
-#endif
-
-       if (!d->d_btimer) {
-               if ((d->d_blk_softlimit &&
-                    (be64_to_cpu(d->d_bcount) >=
-                     be64_to_cpu(d->d_blk_softlimit))) ||
-                   (d->d_blk_hardlimit &&
-                    (be64_to_cpu(d->d_bcount) >=
-                     be64_to_cpu(d->d_blk_hardlimit)))) {
-                       d->d_btimer = cpu_to_be32(get_seconds() +
-                                       mp->m_quotainfo->qi_btimelimit);
-               } else {
-                       d->d_bwarns = 0;
-               }
-       } else {
-               if ((!d->d_blk_softlimit ||
-                    (be64_to_cpu(d->d_bcount) <
-                     be64_to_cpu(d->d_blk_softlimit))) &&
-                   (!d->d_blk_hardlimit ||
-                   (be64_to_cpu(d->d_bcount) <
-                    be64_to_cpu(d->d_blk_hardlimit)))) {
-                       d->d_btimer = 0;
-               }
-       }
-
-       if (!d->d_itimer) {
-               if ((d->d_ino_softlimit &&
-                    (be64_to_cpu(d->d_icount) >=
-                     be64_to_cpu(d->d_ino_softlimit))) ||
-                   (d->d_ino_hardlimit &&
-                    (be64_to_cpu(d->d_icount) >=
-                     be64_to_cpu(d->d_ino_hardlimit)))) {
-                       d->d_itimer = cpu_to_be32(get_seconds() +
-                                       mp->m_quotainfo->qi_itimelimit);
-               } else {
-                       d->d_iwarns = 0;
-               }
-       } else {
-               if ((!d->d_ino_softlimit ||
-                    (be64_to_cpu(d->d_icount) <
-                     be64_to_cpu(d->d_ino_softlimit)))  &&
-                   (!d->d_ino_hardlimit ||
-                    (be64_to_cpu(d->d_icount) <
-                     be64_to_cpu(d->d_ino_hardlimit)))) {
-                       d->d_itimer = 0;
-               }
-       }
-
-       if (!d->d_rtbtimer) {
-               if ((d->d_rtb_softlimit &&
-                    (be64_to_cpu(d->d_rtbcount) >=
-                     be64_to_cpu(d->d_rtb_softlimit))) ||
-                   (d->d_rtb_hardlimit &&
-                    (be64_to_cpu(d->d_rtbcount) >=
-                     be64_to_cpu(d->d_rtb_hardlimit)))) {
-                       d->d_rtbtimer = cpu_to_be32(get_seconds() +
-                                       mp->m_quotainfo->qi_rtbtimelimit);
-               } else {
-                       d->d_rtbwarns = 0;
-               }
-       } else {
-               if ((!d->d_rtb_softlimit ||
-                    (be64_to_cpu(d->d_rtbcount) <
-                     be64_to_cpu(d->d_rtb_softlimit))) &&
-                   (!d->d_rtb_hardlimit ||
-                    (be64_to_cpu(d->d_rtbcount) <
-                     be64_to_cpu(d->d_rtb_hardlimit)))) {
-                       d->d_rtbtimer = 0;
-               }
-       }
-}
-
-/*
- * initialize a buffer full of dquots and log the whole thing
- */
-STATIC void
-xfs_qm_init_dquot_blk(
-       xfs_trans_t     *tp,
-       xfs_mount_t     *mp,
-       xfs_dqid_t      id,
-       uint            type,
-       xfs_buf_t       *bp)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       xfs_dqblk_t     *d;
-       int             curid, i;
-
-       ASSERT(tp);
-       ASSERT(xfs_buf_islocked(bp));
-
-       d = bp->b_addr;
-
-       /*
-        * ID of the first dquot in the block - id's are zero based.
-        */
-       curid = id - (id % q->qi_dqperchunk);
-       ASSERT(curid >= 0);
-       memset(d, 0, BBTOB(q->qi_dqchunklen));
-       for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
-               xfs_qm_dqinit_core(curid, type, d);
-       xfs_trans_dquot_buf(tp, bp,
-                           (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
-                           ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
-                            XFS_BLF_GDQUOT_BUF)));
-       xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
-}
-
-
-
-/*
- * Allocate a block and fill it with dquots.
- * This is called when the bmapi finds a hole.
- */
-STATIC int
-xfs_qm_dqalloc(
-       xfs_trans_t     **tpp,
-       xfs_mount_t     *mp,
-       xfs_dquot_t     *dqp,
-       xfs_inode_t     *quotip,
-       xfs_fileoff_t   offset_fsb,
-       xfs_buf_t       **O_bpp)
-{
-       xfs_fsblock_t   firstblock;
-       xfs_bmap_free_t flist;
-       xfs_bmbt_irec_t map;
-       int             nmaps, error, committed;
-       xfs_buf_t       *bp;
-       xfs_trans_t     *tp = *tpp;
-
-       ASSERT(tp != NULL);
-
-       trace_xfs_dqalloc(dqp);
-
-       /*
-        * Initialize the bmap freelist prior to calling bmapi code.
-        */
-       xfs_bmap_init(&flist, &firstblock);
-       xfs_ilock(quotip, XFS_ILOCK_EXCL);
-       /*
-        * Return if this type of quotas is turned off while we didn't
-        * have an inode lock
-        */
-       if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
-               xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-               return (ESRCH);
-       }
-
-       xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
-       nmaps = 1;
-       if ((error = xfs_bmapi(tp, quotip,
-                             offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
-                             XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
-                             &firstblock,
-                             XFS_QM_DQALLOC_SPACE_RES(mp),
-                             &map, &nmaps, &flist))) {
-               goto error0;
-       }
-       ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
-       ASSERT(nmaps == 1);
-       ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
-              (map.br_startblock != HOLESTARTBLOCK));
-
-       /*
-        * Keep track of the blkno to save a lookup later
-        */
-       dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-
-       /* now we can just get the buffer (there's nothing to read yet) */
-       bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
-                              dqp->q_blkno,
-                              mp->m_quotainfo->qi_dqchunklen,
-                              0);
-       if (!bp || (error = xfs_buf_geterror(bp)))
-               goto error1;
-       /*
-        * Make a chunk of dquots out of this buffer and log
-        * the entire thing.
-        */
-       xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
-                             dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
-
-       /*
-        * xfs_bmap_finish() may commit the current transaction and
-        * start a second transaction if the freelist is not empty.
-        *
-        * Since we still want to modify this buffer, we need to
-        * ensure that the buffer is not released on commit of
-        * the first transaction and ensure the buffer is added to the
-        * second transaction.
-        *
-        * If there is only one transaction then don't stop the buffer
-        * from being released when it commits later on.
-        */
-
-       xfs_trans_bhold(tp, bp);
-
-       if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
-               goto error1;
-       }
-
-       if (committed) {
-               tp = *tpp;
-               xfs_trans_bjoin(tp, bp);
-       } else {
-               xfs_trans_bhold_release(tp, bp);
-       }
-
-       *O_bpp = bp;
-       return 0;
-
-      error1:
-       xfs_bmap_cancel(&flist);
-      error0:
-       xfs_iunlock(quotip, XFS_ILOCK_EXCL);
-
-       return (error);
-}
-
-/*
- * Maps a dquot to the buffer containing its on-disk version.
- * This returns a ptr to the buffer containing the on-disk dquot
- * in the bpp param, and a ptr to the on-disk dquot within that buffer
- */
-STATIC int
-xfs_qm_dqtobp(
-       xfs_trans_t             **tpp,
-       xfs_dquot_t             *dqp,
-       xfs_disk_dquot_t        **O_ddpp,
-       xfs_buf_t               **O_bpp,
-       uint                    flags)
-{
-       xfs_bmbt_irec_t map;
-       int             nmaps = 1, error;
-       xfs_buf_t       *bp;
-       xfs_inode_t     *quotip = XFS_DQ_TO_QIP(dqp);
-       xfs_mount_t     *mp = dqp->q_mount;
-       xfs_disk_dquot_t *ddq;
-       xfs_dqid_t      id = be32_to_cpu(dqp->q_core.d_id);
-       xfs_trans_t     *tp = (tpp ? *tpp : NULL);
-
-       dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
-
-       xfs_ilock(quotip, XFS_ILOCK_SHARED);
-       if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
-               /*
-                * Return if this type of quotas is turned off while we
-                * didn't have the quota inode lock.
-                */
-               xfs_iunlock(quotip, XFS_ILOCK_SHARED);
-               return ESRCH;
-       }
-
-       /*
-        * Find the block map; no allocations yet
-        */
-       error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
-                         XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
-                         NULL, 0, &map, &nmaps, NULL);
-
-       xfs_iunlock(quotip, XFS_ILOCK_SHARED);
-       if (error)
-               return error;
-
-       ASSERT(nmaps == 1);
-       ASSERT(map.br_blockcount == 1);
-
-       /*
-        * Offset of dquot in the (fixed sized) dquot chunk.
-        */
-       dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
-               sizeof(xfs_dqblk_t);
-
-       ASSERT(map.br_startblock != DELAYSTARTBLOCK);
-       if (map.br_startblock == HOLESTARTBLOCK) {
-               /*
-                * We don't allocate unless we're asked to
-                */
-               if (!(flags & XFS_QMOPT_DQALLOC))
-                       return ENOENT;
-
-               ASSERT(tp);
-               error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
-                                       dqp->q_fileoffset, &bp);
-               if (error)
-                       return error;
-               tp = *tpp;
-       } else {
-               trace_xfs_dqtobp_read(dqp);
-
-               /*
-                * store the blkno etc so that we don't have to do the
-                * mapping all the time
-                */
-               dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
-
-               error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-                                          dqp->q_blkno,
-                                          mp->m_quotainfo->qi_dqchunklen,
-                                          0, &bp);
-               if (error || !bp)
-                       return XFS_ERROR(error);
-       }
-
-       ASSERT(xfs_buf_islocked(bp));
-
-       /*
-        * calculate the location of the dquot inside the buffer.
-        */
-       ddq = bp->b_addr + dqp->q_bufoffset;
-
-       /*
-        * A simple sanity check in case we got a corrupted dquot...
-        */
-       error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
-                          flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
-                          "dqtobp");
-       if (error) {
-               if (!(flags & XFS_QMOPT_DQREPAIR)) {
-                       xfs_trans_brelse(tp, bp);
-                       return XFS_ERROR(EIO);
-               }
-       }
-
-       *O_bpp = bp;
-       *O_ddpp = ddq;
-
-       return (0);
-}
-
-
-/*
- * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
- * and release the buffer immediately.
- *
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_dqread(
-       xfs_trans_t     **tpp,
-       xfs_dqid_t      id,
-       xfs_dquot_t     *dqp,   /* dquot to get filled in */
-       uint            flags)
-{
-       xfs_disk_dquot_t *ddqp;
-       xfs_buf_t        *bp;
-       int              error;
-       xfs_trans_t      *tp;
-
-       ASSERT(tpp);
-
-       trace_xfs_dqread(dqp);
-
-       /*
-        * get a pointer to the on-disk dquot and the buffer containing it
-        * dqp already knows its own type (GROUP/USER).
-        */
-       if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
-               return (error);
-       }
-       tp = *tpp;
-
-       /* copy everything from disk dquot to the incore dquot */
-       memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
-       ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
-       xfs_qm_dquot_logitem_init(dqp);
-
-       /*
-        * Reservation counters are defined as reservation plus current usage
-        * to avoid having to add every time.
-        */
-       dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
-       dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
-       dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
-
-       /* Mark the buf so that this will stay incore a little longer */
-       XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
-
-       /*
-        * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
-        * So we need to release with xfs_trans_brelse().
-        * The strategy here is identical to that of inodes; we lock
-        * the dquot in xfs_qm_dqget() before making it accessible to
-        * others. This is because dquots, like inodes, need a good level of
-        * concurrency, and we don't want to take locks on the entire buffers
-        * for dquot accesses.
-        * Note also that the dquot buffer may even be dirty at this point, if
-        * this particular dquot was repaired. We still aren't afraid to
-        * brelse it because we have the changes incore.
-        */
-       ASSERT(xfs_buf_islocked(bp));
-       xfs_trans_brelse(tp, bp);
-
-       return (error);
-}
-
-
-/*
- * allocate an incore dquot from the kernel heap,
- * and fill its core with quota information kept on disk.
- * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
- * if it wasn't already allocated.
- */
-STATIC int
-xfs_qm_idtodq(
-       xfs_mount_t     *mp,
-       xfs_dqid_t      id,      /* gid or uid, depending on type */
-       uint            type,    /* UDQUOT or GDQUOT */
-       uint            flags,   /* DQALLOC, DQREPAIR */
-       xfs_dquot_t     **O_dqpp)/* OUT : incore dquot, not locked */
-{
-       xfs_dquot_t     *dqp;
-       int             error;
-       xfs_trans_t     *tp;
-       int             cancelflags=0;
-
-       dqp = xfs_qm_dqinit(mp, id, type);
-       tp = NULL;
-       if (flags & XFS_QMOPT_DQALLOC) {
-               tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
-               error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
-                               XFS_WRITE_LOG_RES(mp) +
-                               BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
-                               128,
-                               0,
-                               XFS_TRANS_PERM_LOG_RES,
-                               XFS_WRITE_LOG_COUNT);
-               if (error) {
-                       cancelflags = 0;
-                       goto error0;
-               }
-               cancelflags = XFS_TRANS_RELEASE_LOG_RES;
-       }
-
-       /*
-        * Read it from disk; xfs_dqread() takes care of
-        * all the necessary initialization of dquot's fields (locks, etc)
-        */
-       if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) {
-               /*
-                * This can happen if quotas got turned off (ESRCH),
-                * or if the dquot didn't exist on disk and we ask to
-                * allocate (ENOENT).
-                */
-               trace_xfs_dqread_fail(dqp);
-               cancelflags |= XFS_TRANS_ABORT;
-               goto error0;
-       }
-       if (tp) {
-               if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES)))
-                       goto error1;
-       }
-
-       *O_dqpp = dqp;
-       return (0);
-
- error0:
-       ASSERT(error);
-       if (tp)
-               xfs_trans_cancel(tp, cancelflags);
- error1:
-       xfs_qm_dqdestroy(dqp);
-       *O_dqpp = NULL;
-       return (error);
-}
-
-/*
- * Lookup a dquot in the incore dquot hashtable. We keep two separate
- * hashtables for user and group dquots; and, these are global tables
- * inside the XQM, not per-filesystem tables.
- * The hash chain must be locked by caller, and it is left locked
- * on return. Returning dquot is locked.
- */
-STATIC int
-xfs_qm_dqlookup(
-       xfs_mount_t             *mp,
-       xfs_dqid_t              id,
-       xfs_dqhash_t            *qh,
-       xfs_dquot_t             **O_dqpp)
-{
-       xfs_dquot_t             *dqp;
-       uint                    flist_locked;
-
-       ASSERT(mutex_is_locked(&qh->qh_lock));
-
-       flist_locked = B_FALSE;
-
-       /*
-        * Traverse the hashchain looking for a match
-        */
-       list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
-               /*
-                * We already have the hashlock. We don't need the
-                * dqlock to look at the id field of the dquot, since the
-                * id can't be modified without the hashlock anyway.
-                */
-               if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
-                       trace_xfs_dqlookup_found(dqp);
-
-                       /*
-                        * All in core dquots must be on the dqlist of mp
-                        */
-                       ASSERT(!list_empty(&dqp->q_mplist));
-
-                       xfs_dqlock(dqp);
-                       if (dqp->q_nrefs == 0) {
-                               ASSERT(!list_empty(&dqp->q_freelist));
-                               if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
-                                       trace_xfs_dqlookup_want(dqp);
-
-                                       /*
-                                        * We may have raced with dqreclaim_one()
-                                        * (and lost). So, flag that we don't
-                                        * want the dquot to be reclaimed.
-                                        */
-                                       dqp->dq_flags |= XFS_DQ_WANT;
-                                       xfs_dqunlock(dqp);
-                                       mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-                                       xfs_dqlock(dqp);
-                                       dqp->dq_flags &= ~(XFS_DQ_WANT);
-                               }
-                               flist_locked = B_TRUE;
-                       }
-
-                       /*
-                        * id couldn't have changed; we had the hashlock all
-                        * along
-                        */
-                       ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
-
-                       if (flist_locked) {
-                               if (dqp->q_nrefs != 0) {
-                                       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-                                       flist_locked = B_FALSE;
-                               } else {
-                                       /* take it off the freelist */
-                                       trace_xfs_dqlookup_freelist(dqp);
-                                       list_del_init(&dqp->q_freelist);
-                                       xfs_Gqm->qm_dqfrlist_cnt--;
-                               }
-                       }
-
-                       XFS_DQHOLD(dqp);
-
-                       if (flist_locked)
-                               mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-                       /*
-                        * move the dquot to the front of the hashchain
-                        */
-                       ASSERT(mutex_is_locked(&qh->qh_lock));
-                       list_move(&dqp->q_hashlist, &qh->qh_list);
-                       trace_xfs_dqlookup_done(dqp);
-                       *O_dqpp = dqp;
-                       return 0;
-               }
-       }
-
-       *O_dqpp = NULL;
-       ASSERT(mutex_is_locked(&qh->qh_lock));
-       return (1);
-}
-
-/*
- * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
- * a locked dquot, doing an allocation (if requested) as needed.
- * When both an inode and an id are given, the inode's id takes precedence.
- * That is, if the id changes while we don't hold the ilock inside this
- * function, the new dquot is returned, not necessarily the one requested
- * in the id argument.
- */
-int
-xfs_qm_dqget(
-       xfs_mount_t     *mp,
-       xfs_inode_t     *ip,      /* locked inode (optional) */
-       xfs_dqid_t      id,       /* uid/projid/gid depending on type */
-       uint            type,     /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
-       uint            flags,    /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
-       xfs_dquot_t     **O_dqpp) /* OUT : locked incore dquot */
-{
-       xfs_dquot_t     *dqp;
-       xfs_dqhash_t    *h;
-       uint            version;
-       int             error;
-
-       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-       if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
-           (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
-           (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
-               return (ESRCH);
-       }
-       h = XFS_DQ_HASH(mp, id, type);
-
-#ifdef DEBUG
-       if (xfs_do_dqerror) {
-               if ((xfs_dqerror_target == mp->m_ddev_targp) &&
-                   (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
-                       xfs_debug(mp, "Returning error in dqget");
-                       return (EIO);
-               }
-       }
-#endif
-
- again:
-
-#ifdef DEBUG
-       ASSERT(type == XFS_DQ_USER ||
-              type == XFS_DQ_PROJ ||
-              type == XFS_DQ_GROUP);
-       if (ip) {
-               ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-               if (type == XFS_DQ_USER)
-                       ASSERT(ip->i_udquot == NULL);
-               else
-                       ASSERT(ip->i_gdquot == NULL);
-       }
-#endif
-       mutex_lock(&h->qh_lock);
-
-       /*
-        * Look in the cache (hashtable).
-        * The chain is kept locked during lookup.
-        */
-       if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
-               XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
-               /*
-                * The dquot was found, moved to the front of the chain,
-                * taken off the freelist if it was on it, and locked
-                * at this point. Just unlock the hashchain and return.
-                */
-               ASSERT(*O_dqpp);
-               ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
-               mutex_unlock(&h->qh_lock);
-               trace_xfs_dqget_hit(*O_dqpp);
-               return (0);     /* success */
-       }
-       XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
-
-       /*
-        * Dquot cache miss. We don't want to keep the inode lock across
-        * a (potential) disk read. Also we don't want to deal with the lock
-        * ordering between quotainode and this inode. OTOH, dropping the inode
-        * lock here means dealing with a chown that can happen before
-        * we re-acquire the lock.
-        */
-       if (ip)
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       /*
-        * Save the hashchain version stamp, and unlock the chain, so that
-        * we don't keep the lock across a disk read
-        */
-       version = h->qh_version;
-       mutex_unlock(&h->qh_lock);
-
-       /*
-        * Allocate the dquot on the kernel heap, and read the ondisk
-        * portion off the disk. Also, do all the necessary initialization
-        * This can return ENOENT if dquot didn't exist on disk and we didn't
-        * ask it to allocate; ESRCH if quotas got turned off suddenly.
-        */
-       if ((error = xfs_qm_idtodq(mp, id, type,
-                                 flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
-                                          XFS_QMOPT_DOWARN),
-                                 &dqp))) {
-               if (ip)
-                       xfs_ilock(ip, XFS_ILOCK_EXCL);
-               return (error);
-       }
-
-       /*
-        * See if this is mount code calling to look at the overall quota limits
-        * which are stored in the id == 0 user or group's dquot.
-        * Since we may not have done a quotacheck by this point, just return
-        * the dquot without attaching it to any hashtables, lists, etc, or even
-        * taking a reference.
-        * The caller must dqdestroy this once done.
-        */
-       if (flags & XFS_QMOPT_DQSUSER) {
-               ASSERT(id == 0);
-               ASSERT(! ip);
-               goto dqret;
-       }
-
-       /*
-        * Dquot lock comes after hashlock in the lock ordering
-        */
-       if (ip) {
-               xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-               /*
-                * A dquot could be attached to this inode by now, since
-                * we had dropped the ilock.
-                */
-               if (type == XFS_DQ_USER) {
-                       if (!XFS_IS_UQUOTA_ON(mp)) {
-                               /* inode stays locked on return */
-                               xfs_qm_dqdestroy(dqp);
-                               return XFS_ERROR(ESRCH);
-                       }
-                       if (ip->i_udquot) {
-                               xfs_qm_dqdestroy(dqp);
-                               dqp = ip->i_udquot;
-                               xfs_dqlock(dqp);
-                               goto dqret;
-                       }
-               } else {
-                       if (!XFS_IS_OQUOTA_ON(mp)) {
-                               /* inode stays locked on return */
-                               xfs_qm_dqdestroy(dqp);
-                               return XFS_ERROR(ESRCH);
-                       }
-                       if (ip->i_gdquot) {
-                               xfs_qm_dqdestroy(dqp);
-                               dqp = ip->i_gdquot;
-                               xfs_dqlock(dqp);
-                               goto dqret;
-                       }
-               }
-       }
-
-       /*
-        * Hashlock comes after ilock in lock order
-        */
-       mutex_lock(&h->qh_lock);
-       if (version != h->qh_version) {
-               xfs_dquot_t *tmpdqp;
-               /*
-                * Now, see if somebody else put the dquot in the
-                * hashtable before us. This can happen because we didn't
-                * keep the hashchain lock. We don't have to worry about
-                * lock order between the two dquots here since dqp isn't
-                * on any findable lists yet.
-                */
-               if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
-                       /*
-                        * Duplicate found. Just throw away the new dquot
-                        * and start over.
-                        */
-                       xfs_qm_dqput(tmpdqp);
-                       mutex_unlock(&h->qh_lock);
-                       xfs_qm_dqdestroy(dqp);
-                       XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
-                       goto again;
-               }
-       }
-
-       /*
-        * Put the dquot at the beginning of the hash-chain and mp's list
-        * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
-        */
-       ASSERT(mutex_is_locked(&h->qh_lock));
-       dqp->q_hash = h;
-       list_add(&dqp->q_hashlist, &h->qh_list);
-       h->qh_version++;
-
-       /*
-        * Attach this dquot to this filesystem's list of all dquots,
-        * kept inside the mount structure in m_quotainfo field
-        */
-       mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
-
-       /*
-        * We return a locked dquot to the caller, with a reference taken
-        */
-       xfs_dqlock(dqp);
-       dqp->q_nrefs = 1;
-
-       list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
-       mp->m_quotainfo->qi_dquots++;
-       mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-       mutex_unlock(&h->qh_lock);
- dqret:
-       ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       trace_xfs_dqget_miss(dqp);
-       *O_dqpp = dqp;
-       return (0);
-}
-
-
-/*
- * Release a reference to the dquot (decrement ref-count)
- * and unlock it. If there is a group quota attached to this
- * dquot, carefully release that too without tripping over
- * deadlocks'n'stuff.
- */
-void
-xfs_qm_dqput(
-       xfs_dquot_t     *dqp)
-{
-       xfs_dquot_t     *gdqp;
-
-       ASSERT(dqp->q_nrefs > 0);
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-       trace_xfs_dqput(dqp);
-
-       if (dqp->q_nrefs != 1) {
-               dqp->q_nrefs--;
-               xfs_dqunlock(dqp);
-               return;
-       }
-
-       /*
-        * drop the dqlock and acquire the freelist and dqlock
-        * in the right order; but try to get it out-of-order first
-        */
-       if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
-               trace_xfs_dqput_wait(dqp);
-               xfs_dqunlock(dqp);
-               mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-               xfs_dqlock(dqp);
-       }
-
-       while (1) {
-               gdqp = NULL;
-
-               /* We can't depend on nrefs being == 1 here */
-               if (--dqp->q_nrefs == 0) {
-                       trace_xfs_dqput_free(dqp);
-
-                       list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
-                       xfs_Gqm->qm_dqfrlist_cnt++;
-
-                       /*
-                        * If we just added a udquot to the freelist, then
-                        * we want to release the gdquot reference that
-                        * it (probably) has. Otherwise it'll keep the
-                        * gdquot from getting reclaimed.
-                        */
-                       if ((gdqp = dqp->q_gdquot)) {
-                               /*
-                                * Avoid a recursive dqput call
-                                */
-                               xfs_dqlock(gdqp);
-                               dqp->q_gdquot = NULL;
-                       }
-               }
-               xfs_dqunlock(dqp);
-
-               /*
-                * If we had a group quota inside the user quota as a hint,
-                * release it now.
-                */
-               if (! gdqp)
-                       break;
-               dqp = gdqp;
-       }
-       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-}
-
-/*
- * Release a dquot. Flush it if dirty, then dqput() it.
- * dquot must not be locked.
- */
-void
-xfs_qm_dqrele(
-       xfs_dquot_t     *dqp)
-{
-       if (!dqp)
-               return;
-
-       trace_xfs_dqrele(dqp);
-
-       xfs_dqlock(dqp);
-       /*
-        * We don't care to flush it if the dquot is dirty here.
-        * That will create stutters that we want to avoid.
-        * Instead we do a delayed write when we try to reclaim
-        * a dirty dquot. Also xfs_sync will take part of the burden...
-        */
-       xfs_qm_dqput(dqp);
-}
-
-/*
- * This is the dquot flushing I/O completion routine.  It is called
- * from interrupt level when the buffer containing the dquot is
- * flushed to disk.  It is responsible for removing the dquot logitem
- * from the AIL if it has not been re-logged, and unlocking the dquot's
- * flush lock. This behavior is very similar to that of inodes..
- */
-STATIC void
-xfs_qm_dqflush_done(
-       struct xfs_buf          *bp,
-       struct xfs_log_item     *lip)
-{
-       xfs_dq_logitem_t        *qip = (struct xfs_dq_logitem *)lip;
-       xfs_dquot_t             *dqp = qip->qli_dquot;
-       struct xfs_ail          *ailp = lip->li_ailp;
-
-       /*
-        * We only want to pull the item from the AIL if its
-        * location in the log has not changed since we started the flush.
-        * Thus, we only bother if the dquot's lsn has
-        * not changed. First we check the lsn outside the lock
-        * since it's cheaper, and then we recheck while
-        * holding the lock before removing the dquot from the AIL.
-        */
-       if ((lip->li_flags & XFS_LI_IN_AIL) &&
-           lip->li_lsn == qip->qli_flush_lsn) {
-
-               /* xfs_trans_ail_delete() drops the AIL lock. */
-               spin_lock(&ailp->xa_lock);
-               if (lip->li_lsn == qip->qli_flush_lsn)
-                       xfs_trans_ail_delete(ailp, lip);
-               else
-                       spin_unlock(&ailp->xa_lock);
-       }
-
-       /*
-        * Release the dq's flush lock since we're done with it.
-        */
-       xfs_dqfunlock(dqp);
-}
-
-/*
- * Write a modified dquot to disk.
- * The dquot must be locked and the flush lock too taken by caller.
- * The flush lock will not be unlocked until the dquot reaches the disk,
- * but the dquot is free to be unlocked and modified by the caller
- * in the interim. Dquot is still locked on return. This behavior is
- * identical to that of inodes.
- */
-int
-xfs_qm_dqflush(
-       xfs_dquot_t             *dqp,
-       uint                    flags)
-{
-       struct xfs_mount        *mp = dqp->q_mount;
-       struct xfs_buf          *bp;
-       struct xfs_disk_dquot   *ddqp;
-       int                     error;
-
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       ASSERT(!completion_done(&dqp->q_flush));
-
-       trace_xfs_dqflush(dqp);
-
-       /*
-        * If not dirty, or it's pinned and we are not supposed to block, nada.
-        */
-       if (!XFS_DQ_IS_DIRTY(dqp) ||
-           (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
-               xfs_dqfunlock(dqp);
-               return 0;
-       }
-       xfs_qm_dqunpin_wait(dqp);
-
-       /*
-        * This may have been unpinned because the filesystem is shutting
-        * down forcibly. If that's the case we must not write this dquot
-        * to disk, because the log record didn't make it to disk!
-        */
-       if (XFS_FORCED_SHUTDOWN(mp)) {
-               dqp->dq_flags &= ~XFS_DQ_DIRTY;
-               xfs_dqfunlock(dqp);
-               return XFS_ERROR(EIO);
-       }
-
-       /*
-        * Get the buffer containing the on-disk dquot
-        */
-       error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
-                                  mp->m_quotainfo->qi_dqchunklen, 0, &bp);
-       if (error) {
-               ASSERT(error != ENOENT);
-               xfs_dqfunlock(dqp);
-               return error;
-       }
-
-       /*
-        * Calculate the location of the dquot inside the buffer.
-        */
-       ddqp = bp->b_addr + dqp->q_bufoffset;
-
-       /*
-        * A simple sanity check in case we got a corrupted dquot..
-        */
-       error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
-                          XFS_QMOPT_DOWARN, "dqflush (incore copy)");
-       if (error) {
-               xfs_buf_relse(bp);
-               xfs_dqfunlock(dqp);
-               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-               return XFS_ERROR(EIO);
-       }
-
-       /* This is the only portion of data that needs to persist */
-       memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
-
-       /*
-        * Clear the dirty field and remember the flush lsn for later use.
-        */
-       dqp->dq_flags &= ~XFS_DQ_DIRTY;
-
-       xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
-                                       &dqp->q_logitem.qli_item.li_lsn);
-
-       /*
-        * Attach an iodone routine so that we can remove this dquot from the
-        * AIL and release the flush lock once the dquot is synced to disk.
-        */
-       xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
-                                 &dqp->q_logitem.qli_item);
-
-       /*
-        * If the buffer is pinned then push on the log so we won't
-        * get stuck waiting in the write for too long.
-        */
-       if (xfs_buf_ispinned(bp)) {
-               trace_xfs_dqflush_force(dqp);
-               xfs_log_force(mp, 0);
-       }
-
-       if (flags & SYNC_WAIT)
-               error = xfs_bwrite(mp, bp);
-       else
-               xfs_bdwrite(mp, bp);
-
-       trace_xfs_dqflush_done(dqp);
-
-       /*
-        * dqp is still locked, but caller is free to unlock it now.
-        */
-       return error;
-
-}
-
-int
-xfs_qm_dqlock_nowait(
-       xfs_dquot_t *dqp)
-{
-       return mutex_trylock(&dqp->q_qlock);
-}
-
-void
-xfs_dqlock(
-       xfs_dquot_t *dqp)
-{
-       mutex_lock(&dqp->q_qlock);
-}
-
-void
-xfs_dqunlock(
-       xfs_dquot_t *dqp)
-{
-       mutex_unlock(&(dqp->q_qlock));
-       if (dqp->q_logitem.qli_dquot == dqp) {
-               /* Once was dqp->q_mount, but might just have been cleared */
-               xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
-                                       (xfs_log_item_t*)&(dqp->q_logitem));
-       }
-}
-
-
-void
-xfs_dqunlock_nonotify(
-       xfs_dquot_t *dqp)
-{
-       mutex_unlock(&(dqp->q_qlock));
-}
-
-/*
- * Lock two xfs_dquot structures.
- *
- * To avoid deadlocks we always lock the quota structure with
- * the lowerd id first.
- */
-void
-xfs_dqlock2(
-       xfs_dquot_t     *d1,
-       xfs_dquot_t     *d2)
-{
-       if (d1 && d2) {
-               ASSERT(d1 != d2);
-               if (be32_to_cpu(d1->q_core.d_id) >
-                   be32_to_cpu(d2->q_core.d_id)) {
-                       mutex_lock(&d2->q_qlock);
-                       mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
-               } else {
-                       mutex_lock(&d1->q_qlock);
-                       mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
-               }
-       } else if (d1) {
-               mutex_lock(&d1->q_qlock);
-       } else if (d2) {
-               mutex_lock(&d2->q_qlock);
-       }
-}
-
-
-/*
- * Take a dquot out of the mount's dqlist as well as the hashlist.
- * This is called via unmount as well as quotaoff, and the purge
- * will always succeed unless there are soft (temp) references
- * outstanding.
- *
- * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
- * that we're returning! XXXsup - not cool.
- */
-/* ARGSUSED */
-int
-xfs_qm_dqpurge(
-       xfs_dquot_t     *dqp)
-{
-       xfs_dqhash_t    *qh = dqp->q_hash;
-       xfs_mount_t     *mp = dqp->q_mount;
-
-       ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
-       ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
-
-       xfs_dqlock(dqp);
-       /*
-        * We really can't afford to purge a dquot that is
-        * referenced, because these are hard refs.
-        * It shouldn't happen in general because we went thru _all_ inodes in
-        * dqrele_all_inodes before calling this and didn't let the mountlock go.
-        * However it is possible that we have dquots with temporary
-        * references that are not attached to an inode. e.g. see xfs_setattr().
-        */
-       if (dqp->q_nrefs != 0) {
-               xfs_dqunlock(dqp);
-               mutex_unlock(&dqp->q_hash->qh_lock);
-               return (1);
-       }
-
-       ASSERT(!list_empty(&dqp->q_freelist));
-
-       /*
-        * If we're turning off quotas, we have to make sure that, for
-        * example, we don't delete quota disk blocks while dquots are
-        * in the process of getting written to those disk blocks.
-        * This dquot might well be on AIL, and we can't leave it there
-        * if we're turning off quotas. Basically, we need this flush
-        * lock, and are willing to block on it.
-        */
-       if (!xfs_dqflock_nowait(dqp)) {
-               /*
-                * Block on the flush lock after nudging dquot buffer,
-                * if it is incore.
-                */
-               xfs_qm_dqflock_pushbuf_wait(dqp);
-       }
-
-       /*
-        * XXXIf we're turning this type of quotas off, we don't care
-        * about the dirty metadata sitting in this dquot. OTOH, if
-        * we're unmounting, we do care, so we flush it and wait.
-        */
-       if (XFS_DQ_IS_DIRTY(dqp)) {
-               int     error;
-
-               /* dqflush unlocks dqflock */
-               /*
-                * Given that dqpurge is a very rare occurrence, it is OK
-                * that we're holding the hashlist and mplist locks
-                * across the disk write. But, ... XXXsup
-                *
-                * We don't care about getting disk errors here. We need
-                * to purge this dquot anyway, so we go ahead regardless.
-                */
-               error = xfs_qm_dqflush(dqp, SYNC_WAIT);
-               if (error)
-                       xfs_warn(mp, "%s: dquot %p flush failed",
-                               __func__, dqp);
-               xfs_dqflock(dqp);
-       }
-       ASSERT(atomic_read(&dqp->q_pincount) == 0);
-       ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
-              !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
-
-       list_del_init(&dqp->q_hashlist);
-       qh->qh_version++;
-       list_del_init(&dqp->q_mplist);
-       mp->m_quotainfo->qi_dqreclaims++;
-       mp->m_quotainfo->qi_dquots--;
-       /*
-        * XXX Move this to the front of the freelist, if we can get the
-        * freelist lock.
-        */
-       ASSERT(!list_empty(&dqp->q_freelist));
-
-       dqp->q_mount = NULL;
-       dqp->q_hash = NULL;
-       dqp->dq_flags = XFS_DQ_INACTIVE;
-       memset(&dqp->q_core, 0, sizeof(dqp->q_core));
-       xfs_dqfunlock(dqp);
-       xfs_dqunlock(dqp);
-       mutex_unlock(&qh->qh_lock);
-       return (0);
-}
-
-
-/*
- * Give the buffer a little push if it is incore and
- * wait on the flush lock.
- */
-void
-xfs_qm_dqflock_pushbuf_wait(
-       xfs_dquot_t     *dqp)
-{
-       xfs_mount_t     *mp = dqp->q_mount;
-       xfs_buf_t       *bp;
-
-       /*
-        * Check to see if the dquot has been flushed delayed
-        * write.  If so, grab its buffer and send it
-        * out immediately.  We'll be able to acquire
-        * the flush lock when the I/O completes.
-        */
-       bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
-                       mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
-       if (!bp)
-               goto out_lock;
-
-       if (XFS_BUF_ISDELAYWRITE(bp)) {
-               if (xfs_buf_ispinned(bp))
-                       xfs_log_force(mp, 0);
-               xfs_buf_delwri_promote(bp);
-               wake_up_process(bp->b_target->bt_task);
-       }
-       xfs_buf_relse(bp);
-out_lock:
-       xfs_dqflock(dqp);
-}
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
deleted file mode 100644 (file)
index 34b7e94..0000000
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DQUOT_H__
-#define __XFS_DQUOT_H__
-
-/*
- * Dquots are structures that hold quota information about a user or a group,
- * much like inodes are for files. In fact, dquots share many characteristics
- * with inodes. However, dquots can also be a centralized resource, relative
- * to a collection of inodes. In this respect, dquots share some characteristics
- * of the superblock.
- * XFS dquots exploit both those in its algorithms. They make every attempt
- * to not be a bottleneck when quotas are on and have minimal impact, if any,
- * when quotas are off.
- */
-
-/*
- * The hash chain headers (hash buckets)
- */
-typedef struct xfs_dqhash {
-       struct list_head  qh_list;
-       struct mutex      qh_lock;
-       uint              qh_version;   /* ever increasing version */
-       uint              qh_nelems;    /* number of dquots on the list */
-} xfs_dqhash_t;
-
-struct xfs_mount;
-struct xfs_trans;
-
-/*
- * The incore dquot structure
- */
-typedef struct xfs_dquot {
-       uint             dq_flags;      /* various flags (XFS_DQ_*) */
-       struct list_head q_freelist;    /* global free list of dquots */
-       struct list_head q_mplist;      /* mount's list of dquots */
-       struct list_head q_hashlist;    /* gloabl hash list of dquots */
-       xfs_dqhash_t    *q_hash;        /* the hashchain header */
-       struct xfs_mount*q_mount;       /* filesystem this relates to */
-       struct xfs_trans*q_transp;      /* trans this belongs to currently */
-       uint             q_nrefs;       /* # active refs from inodes */
-       xfs_daddr_t      q_blkno;       /* blkno of dquot buffer */
-       int              q_bufoffset;   /* off of dq in buffer (# dquots) */
-       xfs_fileoff_t    q_fileoffset;  /* offset in quotas file */
-
-       struct xfs_dquot*q_gdquot;      /* group dquot, hint only */
-       xfs_disk_dquot_t q_core;        /* actual usage & quotas */
-       xfs_dq_logitem_t q_logitem;     /* dquot log item */
-       xfs_qcnt_t       q_res_bcount;  /* total regular nblks used+reserved */
-       xfs_qcnt_t       q_res_icount;  /* total inos allocd+reserved */
-       xfs_qcnt_t       q_res_rtbcount;/* total realtime blks used+reserved */
-       struct mutex     q_qlock;       /* quota lock */
-       struct completion q_flush;      /* flush completion queue */
-       atomic_t          q_pincount;   /* dquot pin count */
-       wait_queue_head_t q_pinwait;    /* dquot pinning wait queue */
-} xfs_dquot_t;
-
-/*
- * Lock hierarchy for q_qlock:
- *     XFS_QLOCK_NORMAL is the implicit default,
- *     XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
- */
-enum {
-       XFS_QLOCK_NORMAL = 0,
-       XFS_QLOCK_NESTED,
-};
-
-#define XFS_DQHOLD(dqp)                ((dqp)->q_nrefs++)
-
-/*
- * Manage the q_flush completion queue embedded in the dquot.  This completion
- * queue synchronizes processes attempting to flush the in-core dquot back to
- * disk.
- */
-static inline void xfs_dqflock(xfs_dquot_t *dqp)
-{
-       wait_for_completion(&dqp->q_flush);
-}
-
-static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
-{
-       return try_wait_for_completion(&dqp->q_flush);
-}
-
-static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
-{
-       complete(&dqp->q_flush);
-}
-
-#define XFS_DQ_IS_LOCKED(dqp)  (mutex_is_locked(&((dqp)->q_qlock)))
-#define XFS_DQ_IS_DIRTY(dqp)   ((dqp)->dq_flags & XFS_DQ_DIRTY)
-#define XFS_QM_ISUDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_USER)
-#define XFS_QM_ISPDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_PROJ)
-#define XFS_QM_ISGDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_GROUP)
-#define XFS_DQ_TO_QINF(dqp)    ((dqp)->q_mount->m_quotainfo)
-#define XFS_DQ_TO_QIP(dqp)     (XFS_QM_ISUDQ(dqp) ? \
-                                XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
-                                XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
-
-#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
-                                    (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
-                                    (XFS_IS_OQUOTA_ON((d)->q_mount))))
-
-extern void            xfs_qm_dqdestroy(xfs_dquot_t *);
-extern int             xfs_qm_dqflush(xfs_dquot_t *, uint);
-extern int             xfs_qm_dqpurge(xfs_dquot_t *);
-extern void            xfs_qm_dqunpin_wait(xfs_dquot_t *);
-extern int             xfs_qm_dqlock_nowait(xfs_dquot_t *);
-extern void            xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
-extern void            xfs_qm_adjust_dqtimers(xfs_mount_t *,
-                                       xfs_disk_dquot_t *);
-extern void            xfs_qm_adjust_dqlimits(xfs_mount_t *,
-                                       xfs_disk_dquot_t *);
-extern int             xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
-                                       xfs_dqid_t, uint, uint, xfs_dquot_t **);
-extern void            xfs_qm_dqput(xfs_dquot_t *);
-extern void            xfs_dqlock(xfs_dquot_t *);
-extern void            xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
-extern void            xfs_dqunlock(xfs_dquot_t *);
-extern void            xfs_dqunlock_nonotify(xfs_dquot_t *);
-
-#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
deleted file mode 100644 (file)
index 9e0e2fa..0000000
+++ /dev/null
@@ -1,529 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_itable.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_priv.h"
-#include "xfs_qm.h"
-
-static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
-{
-       return container_of(lip, struct xfs_dq_logitem, qli_item);
-}
-
-/*
- * returns the number of iovecs needed to log the given dquot item.
- */
-STATIC uint
-xfs_qm_dquot_logitem_size(
-       struct xfs_log_item     *lip)
-{
-       /*
-        * we need only two iovecs, one for the format, one for the real thing
-        */
-       return 2;
-}
-
-/*
- * fills in the vector of log iovecs for the given dquot log item.
- */
-STATIC void
-xfs_qm_dquot_logitem_format(
-       struct xfs_log_item     *lip,
-       struct xfs_log_iovec    *logvec)
-{
-       struct xfs_dq_logitem   *qlip = DQUOT_ITEM(lip);
-
-       logvec->i_addr = &qlip->qli_format;
-       logvec->i_len  = sizeof(xfs_dq_logformat_t);
-       logvec->i_type = XLOG_REG_TYPE_QFORMAT;
-       logvec++;
-       logvec->i_addr = &qlip->qli_dquot->q_core;
-       logvec->i_len  = sizeof(xfs_disk_dquot_t);
-       logvec->i_type = XLOG_REG_TYPE_DQUOT;
-
-       ASSERT(2 == lip->li_desc->lid_size);
-       qlip->qli_format.qlf_size = 2;
-
-}
-
-/*
- * Increment the pin count of the given dquot.
- */
-STATIC void
-xfs_qm_dquot_logitem_pin(
-       struct xfs_log_item     *lip)
-{
-       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
-
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       atomic_inc(&dqp->q_pincount);
-}
-
-/*
- * Decrement the pin count of the given dquot, and wake up
- * anyone in xfs_dqwait_unpin() if the count goes to 0.         The
- * dquot must have been previously pinned with a call to
- * xfs_qm_dquot_logitem_pin().
- */
-STATIC void
-xfs_qm_dquot_logitem_unpin(
-       struct xfs_log_item     *lip,
-       int                     remove)
-{
-       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
-
-       ASSERT(atomic_read(&dqp->q_pincount) > 0);
-       if (atomic_dec_and_test(&dqp->q_pincount))
-               wake_up(&dqp->q_pinwait);
-}
-
-/*
- * Given the logitem, this writes the corresponding dquot entry to disk
- * asynchronously. This is called with the dquot entry securely locked;
- * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
- * at the end.
- */
-STATIC void
-xfs_qm_dquot_logitem_push(
-       struct xfs_log_item     *lip)
-{
-       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
-       int                     error;
-
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       ASSERT(!completion_done(&dqp->q_flush));
-
-       /*
-        * Since we were able to lock the dquot's flush lock and
-        * we found it on the AIL, the dquot must be dirty.  This
-        * is because the dquot is removed from the AIL while still
-        * holding the flush lock in xfs_dqflush_done().  Thus, if
-        * we found it in the AIL and were able to obtain the flush
-        * lock without sleeping, then there must not have been
-        * anyone in the process of flushing the dquot.
-        */
-       error = xfs_qm_dqflush(dqp, 0);
-       if (error)
-               xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
-                       __func__, error, dqp);
-       xfs_dqunlock(dqp);
-}
-
-STATIC xfs_lsn_t
-xfs_qm_dquot_logitem_committed(
-       struct xfs_log_item     *lip,
-       xfs_lsn_t               lsn)
-{
-       /*
-        * We always re-log the entire dquot when it becomes dirty,
-        * so, the latest copy _is_ the only one that matters.
-        */
-       return lsn;
-}
-
-/*
- * This is called to wait for the given dquot to be unpinned.
- * Most of these pin/unpin routines are plagiarized from inode code.
- */
-void
-xfs_qm_dqunpin_wait(
-       struct xfs_dquot        *dqp)
-{
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       if (atomic_read(&dqp->q_pincount) == 0)
-               return;
-
-       /*
-        * Give the log a push so we don't wait here too long.
-        */
-       xfs_log_force(dqp->q_mount, 0);
-       wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
-}
-
-/*
- * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
- * the dquot is locked by us, but the flush lock isn't. So, here we are
- * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
- * If so, we want to push it out to help us take this item off the AIL as soon
- * as possible.
- *
- * We must not be holding the AIL lock at this point. Calling incore() to
- * search the buffer cache can be a time consuming thing, and AIL lock is a
- * spinlock.
- */
-STATIC void
-xfs_qm_dquot_logitem_pushbuf(
-       struct xfs_log_item     *lip)
-{
-       struct xfs_dq_logitem   *qlip = DQUOT_ITEM(lip);
-       struct xfs_dquot        *dqp = qlip->qli_dquot;
-       struct xfs_buf          *bp;
-
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-       /*
-        * If flushlock isn't locked anymore, chances are that the
-        * inode flush completed and the inode was taken off the AIL.
-        * So, just get out.
-        */
-       if (completion_done(&dqp->q_flush) ||
-           !(lip->li_flags & XFS_LI_IN_AIL)) {
-               xfs_dqunlock(dqp);
-               return;
-       }
-
-       bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
-                       dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
-       xfs_dqunlock(dqp);
-       if (!bp)
-               return;
-       if (XFS_BUF_ISDELAYWRITE(bp))
-               xfs_buf_delwri_promote(bp);
-       xfs_buf_relse(bp);
-}
-
-/*
- * This is called to attempt to lock the dquot associated with this
- * dquot log item.  Don't sleep on the dquot lock or the flush lock.
- * If the flush lock is already held, indicating that the dquot has
- * been or is in the process of being flushed, then see if we can
- * find the dquot's buffer in the buffer cache without sleeping.  If
- * we can and it is marked delayed write, then we want to send it out.
- * We delay doing so until the push routine, though, to avoid sleeping
- * in any device strategy routines.
- */
-STATIC uint
-xfs_qm_dquot_logitem_trylock(
-       struct xfs_log_item     *lip)
-{
-       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
-
-       if (atomic_read(&dqp->q_pincount) > 0)
-               return XFS_ITEM_PINNED;
-
-       if (!xfs_qm_dqlock_nowait(dqp))
-               return XFS_ITEM_LOCKED;
-
-       if (!xfs_dqflock_nowait(dqp)) {
-               /*
-                * dquot has already been flushed to the backing buffer,
-                * leave it locked, pushbuf routine will unlock it.
-                */
-               return XFS_ITEM_PUSHBUF;
-       }
-
-       ASSERT(lip->li_flags & XFS_LI_IN_AIL);
-       return XFS_ITEM_SUCCESS;
-}
-
-/*
- * Unlock the dquot associated with the log item.
- * Clear the fields of the dquot and dquot log item that
- * are specific to the current transaction.  If the
- * hold flags is set, do not unlock the dquot.
- */
-STATIC void
-xfs_qm_dquot_logitem_unlock(
-       struct xfs_log_item     *lip)
-{
-       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
-
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-       /*
-        * Clear the transaction pointer in the dquot
-        */
-       dqp->q_transp = NULL;
-
-       /*
-        * dquots are never 'held' from getting unlocked at the end of
-        * a transaction.  Their locking and unlocking is hidden inside the
-        * transaction layer, within trans_commit. Hence, no LI_HOLD flag
-        * for the logitem.
-        */
-       xfs_dqunlock(dqp);
-}
-
-/*
- * this needs to stamp an lsn into the dquot, I think.
- * rpc's that look at user dquot's would then have to
- * push on the dependency recorded in the dquot
- */
-STATIC void
-xfs_qm_dquot_logitem_committing(
-       struct xfs_log_item     *lip,
-       xfs_lsn_t               lsn)
-{
-}
-
-/*
- * This is the ops vector for dquots
- */
-static struct xfs_item_ops xfs_dquot_item_ops = {
-       .iop_size       = xfs_qm_dquot_logitem_size,
-       .iop_format     = xfs_qm_dquot_logitem_format,
-       .iop_pin        = xfs_qm_dquot_logitem_pin,
-       .iop_unpin      = xfs_qm_dquot_logitem_unpin,
-       .iop_trylock    = xfs_qm_dquot_logitem_trylock,
-       .iop_unlock     = xfs_qm_dquot_logitem_unlock,
-       .iop_committed  = xfs_qm_dquot_logitem_committed,
-       .iop_push       = xfs_qm_dquot_logitem_push,
-       .iop_pushbuf    = xfs_qm_dquot_logitem_pushbuf,
-       .iop_committing = xfs_qm_dquot_logitem_committing
-};
-
-/*
- * Initialize the dquot log item for a newly allocated dquot.
- * The dquot isn't locked at this point, but it isn't on any of the lists
- * either, so we don't care.
- */
-void
-xfs_qm_dquot_logitem_init(
-       struct xfs_dquot        *dqp)
-{
-       struct xfs_dq_logitem   *lp = &dqp->q_logitem;
-
-       xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
-                                       &xfs_dquot_item_ops);
-       lp->qli_dquot = dqp;
-       lp->qli_format.qlf_type = XFS_LI_DQUOT;
-       lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
-       lp->qli_format.qlf_blkno = dqp->q_blkno;
-       lp->qli_format.qlf_len = 1;
-       /*
-        * This is just the offset of this dquot within its buffer
-        * (which is currently 1 FSB and probably won't change).
-        * Hence 32 bits for this offset should be just fine.
-        * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
-        * here, and recompute it at recovery time.
-        */
-       lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
-}
-
-/*------------------  QUOTAOFF LOG ITEMS  -------------------*/
-
-static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
-{
-       return container_of(lip, struct xfs_qoff_logitem, qql_item);
-}
-
-
-/*
- * This returns the number of iovecs needed to log the given quotaoff item.
- * We only need 1 iovec for an quotaoff item.  It just logs the
- * quotaoff_log_format structure.
- */
-STATIC uint
-xfs_qm_qoff_logitem_size(
-       struct xfs_log_item     *lip)
-{
-       return 1;
-}
-
-/*
- * This is called to fill in the vector of log iovecs for the
- * given quotaoff log item. We use only 1 iovec, and we point that
- * at the quotaoff_log_format structure embedded in the quotaoff item.
- * It is at this point that we assert that all of the extent
- * slots in the quotaoff item have been filled.
- */
-STATIC void
-xfs_qm_qoff_logitem_format(
-       struct xfs_log_item     *lip,
-       struct xfs_log_iovec    *log_vector)
-{
-       struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
-
-       ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
-
-       log_vector->i_addr = &qflip->qql_format;
-       log_vector->i_len = sizeof(xfs_qoff_logitem_t);
-       log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
-       qflip->qql_format.qf_size = 1;
-}
-
-/*
- * Pinning has no meaning for an quotaoff item, so just return.
- */
-STATIC void
-xfs_qm_qoff_logitem_pin(
-       struct xfs_log_item     *lip)
-{
-}
-
-/*
- * Since pinning has no meaning for an quotaoff item, unpinning does
- * not either.
- */
-STATIC void
-xfs_qm_qoff_logitem_unpin(
-       struct xfs_log_item     *lip,
-       int                     remove)
-{
-}
-
-/*
- * Quotaoff items have no locking, so just return success.
- */
-STATIC uint
-xfs_qm_qoff_logitem_trylock(
-       struct xfs_log_item     *lip)
-{
-       return XFS_ITEM_LOCKED;
-}
-
-/*
- * Quotaoff items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
-STATIC void
-xfs_qm_qoff_logitem_unlock(
-       struct xfs_log_item     *lip)
-{
-}
-
-/*
- * The quotaoff-start-item is logged only once and cannot be moved in the log,
- * so simply return the lsn at which it's been logged.
- */
-STATIC xfs_lsn_t
-xfs_qm_qoff_logitem_committed(
-       struct xfs_log_item     *lip,
-       xfs_lsn_t               lsn)
-{
-       return lsn;
-}
-
-/*
- * There isn't much you can do to push on an quotaoff item.  It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-STATIC void
-xfs_qm_qoff_logitem_push(
-       struct xfs_log_item     *lip)
-{
-}
-
-
-STATIC xfs_lsn_t
-xfs_qm_qoffend_logitem_committed(
-       struct xfs_log_item     *lip,
-       xfs_lsn_t               lsn)
-{
-       struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
-       struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
-       struct xfs_ail          *ailp = qfs->qql_item.li_ailp;
-
-       /*
-        * Delete the qoff-start logitem from the AIL.
-        * xfs_trans_ail_delete() drops the AIL lock.
-        */
-       spin_lock(&ailp->xa_lock);
-       xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
-
-       kmem_free(qfs);
-       kmem_free(qfe);
-       return (xfs_lsn_t)-1;
-}
-
-/*
- * XXX rcc - don't know quite what to do with this.  I think we can
- * just ignore it.  The only time that isn't the case is if we allow
- * the client to somehow see that quotas have been turned off in which
- * we can't allow that to get back until the quotaoff hits the disk.
- * So how would that happen?  Also, do we need different routines for
- * quotaoff start and quotaoff end?  I suspect the answer is yes but
- * to be sure, I need to look at the recovery code and see how quota off
- * recovery is handled (do we roll forward or back or do something else).
- * If we roll forwards or backwards, then we need two separate routines,
- * one that does nothing and one that stamps in the lsn that matters
- * (truly makes the quotaoff irrevocable).  If we do something else,
- * then maybe we don't need two.
- */
-STATIC void
-xfs_qm_qoff_logitem_committing(
-       struct xfs_log_item     *lip,
-       xfs_lsn_t               commit_lsn)
-{
-}
-
-static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
-       .iop_size       = xfs_qm_qoff_logitem_size,
-       .iop_format     = xfs_qm_qoff_logitem_format,
-       .iop_pin        = xfs_qm_qoff_logitem_pin,
-       .iop_unpin      = xfs_qm_qoff_logitem_unpin,
-       .iop_trylock    = xfs_qm_qoff_logitem_trylock,
-       .iop_unlock     = xfs_qm_qoff_logitem_unlock,
-       .iop_committed  = xfs_qm_qoffend_logitem_committed,
-       .iop_push       = xfs_qm_qoff_logitem_push,
-       .iop_committing = xfs_qm_qoff_logitem_committing
-};
-
-/*
- * This is the ops vector shared by all quotaoff-start log items.
- */
-static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
-       .iop_size       = xfs_qm_qoff_logitem_size,
-       .iop_format     = xfs_qm_qoff_logitem_format,
-       .iop_pin        = xfs_qm_qoff_logitem_pin,
-       .iop_unpin      = xfs_qm_qoff_logitem_unpin,
-       .iop_trylock    = xfs_qm_qoff_logitem_trylock,
-       .iop_unlock     = xfs_qm_qoff_logitem_unlock,
-       .iop_committed  = xfs_qm_qoff_logitem_committed,
-       .iop_push       = xfs_qm_qoff_logitem_push,
-       .iop_committing = xfs_qm_qoff_logitem_committing
-};
-
-/*
- * Allocate and initialize an quotaoff item of the correct quota type(s).
- */
-struct xfs_qoff_logitem *
-xfs_qm_qoff_logitem_init(
-       struct xfs_mount        *mp,
-       struct xfs_qoff_logitem *start,
-       uint                    flags)
-{
-       struct xfs_qoff_logitem *qf;
-
-       qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
-
-       xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
-                       &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
-       qf->qql_item.li_mountp = mp;
-       qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
-       qf->qql_format.qf_flags = flags;
-       qf->qql_start_lip = start;
-       return qf;
-}
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
deleted file mode 100644 (file)
index 5acae2a..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_DQUOT_ITEM_H__
-#define __XFS_DQUOT_ITEM_H__
-
-struct xfs_dquot;
-struct xfs_trans;
-struct xfs_mount;
-struct xfs_qoff_logitem;
-
-typedef struct xfs_dq_logitem {
-       xfs_log_item_t           qli_item;         /* common portion */
-       struct xfs_dquot        *qli_dquot;        /* dquot ptr */
-       xfs_lsn_t                qli_flush_lsn;    /* lsn at last flush */
-       xfs_dq_logformat_t       qli_format;       /* logged structure */
-} xfs_dq_logitem_t;
-
-typedef struct xfs_qoff_logitem {
-       xfs_log_item_t           qql_item;      /* common portion */
-       struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
-       xfs_qoff_logformat_t     qql_format;    /* logged structure */
-} xfs_qoff_logitem_t;
-
-
-extern void               xfs_qm_dquot_logitem_init(struct xfs_dquot *);
-extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
-                                       struct xfs_qoff_logitem *, uint);
-extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
-                                       struct xfs_qoff_logitem *, uint);
-extern void               xfs_trans_log_quotaoff_item(struct xfs_trans *,
-                                       struct xfs_qoff_logitem *);
-
-#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
deleted file mode 100644 (file)
index 9a0aa76..0000000
+++ /dev/null
@@ -1,2416 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_ialloc.h"
-#include "xfs_itable.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_bmap.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_space.h"
-#include "xfs_utils.h"
-#include "xfs_qm.h"
-#include "xfs_trace.h"
-
-/*
- * The global quota manager. There is only one of these for the entire
- * system, _not_ one per file system. XQM keeps track of the overall
- * quota functionality, including maintaining the freelist and hash
- * tables of dquots.
- */
-struct mutex   xfs_Gqm_lock;
-struct xfs_qm  *xfs_Gqm;
-uint           ndquot;
-
-kmem_zone_t    *qm_dqzone;
-kmem_zone_t    *qm_dqtrxzone;
-
-STATIC void    xfs_qm_list_init(xfs_dqlist_t *, char *, int);
-STATIC void    xfs_qm_list_destroy(xfs_dqlist_t *);
-
-STATIC int     xfs_qm_init_quotainos(xfs_mount_t *);
-STATIC int     xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int     xfs_qm_shake(struct shrinker *, struct shrink_control *);
-
-static struct shrinker xfs_qm_shaker = {
-       .shrink = xfs_qm_shake,
-       .seeks = DEFAULT_SEEKS,
-};
-
-/*
- * Initialize the XQM structure.
- * Note that there is not one quota manager per file system.
- */
-STATIC struct xfs_qm *
-xfs_Gqm_init(void)
-{
-       xfs_dqhash_t    *udqhash, *gdqhash;
-       xfs_qm_t        *xqm;
-       size_t          hsize;
-       uint            i;
-
-       /*
-        * Initialize the dquot hash tables.
-        */
-       udqhash = kmem_zalloc_greedy(&hsize,
-                                    XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
-                                    XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
-       if (!udqhash)
-               goto out;
-
-       gdqhash = kmem_zalloc_large(hsize);
-       if (!gdqhash)
-               goto out_free_udqhash;
-
-       hsize /= sizeof(xfs_dqhash_t);
-       ndquot = hsize << 8;
-
-       xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
-       xqm->qm_dqhashmask = hsize - 1;
-       xqm->qm_usr_dqhtable = udqhash;
-       xqm->qm_grp_dqhtable = gdqhash;
-       ASSERT(xqm->qm_usr_dqhtable != NULL);
-       ASSERT(xqm->qm_grp_dqhtable != NULL);
-
-       for (i = 0; i < hsize; i++) {
-               xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
-               xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
-       }
-
-       /*
-        * Freelist of all dquots of all file systems
-        */
-       INIT_LIST_HEAD(&xqm->qm_dqfrlist);
-       xqm->qm_dqfrlist_cnt = 0;
-       mutex_init(&xqm->qm_dqfrlist_lock);
-
-       /*
-        * dquot zone. we register our own low-memory callback.
-        */
-       if (!qm_dqzone) {
-               xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
-                                               "xfs_dquots");
-               qm_dqzone = xqm->qm_dqzone;
-       } else
-               xqm->qm_dqzone = qm_dqzone;
-
-       register_shrinker(&xfs_qm_shaker);
-
-       /*
-        * The t_dqinfo portion of transactions.
-        */
-       if (!qm_dqtrxzone) {
-               xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
-                                                  "xfs_dqtrx");
-               qm_dqtrxzone = xqm->qm_dqtrxzone;
-       } else
-               xqm->qm_dqtrxzone = qm_dqtrxzone;
-
-       atomic_set(&xqm->qm_totaldquots, 0);
-       xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
-       xqm->qm_nrefs = 0;
-       return xqm;
-
- out_free_udqhash:
-       kmem_free_large(udqhash);
- out:
-       return NULL;
-}
-
-/*
- * Destroy the global quota manager when its reference count goes to zero.
- */
-STATIC void
-xfs_qm_destroy(
-       struct xfs_qm   *xqm)
-{
-       struct xfs_dquot *dqp, *n;
-       int             hsize, i;
-
-       ASSERT(xqm != NULL);
-       ASSERT(xqm->qm_nrefs == 0);
-       unregister_shrinker(&xfs_qm_shaker);
-       hsize = xqm->qm_dqhashmask + 1;
-       for (i = 0; i < hsize; i++) {
-               xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
-               xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
-       }
-       kmem_free_large(xqm->qm_usr_dqhtable);
-       kmem_free_large(xqm->qm_grp_dqhtable);
-       xqm->qm_usr_dqhtable = NULL;
-       xqm->qm_grp_dqhtable = NULL;
-       xqm->qm_dqhashmask = 0;
-
-       /* frlist cleanup */
-       mutex_lock(&xqm->qm_dqfrlist_lock);
-       list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
-               xfs_dqlock(dqp);
-               list_del_init(&dqp->q_freelist);
-               xfs_Gqm->qm_dqfrlist_cnt--;
-               xfs_dqunlock(dqp);
-               xfs_qm_dqdestroy(dqp);
-       }
-       mutex_unlock(&xqm->qm_dqfrlist_lock);
-       mutex_destroy(&xqm->qm_dqfrlist_lock);
-       kmem_free(xqm);
-}
-
-/*
- * Called at mount time to let XQM know that another file system is
- * starting quotas. This isn't crucial information as the individual mount
- * structures are pretty independent, but it helps the XQM keep a
- * global view of what's going on.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_hold_quotafs_ref(
-       struct xfs_mount *mp)
-{
-       /*
-        * Need to lock the xfs_Gqm structure for things like this. For example,
-        * the structure could disappear between the entry to this routine and
-        * a HOLD operation if not locked.
-        */
-       mutex_lock(&xfs_Gqm_lock);
-
-       if (!xfs_Gqm) {
-               xfs_Gqm = xfs_Gqm_init();
-               if (!xfs_Gqm) {
-                       mutex_unlock(&xfs_Gqm_lock);
-                       return ENOMEM;
-               }
-       }
-
-       /*
-        * We can keep a list of all filesystems with quotas mounted for
-        * debugging and statistical purposes, but ...
-        * Just take a reference and get out.
-        */
-       xfs_Gqm->qm_nrefs++;
-       mutex_unlock(&xfs_Gqm_lock);
-
-       return 0;
-}
-
-
-/*
- * Release the reference that a filesystem took at mount time,
- * so that we know when we need to destroy the entire quota manager.
- */
-/* ARGSUSED */
-STATIC void
-xfs_qm_rele_quotafs_ref(
-       struct xfs_mount *mp)
-{
-       xfs_dquot_t     *dqp, *n;
-
-       ASSERT(xfs_Gqm);
-       ASSERT(xfs_Gqm->qm_nrefs > 0);
-
-       /*
-        * Go thru the freelist and destroy all inactive dquots.
-        */
-       mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-
-       list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
-               xfs_dqlock(dqp);
-               if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-                       ASSERT(dqp->q_mount == NULL);
-                       ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-                       ASSERT(list_empty(&dqp->q_hashlist));
-                       ASSERT(list_empty(&dqp->q_mplist));
-                       list_del_init(&dqp->q_freelist);
-                       xfs_Gqm->qm_dqfrlist_cnt--;
-                       xfs_dqunlock(dqp);
-                       xfs_qm_dqdestroy(dqp);
-               } else {
-                       xfs_dqunlock(dqp);
-               }
-       }
-       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-
-       /*
-        * Destroy the entire XQM. If somebody mounts with quotaon, this'll
-        * be restarted.
-        */
-       mutex_lock(&xfs_Gqm_lock);
-       if (--xfs_Gqm->qm_nrefs == 0) {
-               xfs_qm_destroy(xfs_Gqm);
-               xfs_Gqm = NULL;
-       }
-       mutex_unlock(&xfs_Gqm_lock);
-}
-
-/*
- * Just destroy the quotainfo structure.
- */
-void
-xfs_qm_unmount(
-       struct xfs_mount        *mp)
-{
-       if (mp->m_quotainfo) {
-               xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
-               xfs_qm_destroy_quotainfo(mp);
-       }
-}
-
-
-/*
- * This is called from xfs_mountfs to start quotas and initialize all
- * necessary data structures like quotainfo.  This is also responsible for
- * running a quotacheck as necessary.  We are guaranteed that the superblock
- * is consistently read in at this point.
- *
- * If we fail here, the mount will continue with quota turned off. We don't
- * need to inidicate success or failure at all.
- */
-void
-xfs_qm_mount_quotas(
-       xfs_mount_t     *mp)
-{
-       int             error = 0;
-       uint            sbf;
-
-       /*
-        * If quotas on realtime volumes is not supported, we disable
-        * quotas immediately.
-        */
-       if (mp->m_sb.sb_rextents) {
-               xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
-               mp->m_qflags = 0;
-               goto write_changes;
-       }
-
-       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-       /*
-        * Allocate the quotainfo structure inside the mount struct, and
-        * create quotainode(s), and change/rev superblock if necessary.
-        */
-       error = xfs_qm_init_quotainfo(mp);
-       if (error) {
-               /*
-                * We must turn off quotas.
-                */
-               ASSERT(mp->m_quotainfo == NULL);
-               mp->m_qflags = 0;
-               goto write_changes;
-       }
-       /*
-        * If any of the quotas are not consistent, do a quotacheck.
-        */
-       if (XFS_QM_NEED_QUOTACHECK(mp)) {
-               error = xfs_qm_quotacheck(mp);
-               if (error) {
-                       /* Quotacheck failed and disabled quotas. */
-                       return;
-               }
-       }
-       /* 
-        * If one type of quotas is off, then it will lose its
-        * quotachecked status, since we won't be doing accounting for
-        * that type anymore.
-        */
-       if (!XFS_IS_UQUOTA_ON(mp))
-               mp->m_qflags &= ~XFS_UQUOTA_CHKD;
-       if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
-               mp->m_qflags &= ~XFS_OQUOTA_CHKD;
-
- write_changes:
-       /*
-        * We actually don't have to acquire the m_sb_lock at all.
-        * This can only be called from mount, and that's single threaded. XXX
-        */
-       spin_lock(&mp->m_sb_lock);
-       sbf = mp->m_sb.sb_qflags;
-       mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
-       spin_unlock(&mp->m_sb_lock);
-
-       if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
-               if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
-                       /*
-                        * We could only have been turning quotas off.
-                        * We aren't in very good shape actually because
-                        * the incore structures are convinced that quotas are
-                        * off, but the on disk superblock doesn't know that !
-                        */
-                       ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
-                       xfs_alert(mp, "%s: Superblock update failed!",
-                               __func__);
-               }
-       }
-
-       if (error) {
-               xfs_warn(mp, "Failed to initialize disk quotas.");
-               return;
-       }
-}
-
-/*
- * Called from the vfsops layer.
- */
-void
-xfs_qm_unmount_quotas(
-       xfs_mount_t     *mp)
-{
-       /*
-        * Release the dquots that root inode, et al might be holding,
-        * before we flush quotas and blow away the quotainfo structure.
-        */
-       ASSERT(mp->m_rootip);
-       xfs_qm_dqdetach(mp->m_rootip);
-       if (mp->m_rbmip)
-               xfs_qm_dqdetach(mp->m_rbmip);
-       if (mp->m_rsumip)
-               xfs_qm_dqdetach(mp->m_rsumip);
-
-       /*
-        * Release the quota inodes.
-        */
-       if (mp->m_quotainfo) {
-               if (mp->m_quotainfo->qi_uquotaip) {
-                       IRELE(mp->m_quotainfo->qi_uquotaip);
-                       mp->m_quotainfo->qi_uquotaip = NULL;
-               }
-               if (mp->m_quotainfo->qi_gquotaip) {
-                       IRELE(mp->m_quotainfo->qi_gquotaip);
-                       mp->m_quotainfo->qi_gquotaip = NULL;
-               }
-       }
-}
-
-/*
- * Flush all dquots of the given file system to disk. The dquots are
- * _not_ purged from memory here, just their data written to disk.
- */
-STATIC int
-xfs_qm_dqflush_all(
-       struct xfs_mount        *mp,
-       int                     sync_mode)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       int                     recl;
-       struct xfs_dquot        *dqp;
-       int                     error;
-
-       if (!q)
-               return 0;
-again:
-       mutex_lock(&q->qi_dqlist_lock);
-       list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
-               xfs_dqlock(dqp);
-               if (! XFS_DQ_IS_DIRTY(dqp)) {
-                       xfs_dqunlock(dqp);
-                       continue;
-               }
-
-               /* XXX a sentinel would be better */
-               recl = q->qi_dqreclaims;
-               if (!xfs_dqflock_nowait(dqp)) {
-                       /*
-                        * If we can't grab the flush lock then check
-                        * to see if the dquot has been flushed delayed
-                        * write.  If so, grab its buffer and send it
-                        * out immediately.  We'll be able to acquire
-                        * the flush lock when the I/O completes.
-                        */
-                       xfs_qm_dqflock_pushbuf_wait(dqp);
-               }
-               /*
-                * Let go of the mplist lock. We don't want to hold it
-                * across a disk write.
-                */
-               mutex_unlock(&q->qi_dqlist_lock);
-               error = xfs_qm_dqflush(dqp, sync_mode);
-               xfs_dqunlock(dqp);
-               if (error)
-                       return error;
-
-               mutex_lock(&q->qi_dqlist_lock);
-               if (recl != q->qi_dqreclaims) {
-                       mutex_unlock(&q->qi_dqlist_lock);
-                       /* XXX restart limit */
-                       goto again;
-               }
-       }
-
-       mutex_unlock(&q->qi_dqlist_lock);
-       /* return ! busy */
-       return 0;
-}
-/*
- * Release the group dquot pointers the user dquots may be
- * carrying around as a hint. mplist is locked on entry and exit.
- */
-STATIC void
-xfs_qm_detach_gdquots(
-       struct xfs_mount        *mp)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       struct xfs_dquot        *dqp, *gdqp;
-       int                     nrecl;
-
- again:
-       ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
-       list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
-               xfs_dqlock(dqp);
-               if ((gdqp = dqp->q_gdquot)) {
-                       xfs_dqlock(gdqp);
-                       dqp->q_gdquot = NULL;
-               }
-               xfs_dqunlock(dqp);
-
-               if (gdqp) {
-                       /*
-                        * Can't hold the mplist lock across a dqput.
-                        * XXXmust convert to marker based iterations here.
-                        */
-                       nrecl = q->qi_dqreclaims;
-                       mutex_unlock(&q->qi_dqlist_lock);
-                       xfs_qm_dqput(gdqp);
-
-                       mutex_lock(&q->qi_dqlist_lock);
-                       if (nrecl != q->qi_dqreclaims)
-                               goto again;
-               }
-       }
-}
-
-/*
- * Go through all the incore dquots of this file system and take them
- * off the mplist and hashlist, if the dquot type matches the dqtype
- * parameter. This is used when turning off quota accounting for
- * users and/or groups, as well as when the filesystem is unmounting.
- */
-STATIC int
-xfs_qm_dqpurge_int(
-       struct xfs_mount        *mp,
-       uint                    flags)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       struct xfs_dquot        *dqp, *n;
-       uint                    dqtype;
-       int                     nrecl;
-       int                     nmisses;
-
-       if (!q)
-               return 0;
-
-       dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
-       dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
-       dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
-
-       mutex_lock(&q->qi_dqlist_lock);
-
-       /*
-        * In the first pass through all incore dquots of this filesystem,
-        * we release the group dquot pointers the user dquots may be
-        * carrying around as a hint. We need to do this irrespective of
-        * what's being turned off.
-        */
-       xfs_qm_detach_gdquots(mp);
-
-      again:
-       nmisses = 0;
-       ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
-       /*
-        * Try to get rid of all of the unwanted dquots. The idea is to
-        * get them off mplist and hashlist, but leave them on freelist.
-        */
-       list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
-               /*
-                * It's OK to look at the type without taking dqlock here.
-                * We're holding the mplist lock here, and that's needed for
-                * a dqreclaim.
-                */
-               if ((dqp->dq_flags & dqtype) == 0)
-                       continue;
-
-               if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
-                       nrecl = q->qi_dqreclaims;
-                       mutex_unlock(&q->qi_dqlist_lock);
-                       mutex_lock(&dqp->q_hash->qh_lock);
-                       mutex_lock(&q->qi_dqlist_lock);
-
-                       /*
-                        * XXXTheoretically, we can get into a very long
-                        * ping pong game here.
-                        * No one can be adding dquots to the mplist at
-                        * this point, but somebody might be taking things off.
-                        */
-                       if (nrecl != q->qi_dqreclaims) {
-                               mutex_unlock(&dqp->q_hash->qh_lock);
-                               goto again;
-                       }
-               }
-
-               /*
-                * Take the dquot off the mplist and hashlist. It may remain on
-                * freelist in INACTIVE state.
-                */
-               nmisses += xfs_qm_dqpurge(dqp);
-       }
-       mutex_unlock(&q->qi_dqlist_lock);
-       return nmisses;
-}
-
-int
-xfs_qm_dqpurge_all(
-       xfs_mount_t     *mp,
-       uint            flags)
-{
-       int             ndquots;
-
-       /*
-        * Purge the dquot cache.
-        * None of the dquots should really be busy at this point.
-        */
-       if (mp->m_quotainfo) {
-               while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
-                       delay(ndquots * 10);
-               }
-       }
-       return 0;
-}
-
-STATIC int
-xfs_qm_dqattach_one(
-       xfs_inode_t     *ip,
-       xfs_dqid_t      id,
-       uint            type,
-       uint            doalloc,
-       xfs_dquot_t     *udqhint, /* hint */
-       xfs_dquot_t     **IO_idqpp)
-{
-       xfs_dquot_t     *dqp;
-       int             error;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       error = 0;
-
-       /*
-        * See if we already have it in the inode itself. IO_idqpp is
-        * &i_udquot or &i_gdquot. This made the code look weird, but
-        * made the logic a lot simpler.
-        */
-       dqp = *IO_idqpp;
-       if (dqp) {
-               trace_xfs_dqattach_found(dqp);
-               return 0;
-       }
-
-       /*
-        * udqhint is the i_udquot field in inode, and is non-NULL only
-        * when the type arg is group/project. Its purpose is to save a
-        * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
-        * the user dquot.
-        */
-       if (udqhint) {
-               ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
-               xfs_dqlock(udqhint);
-
-               /*
-                * No need to take dqlock to look at the id.
-                *
-                * The ID can't change until it gets reclaimed, and it won't
-                * be reclaimed as long as we have a ref from inode and we
-                * hold the ilock.
-                */
-               dqp = udqhint->q_gdquot;
-               if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
-                       xfs_dqlock(dqp);
-                       XFS_DQHOLD(dqp);
-                       ASSERT(*IO_idqpp == NULL);
-                       *IO_idqpp = dqp;
-
-                       xfs_dqunlock(dqp);
-                       xfs_dqunlock(udqhint);
-                       return 0;
-               }
-
-               /*
-                * We can't hold a dquot lock when we call the dqget code.
-                * We'll deadlock in no time, because of (not conforming to)
-                * lock ordering - the inodelock comes before any dquot lock,
-                * and we may drop and reacquire the ilock in xfs_qm_dqget().
-                */
-               xfs_dqunlock(udqhint);
-       }
-
-       /*
-        * Find the dquot from somewhere. This bumps the
-        * reference count of dquot and returns it locked.
-        * This can return ENOENT if dquot didn't exist on
-        * disk and we didn't ask it to allocate;
-        * ESRCH if quotas got turned off suddenly.
-        */
-       error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
-       if (error)
-               return error;
-
-       trace_xfs_dqattach_get(dqp);
-
-       /*
-        * dqget may have dropped and re-acquired the ilock, but it guarantees
-        * that the dquot returned is the one that should go in the inode.
-        */
-       *IO_idqpp = dqp;
-       xfs_dqunlock(dqp);
-       return 0;
-}
-
-
-/*
- * Given a udquot and gdquot, attach a ptr to the group dquot in the
- * udquot as a hint for future lookups. The idea sounds simple, but the
- * execution isn't, because the udquot might have a group dquot attached
- * already and getting rid of that gets us into lock ordering constraints.
- * The process is complicated more by the fact that the dquots may or may not
- * be locked on entry.
- */
-STATIC void
-xfs_qm_dqattach_grouphint(
-       xfs_dquot_t     *udq,
-       xfs_dquot_t     *gdq)
-{
-       xfs_dquot_t     *tmp;
-
-       xfs_dqlock(udq);
-
-       if ((tmp = udq->q_gdquot)) {
-               if (tmp == gdq) {
-                       xfs_dqunlock(udq);
-                       return;
-               }
-
-               udq->q_gdquot = NULL;
-               /*
-                * We can't keep any dqlocks when calling dqrele,
-                * because the freelist lock comes before dqlocks.
-                */
-               xfs_dqunlock(udq);
-               /*
-                * we took a hard reference once upon a time in dqget,
-                * so give it back when the udquot no longer points at it
-                * dqput() does the unlocking of the dquot.
-                */
-               xfs_qm_dqrele(tmp);
-
-               xfs_dqlock(udq);
-               xfs_dqlock(gdq);
-
-       } else {
-               ASSERT(XFS_DQ_IS_LOCKED(udq));
-               xfs_dqlock(gdq);
-       }
-
-       ASSERT(XFS_DQ_IS_LOCKED(udq));
-       ASSERT(XFS_DQ_IS_LOCKED(gdq));
-       /*
-        * Somebody could have attached a gdquot here,
-        * when we dropped the uqlock. If so, just do nothing.
-        */
-       if (udq->q_gdquot == NULL) {
-               XFS_DQHOLD(gdq);
-               udq->q_gdquot = gdq;
-       }
-
-       xfs_dqunlock(gdq);
-       xfs_dqunlock(udq);
-}
-
-
-/*
- * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
- * into account.
- * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
- * Inode may get unlocked and relocked in here, and the caller must deal with
- * the consequences.
- */
-int
-xfs_qm_dqattach_locked(
-       xfs_inode_t     *ip,
-       uint            flags)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       uint            nquotas = 0;
-       int             error = 0;
-
-       if (!XFS_IS_QUOTA_RUNNING(mp) ||
-           !XFS_IS_QUOTA_ON(mp) ||
-           !XFS_NOT_DQATTACHED(mp, ip) ||
-           ip->i_ino == mp->m_sb.sb_uquotino ||
-           ip->i_ino == mp->m_sb.sb_gquotino)
-               return 0;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-
-       if (XFS_IS_UQUOTA_ON(mp)) {
-               error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
-                                               flags & XFS_QMOPT_DQALLOC,
-                                               NULL, &ip->i_udquot);
-               if (error)
-                       goto done;
-               nquotas++;
-       }
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       if (XFS_IS_OQUOTA_ON(mp)) {
-               error = XFS_IS_GQUOTA_ON(mp) ?
-                       xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
-                                               flags & XFS_QMOPT_DQALLOC,
-                                               ip->i_udquot, &ip->i_gdquot) :
-                       xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
-                                               flags & XFS_QMOPT_DQALLOC,
-                                               ip->i_udquot, &ip->i_gdquot);
-               /*
-                * Don't worry about the udquot that we may have
-                * attached above. It'll get detached, if not already.
-                */
-               if (error)
-                       goto done;
-               nquotas++;
-       }
-
-       /*
-        * Attach this group quota to the user quota as a hint.
-        * This WON'T, in general, result in a thrash.
-        */
-       if (nquotas == 2) {
-               ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-               ASSERT(ip->i_udquot);
-               ASSERT(ip->i_gdquot);
-
-               /*
-                * We may or may not have the i_udquot locked at this point,
-                * but this check is OK since we don't depend on the i_gdquot to
-                * be accurate 100% all the time. It is just a hint, and this
-                * will succeed in general.
-                */
-               if (ip->i_udquot->q_gdquot == ip->i_gdquot)
-                       goto done;
-               /*
-                * Attach i_gdquot to the gdquot hint inside the i_udquot.
-                */
-               xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
-       }
-
- done:
-#ifdef DEBUG
-       if (!error) {
-               if (XFS_IS_UQUOTA_ON(mp))
-                       ASSERT(ip->i_udquot);
-               if (XFS_IS_OQUOTA_ON(mp))
-                       ASSERT(ip->i_gdquot);
-       }
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-#endif
-       return error;
-}
-
-int
-xfs_qm_dqattach(
-       struct xfs_inode        *ip,
-       uint                    flags)
-{
-       int                     error;
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       error = xfs_qm_dqattach_locked(ip, flags);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-       return error;
-}
-
-/*
- * Release dquots (and their references) if any.
- * The inode should be locked EXCL except when this's called by
- * xfs_ireclaim.
- */
-void
-xfs_qm_dqdetach(
-       xfs_inode_t     *ip)
-{
-       if (!(ip->i_udquot || ip->i_gdquot))
-               return;
-
-       trace_xfs_dquot_dqdetach(ip);
-
-       ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
-       ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
-       if (ip->i_udquot) {
-               xfs_qm_dqrele(ip->i_udquot);
-               ip->i_udquot = NULL;
-       }
-       if (ip->i_gdquot) {
-               xfs_qm_dqrele(ip->i_gdquot);
-               ip->i_gdquot = NULL;
-       }
-}
-
-int
-xfs_qm_sync(
-       struct xfs_mount        *mp,
-       int                     flags)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       int                     recl, restarts;
-       struct xfs_dquot        *dqp;
-       int                     error;
-
-       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
-               return 0;
-
-       restarts = 0;
-
-  again:
-       mutex_lock(&q->qi_dqlist_lock);
-       /*
-        * dqpurge_all() also takes the mplist lock and iterate thru all dquots
-        * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
-        * when we have the mplist lock, we know that dquots will be consistent
-        * as long as we have it locked.
-        */
-       if (!XFS_IS_QUOTA_ON(mp)) {
-               mutex_unlock(&q->qi_dqlist_lock);
-               return 0;
-       }
-       ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
-       list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
-               /*
-                * If this is vfs_sync calling, then skip the dquots that
-                * don't 'seem' to be dirty. ie. don't acquire dqlock.
-                * This is very similar to what xfs_sync does with inodes.
-                */
-               if (flags & SYNC_TRYLOCK) {
-                       if (!XFS_DQ_IS_DIRTY(dqp))
-                               continue;
-                       if (!xfs_qm_dqlock_nowait(dqp))
-                               continue;
-               } else {
-                       xfs_dqlock(dqp);
-               }
-
-               /*
-                * Now, find out for sure if this dquot is dirty or not.
-                */
-               if (! XFS_DQ_IS_DIRTY(dqp)) {
-                       xfs_dqunlock(dqp);
-                       continue;
-               }
-
-               /* XXX a sentinel would be better */
-               recl = q->qi_dqreclaims;
-               if (!xfs_dqflock_nowait(dqp)) {
-                       if (flags & SYNC_TRYLOCK) {
-                               xfs_dqunlock(dqp);
-                               continue;
-                       }
-                       /*
-                        * If we can't grab the flush lock then if the caller
-                        * really wanted us to give this our best shot, so
-                        * see if we can give a push to the buffer before we wait
-                        * on the flush lock. At this point, we know that
-                        * even though the dquot is being flushed,
-                        * it has (new) dirty data.
-                        */
-                       xfs_qm_dqflock_pushbuf_wait(dqp);
-               }
-               /*
-                * Let go of the mplist lock. We don't want to hold it
-                * across a disk write
-                */
-               mutex_unlock(&q->qi_dqlist_lock);
-               error = xfs_qm_dqflush(dqp, flags);
-               xfs_dqunlock(dqp);
-               if (error && XFS_FORCED_SHUTDOWN(mp))
-                       return 0;       /* Need to prevent umount failure */
-               else if (error)
-                       return error;
-
-               mutex_lock(&q->qi_dqlist_lock);
-               if (recl != q->qi_dqreclaims) {
-                       if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
-                               break;
-
-                       mutex_unlock(&q->qi_dqlist_lock);
-                       goto again;
-               }
-       }
-
-       mutex_unlock(&q->qi_dqlist_lock);
-       return 0;
-}
-
-/*
- * The hash chains and the mplist use the same xfs_dqhash structure as
- * their list head, but we can take the mplist qh_lock and one of the
- * hash qh_locks at the same time without any problem as they aren't
- * related.
- */
-static struct lock_class_key xfs_quota_mplist_class;
-
-/*
- * This initializes all the quota information that's kept in the
- * mount structure
- */
-STATIC int
-xfs_qm_init_quotainfo(
-       xfs_mount_t     *mp)
-{
-       xfs_quotainfo_t *qinf;
-       int             error;
-       xfs_dquot_t     *dqp;
-
-       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-       /*
-        * Tell XQM that we exist as soon as possible.
-        */
-       if ((error = xfs_qm_hold_quotafs_ref(mp))) {
-               return error;
-       }
-
-       qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
-
-       /*
-        * See if quotainodes are setup, and if not, allocate them,
-        * and change the superblock accordingly.
-        */
-       if ((error = xfs_qm_init_quotainos(mp))) {
-               kmem_free(qinf);
-               mp->m_quotainfo = NULL;
-               return error;
-       }
-
-       INIT_LIST_HEAD(&qinf->qi_dqlist);
-       mutex_init(&qinf->qi_dqlist_lock);
-       lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
-
-       qinf->qi_dqreclaims = 0;
-
-       /* mutex used to serialize quotaoffs */
-       mutex_init(&qinf->qi_quotaofflock);
-
-       /* Precalc some constants */
-       qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
-       ASSERT(qinf->qi_dqchunklen);
-       qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
-       do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
-
-       mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
-
-       /*
-        * We try to get the limits from the superuser's limits fields.
-        * This is quite hacky, but it is standard quota practice.
-        * We look at the USR dquot with id == 0 first, but if user quotas
-        * are not enabled we goto the GRP dquot with id == 0.
-        * We don't really care to keep separate default limits for user
-        * and group quotas, at least not at this point.
-        */
-       error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
-                            XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : 
-                            (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
-                               XFS_DQ_PROJ),
-                            XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
-                            &dqp);
-       if (! error) {
-               xfs_disk_dquot_t        *ddqp = &dqp->q_core;
-
-               /*
-                * The warnings and timers set the grace period given to
-                * a user or group before he or she can not perform any
-                * more writing. If it is zero, a default is used.
-                */
-               qinf->qi_btimelimit = ddqp->d_btimer ?
-                       be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
-               qinf->qi_itimelimit = ddqp->d_itimer ?
-                       be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
-               qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
-                       be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
-               qinf->qi_bwarnlimit = ddqp->d_bwarns ?
-                       be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
-               qinf->qi_iwarnlimit = ddqp->d_iwarns ?
-                       be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
-               qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
-                       be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
-               qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
-               qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
-               qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
-               qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
-               qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
-               qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
-               /*
-                * We sent the XFS_QMOPT_DQSUSER flag to dqget because
-                * we don't want this dquot cached. We haven't done a
-                * quotacheck yet, and quotacheck doesn't like incore dquots.
-                */
-               xfs_qm_dqdestroy(dqp);
-       } else {
-               qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
-               qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
-               qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
-               qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
-               qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
-               qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
-       }
-
-       return 0;
-}
-
-
-/*
- * Gets called when unmounting a filesystem or when all quotas get
- * turned off.
- * This purges the quota inodes, destroys locks and frees itself.
- */
-void
-xfs_qm_destroy_quotainfo(
-       xfs_mount_t     *mp)
-{
-       xfs_quotainfo_t *qi;
-
-       qi = mp->m_quotainfo;
-       ASSERT(qi != NULL);
-       ASSERT(xfs_Gqm != NULL);
-
-       /*
-        * Release the reference that XQM kept, so that we know
-        * when the XQM structure should be freed. We cannot assume
-        * that xfs_Gqm is non-null after this point.
-        */
-       xfs_qm_rele_quotafs_ref(mp);
-
-       ASSERT(list_empty(&qi->qi_dqlist));
-       mutex_destroy(&qi->qi_dqlist_lock);
-
-       if (qi->qi_uquotaip) {
-               IRELE(qi->qi_uquotaip);
-               qi->qi_uquotaip = NULL; /* paranoia */
-       }
-       if (qi->qi_gquotaip) {
-               IRELE(qi->qi_gquotaip);
-               qi->qi_gquotaip = NULL;
-       }
-       mutex_destroy(&qi->qi_quotaofflock);
-       kmem_free(qi);
-       mp->m_quotainfo = NULL;
-}
-
-
-
-/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
-
-/* ARGSUSED */
-STATIC void
-xfs_qm_list_init(
-       xfs_dqlist_t    *list,
-       char            *str,
-       int             n)
-{
-       mutex_init(&list->qh_lock);
-       INIT_LIST_HEAD(&list->qh_list);
-       list->qh_version = 0;
-       list->qh_nelems = 0;
-}
-
-STATIC void
-xfs_qm_list_destroy(
-       xfs_dqlist_t    *list)
-{
-       mutex_destroy(&(list->qh_lock));
-}
-
-/*
- * Create an inode and return with a reference already taken, but unlocked
- * This is how we create quota inodes
- */
-STATIC int
-xfs_qm_qino_alloc(
-       xfs_mount_t     *mp,
-       xfs_inode_t     **ip,
-       __int64_t       sbfields,
-       uint            flags)
-{
-       xfs_trans_t     *tp;
-       int             error;
-       int             committed;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
-       if ((error = xfs_trans_reserve(tp,
-                                     XFS_QM_QINOCREATE_SPACE_RES(mp),
-                                     XFS_CREATE_LOG_RES(mp), 0,
-                                     XFS_TRANS_PERM_LOG_RES,
-                                     XFS_CREATE_LOG_COUNT))) {
-               xfs_trans_cancel(tp, 0);
-               return error;
-       }
-
-       error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
-       if (error) {
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-                                XFS_TRANS_ABORT);
-               return error;
-       }
-
-       /*
-        * Make the changes in the superblock, and log those too.
-        * sbfields arg may contain fields other than *QUOTINO;
-        * VERSIONNUM for example.
-        */
-       spin_lock(&mp->m_sb_lock);
-       if (flags & XFS_QMOPT_SBVERSION) {
-               ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
-               ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                                  XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
-                      (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                       XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
-
-               xfs_sb_version_addquota(&mp->m_sb);
-               mp->m_sb.sb_uquotino = NULLFSINO;
-               mp->m_sb.sb_gquotino = NULLFSINO;
-
-               /* qflags will get updated _after_ quotacheck */
-               mp->m_sb.sb_qflags = 0;
-       }
-       if (flags & XFS_QMOPT_UQUOTA)
-               mp->m_sb.sb_uquotino = (*ip)->i_ino;
-       else
-               mp->m_sb.sb_gquotino = (*ip)->i_ino;
-       spin_unlock(&mp->m_sb_lock);
-       xfs_mod_sb(tp, sbfields);
-
-       if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
-               xfs_alert(mp, "%s failed (error %d)!", __func__, error);
-               return error;
-       }
-       return 0;
-}
-
-
-STATIC void
-xfs_qm_reset_dqcounts(
-       xfs_mount_t     *mp,
-       xfs_buf_t       *bp,
-       xfs_dqid_t      id,
-       uint            type)
-{
-       xfs_disk_dquot_t        *ddq;
-       int                     j;
-
-       trace_xfs_reset_dqcounts(bp, _RET_IP_);
-
-       /*
-        * Reset all counters and timers. They'll be
-        * started afresh by xfs_qm_quotacheck.
-        */
-#ifdef DEBUG
-       j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
-       do_div(j, sizeof(xfs_dqblk_t));
-       ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
-#endif
-       ddq = bp->b_addr;
-       for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
-               /*
-                * Do a sanity check, and if needed, repair the dqblk. Don't
-                * output any warnings because it's perfectly possible to
-                * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
-                */
-               (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
-                                     "xfs_quotacheck");
-               ddq->d_bcount = 0;
-               ddq->d_icount = 0;
-               ddq->d_rtbcount = 0;
-               ddq->d_btimer = 0;
-               ddq->d_itimer = 0;
-               ddq->d_rtbtimer = 0;
-               ddq->d_bwarns = 0;
-               ddq->d_iwarns = 0;
-               ddq->d_rtbwarns = 0;
-               ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
-       }
-}
-
-STATIC int
-xfs_qm_dqiter_bufs(
-       xfs_mount_t     *mp,
-       xfs_dqid_t      firstid,
-       xfs_fsblock_t   bno,
-       xfs_filblks_t   blkcnt,
-       uint            flags)
-{
-       xfs_buf_t       *bp;
-       int             error;
-       int             type;
-
-       ASSERT(blkcnt > 0);
-       type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
-               (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
-       error = 0;
-
-       /*
-        * Blkcnt arg can be a very big number, and might even be
-        * larger than the log itself. So, we have to break it up into
-        * manageable-sized transactions.
-        * Note that we don't start a permanent transaction here; we might
-        * not be able to get a log reservation for the whole thing up front,
-        * and we don't really care to either, because we just discard
-        * everything if we were to crash in the middle of this loop.
-        */
-       while (blkcnt--) {
-               error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-                             XFS_FSB_TO_DADDR(mp, bno),
-                             mp->m_quotainfo->qi_dqchunklen, 0, &bp);
-               if (error)
-                       break;
-
-               xfs_qm_reset_dqcounts(mp, bp, firstid, type);
-               xfs_bdwrite(mp, bp);
-               /*
-                * goto the next block.
-                */
-               bno++;
-               firstid += mp->m_quotainfo->qi_dqperchunk;
-       }
-       return error;
-}
-
-/*
- * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
- * caller supplied function for every chunk of dquots that we find.
- */
-STATIC int
-xfs_qm_dqiterate(
-       xfs_mount_t     *mp,
-       xfs_inode_t     *qip,
-       uint            flags)
-{
-       xfs_bmbt_irec_t         *map;
-       int                     i, nmaps;       /* number of map entries */
-       int                     error;          /* return value */
-       xfs_fileoff_t           lblkno;
-       xfs_filblks_t           maxlblkcnt;
-       xfs_dqid_t              firstid;
-       xfs_fsblock_t           rablkno;
-       xfs_filblks_t           rablkcnt;
-
-       error = 0;
-       /*
-        * This looks racy, but we can't keep an inode lock across a
-        * trans_reserve. But, this gets called during quotacheck, and that
-        * happens only at mount time which is single threaded.
-        */
-       if (qip->i_d.di_nblocks == 0)
-               return 0;
-
-       map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
-
-       lblkno = 0;
-       maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
-       do {
-               nmaps = XFS_DQITER_MAP_SIZE;
-               /*
-                * We aren't changing the inode itself. Just changing
-                * some of its data. No new blocks are added here, and
-                * the inode is never added to the transaction.
-                */
-               xfs_ilock(qip, XFS_ILOCK_SHARED);
-               error = xfs_bmapi(NULL, qip, lblkno,
-                                 maxlblkcnt - lblkno,
-                                 XFS_BMAPI_METADATA,
-                                 NULL,
-                                 0, map, &nmaps, NULL);
-               xfs_iunlock(qip, XFS_ILOCK_SHARED);
-               if (error)
-                       break;
-
-               ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
-               for (i = 0; i < nmaps; i++) {
-                       ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
-                       ASSERT(map[i].br_blockcount);
-
-
-                       lblkno += map[i].br_blockcount;
-
-                       if (map[i].br_startblock == HOLESTARTBLOCK)
-                               continue;
-
-                       firstid = (xfs_dqid_t) map[i].br_startoff *
-                               mp->m_quotainfo->qi_dqperchunk;
-                       /*
-                        * Do a read-ahead on the next extent.
-                        */
-                       if ((i+1 < nmaps) &&
-                           (map[i+1].br_startblock != HOLESTARTBLOCK)) {
-                               rablkcnt =  map[i+1].br_blockcount;
-                               rablkno = map[i+1].br_startblock;
-                               while (rablkcnt--) {
-                                       xfs_buf_readahead(mp->m_ddev_targp,
-                                              XFS_FSB_TO_DADDR(mp, rablkno),
-                                              mp->m_quotainfo->qi_dqchunklen);
-                                       rablkno++;
-                               }
-                       }
-                       /*
-                        * Iterate thru all the blks in the extent and
-                        * reset the counters of all the dquots inside them.
-                        */
-                       if ((error = xfs_qm_dqiter_bufs(mp,
-                                                      firstid,
-                                                      map[i].br_startblock,
-                                                      map[i].br_blockcount,
-                                                      flags))) {
-                               break;
-                       }
-               }
-
-               if (error)
-                       break;
-       } while (nmaps > 0);
-
-       kmem_free(map);
-
-       return error;
-}
-
-/*
- * Called by dqusage_adjust in doing a quotacheck.
- *
- * Given the inode, and a dquot id this updates both the incore dqout as well
- * as the buffer copy. This is so that once the quotacheck is done, we can
- * just log all the buffers, as opposed to logging numerous updates to
- * individual dquots.
- */
-STATIC int
-xfs_qm_quotacheck_dqadjust(
-       struct xfs_inode        *ip,
-       xfs_dqid_t              id,
-       uint                    type,
-       xfs_qcnt_t              nblks,
-       xfs_qcnt_t              rtblks)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_dquot        *dqp;
-       int                     error;
-
-       error = xfs_qm_dqget(mp, ip, id, type,
-                            XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
-       if (error) {
-               /*
-                * Shouldn't be able to turn off quotas here.
-                */
-               ASSERT(error != ESRCH);
-               ASSERT(error != ENOENT);
-               return error;
-       }
-
-       trace_xfs_dqadjust(dqp);
-
-       /*
-        * Adjust the inode count and the block count to reflect this inode's
-        * resource usage.
-        */
-       be64_add_cpu(&dqp->q_core.d_icount, 1);
-       dqp->q_res_icount++;
-       if (nblks) {
-               be64_add_cpu(&dqp->q_core.d_bcount, nblks);
-               dqp->q_res_bcount += nblks;
-       }
-       if (rtblks) {
-               be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
-               dqp->q_res_rtbcount += rtblks;
-       }
-
-       /*
-        * Set default limits, adjust timers (since we changed usages)
-        *
-        * There are no timers for the default values set in the root dquot.
-        */
-       if (dqp->q_core.d_id) {
-               xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
-               xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
-       }
-
-       dqp->dq_flags |= XFS_DQ_DIRTY;
-       xfs_qm_dqput(dqp);
-       return 0;
-}
-
-STATIC int
-xfs_qm_get_rtblks(
-       xfs_inode_t     *ip,
-       xfs_qcnt_t      *O_rtblks)
-{
-       xfs_filblks_t   rtblks;                 /* total rt blks */
-       xfs_extnum_t    idx;                    /* extent record index */
-       xfs_ifork_t     *ifp;                   /* inode fork pointer */
-       xfs_extnum_t    nextents;               /* number of extent entries */
-       int             error;
-
-       ASSERT(XFS_IS_REALTIME_INODE(ip));
-       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
-               if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
-                       return error;
-       }
-       rtblks = 0;
-       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
-       for (idx = 0; idx < nextents; idx++)
-               rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
-       *O_rtblks = (xfs_qcnt_t)rtblks;
-       return 0;
-}
-
-/*
- * callback routine supplied to bulkstat(). Given an inumber, find its
- * dquots and update them to account for resources taken by that inode.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_dqusage_adjust(
-       xfs_mount_t     *mp,            /* mount point for filesystem */
-       xfs_ino_t       ino,            /* inode number to get data for */
-       void            __user *buffer, /* not used */
-       int             ubsize,         /* not used */
-       int             *ubused,        /* not used */
-       int             *res)           /* result code value */
-{
-       xfs_inode_t     *ip;
-       xfs_qcnt_t      nblks, rtblks = 0;
-       int             error;
-
-       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-       /*
-        * rootino must have its resources accounted for, not so with the quota
-        * inodes.
-        */
-       if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
-               *res = BULKSTAT_RV_NOTHING;
-               return XFS_ERROR(EINVAL);
-       }
-
-       /*
-        * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
-        * interface expects the inode to be exclusively locked because that's
-        * the case in all other instances. It's OK that we do this because
-        * quotacheck is done only at mount time.
-        */
-       error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
-       if (error) {
-               *res = BULKSTAT_RV_NOTHING;
-               return error;
-       }
-
-       ASSERT(ip->i_delayed_blks == 0);
-
-       if (XFS_IS_REALTIME_INODE(ip)) {
-               /*
-                * Walk thru the extent list and count the realtime blocks.
-                */
-               error = xfs_qm_get_rtblks(ip, &rtblks);
-               if (error)
-                       goto error0;
-       }
-
-       nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
-
-       /*
-        * Add the (disk blocks and inode) resources occupied by this
-        * inode to its dquots. We do this adjustment in the incore dquot,
-        * and also copy the changes to its buffer.
-        * We don't care about putting these changes in a transaction
-        * envelope because if we crash in the middle of a 'quotacheck'
-        * we have to start from the beginning anyway.
-        * Once we're done, we'll log all the dquot bufs.
-        *
-        * The *QUOTA_ON checks below may look pretty racy, but quotachecks
-        * and quotaoffs don't race. (Quotachecks happen at mount time only).
-        */
-       if (XFS_IS_UQUOTA_ON(mp)) {
-               error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
-                                                  XFS_DQ_USER, nblks, rtblks);
-               if (error)
-                       goto error0;
-       }
-
-       if (XFS_IS_GQUOTA_ON(mp)) {
-               error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
-                                                  XFS_DQ_GROUP, nblks, rtblks);
-               if (error)
-                       goto error0;
-       }
-
-       if (XFS_IS_PQUOTA_ON(mp)) {
-               error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
-                                                  XFS_DQ_PROJ, nblks, rtblks);
-               if (error)
-                       goto error0;
-       }
-
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       IRELE(ip);
-       *res = BULKSTAT_RV_DIDONE;
-       return 0;
-
-error0:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       IRELE(ip);
-       *res = BULKSTAT_RV_GIVEUP;
-       return error;
-}
-
-/*
- * Walk thru all the filesystem inodes and construct a consistent view
- * of the disk quota world. If the quotacheck fails, disable quotas.
- */
-int
-xfs_qm_quotacheck(
-       xfs_mount_t     *mp)
-{
-       int             done, count, error;
-       xfs_ino_t       lastino;
-       size_t          structsz;
-       xfs_inode_t     *uip, *gip;
-       uint            flags;
-
-       count = INT_MAX;
-       structsz = 1;
-       lastino = 0;
-       flags = 0;
-
-       ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
-       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-       /*
-        * There should be no cached dquots. The (simplistic) quotacheck
-        * algorithm doesn't like that.
-        */
-       ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
-
-       xfs_notice(mp, "Quotacheck needed: Please wait.");
-
-       /*
-        * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
-        * their counters to zero. We need a clean slate.
-        * We don't log our changes till later.
-        */
-       uip = mp->m_quotainfo->qi_uquotaip;
-       if (uip) {
-               error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
-               if (error)
-                       goto error_return;
-               flags |= XFS_UQUOTA_CHKD;
-       }
-
-       gip = mp->m_quotainfo->qi_gquotaip;
-       if (gip) {
-               error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
-                                       XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
-               if (error)
-                       goto error_return;
-               flags |= XFS_OQUOTA_CHKD;
-       }
-
-       do {
-               /*
-                * Iterate thru all the inodes in the file system,
-                * adjusting the corresponding dquot counters in core.
-                */
-               error = xfs_bulkstat(mp, &lastino, &count,
-                                    xfs_qm_dqusage_adjust,
-                                    structsz, NULL, &done);
-               if (error)
-                       break;
-
-       } while (!done);
-
-       /*
-        * We've made all the changes that we need to make incore.
-        * Flush them down to disk buffers if everything was updated
-        * successfully.
-        */
-       if (!error)
-               error = xfs_qm_dqflush_all(mp, 0);
-
-       /*
-        * We can get this error if we couldn't do a dquot allocation inside
-        * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
-        * dirty dquots that might be cached, we just want to get rid of them
-        * and turn quotaoff. The dquots won't be attached to any of the inodes
-        * at this point (because we intentionally didn't in dqget_noattach).
-        */
-       if (error) {
-               xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
-               goto error_return;
-       }
-
-       /*
-        * We didn't log anything, because if we crashed, we'll have to
-        * start the quotacheck from scratch anyway. However, we must make
-        * sure that our dquot changes are secure before we put the
-        * quotacheck'd stamp on the superblock. So, here we do a synchronous
-        * flush.
-        */
-       XFS_bflush(mp->m_ddev_targp);
-
-       /*
-        * If one type of quotas is off, then it will lose its
-        * quotachecked status, since we won't be doing accounting for
-        * that type anymore.
-        */
-       mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
-       mp->m_qflags |= flags;
-
- error_return:
-       if (error) {
-               xfs_warn(mp,
-       "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
-                       error);
-               /*
-                * We must turn off quotas.
-                */
-               ASSERT(mp->m_quotainfo != NULL);
-               ASSERT(xfs_Gqm != NULL);
-               xfs_qm_destroy_quotainfo(mp);
-               if (xfs_mount_reset_sbqflags(mp)) {
-                       xfs_warn(mp,
-                               "Quotacheck: Failed to reset quota flags.");
-               }
-       } else
-               xfs_notice(mp, "Quotacheck: Done.");
-       return (error);
-}
-
-/*
- * This is called after the superblock has been read in and we're ready to
- * iget the quota inodes.
- */
-STATIC int
-xfs_qm_init_quotainos(
-       xfs_mount_t     *mp)
-{
-       xfs_inode_t     *uip, *gip;
-       int             error;
-       __int64_t       sbflags;
-       uint            flags;
-
-       ASSERT(mp->m_quotainfo);
-       uip = gip = NULL;
-       sbflags = 0;
-       flags = 0;
-
-       /*
-        * Get the uquota and gquota inodes
-        */
-       if (xfs_sb_version_hasquota(&mp->m_sb)) {
-               if (XFS_IS_UQUOTA_ON(mp) &&
-                   mp->m_sb.sb_uquotino != NULLFSINO) {
-                       ASSERT(mp->m_sb.sb_uquotino > 0);
-                       if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
-                                            0, 0, &uip)))
-                               return XFS_ERROR(error);
-               }
-               if (XFS_IS_OQUOTA_ON(mp) &&
-                   mp->m_sb.sb_gquotino != NULLFSINO) {
-                       ASSERT(mp->m_sb.sb_gquotino > 0);
-                       if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
-                                            0, 0, &gip))) {
-                               if (uip)
-                                       IRELE(uip);
-                               return XFS_ERROR(error);
-                       }
-               }
-       } else {
-               flags |= XFS_QMOPT_SBVERSION;
-               sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
-                           XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
-       }
-
-       /*
-        * Create the two inodes, if they don't exist already. The changes
-        * made above will get added to a transaction and logged in one of
-        * the qino_alloc calls below.  If the device is readonly,
-        * temporarily switch to read-write to do this.
-        */
-       if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
-               if ((error = xfs_qm_qino_alloc(mp, &uip,
-                                             sbflags | XFS_SB_UQUOTINO,
-                                             flags | XFS_QMOPT_UQUOTA)))
-                       return XFS_ERROR(error);
-
-               flags &= ~XFS_QMOPT_SBVERSION;
-       }
-       if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) {
-               flags |= (XFS_IS_GQUOTA_ON(mp) ?
-                               XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
-               error = xfs_qm_qino_alloc(mp, &gip,
-                                         sbflags | XFS_SB_GQUOTINO, flags);
-               if (error) {
-                       if (uip)
-                               IRELE(uip);
-
-                       return XFS_ERROR(error);
-               }
-       }
-
-       mp->m_quotainfo->qi_uquotaip = uip;
-       mp->m_quotainfo->qi_gquotaip = gip;
-
-       return 0;
-}
-
-
-
-/*
- * Just pop the least recently used dquot off the freelist and
- * recycle it. The returned dquot is locked.
- */
-STATIC xfs_dquot_t *
-xfs_qm_dqreclaim_one(void)
-{
-       xfs_dquot_t     *dqpout;
-       xfs_dquot_t     *dqp;
-       int             restarts;
-       int             startagain;
-
-       restarts = 0;
-       dqpout = NULL;
-
-       /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
-again:
-       startagain = 0;
-       mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
-
-       list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
-               struct xfs_mount *mp = dqp->q_mount;
-               xfs_dqlock(dqp);
-
-               /*
-                * We are racing with dqlookup here. Naturally we don't
-                * want to reclaim a dquot that lookup wants. We release the
-                * freelist lock and start over, so that lookup will grab
-                * both the dquot and the freelistlock.
-                */
-               if (dqp->dq_flags & XFS_DQ_WANT) {
-                       ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
-
-                       trace_xfs_dqreclaim_want(dqp);
-                       XQM_STATS_INC(xqmstats.xs_qm_dqwants);
-                       restarts++;
-                       startagain = 1;
-                       goto dqunlock;
-               }
-
-               /*
-                * If the dquot is inactive, we are assured that it is
-                * not on the mplist or the hashlist, and that makes our
-                * life easier.
-                */
-               if (dqp->dq_flags & XFS_DQ_INACTIVE) {
-                       ASSERT(mp == NULL);
-                       ASSERT(! XFS_DQ_IS_DIRTY(dqp));
-                       ASSERT(list_empty(&dqp->q_hashlist));
-                       ASSERT(list_empty(&dqp->q_mplist));
-                       list_del_init(&dqp->q_freelist);
-                       xfs_Gqm->qm_dqfrlist_cnt--;
-                       dqpout = dqp;
-                       XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
-                       goto dqunlock;
-               }
-
-               ASSERT(dqp->q_hash);
-               ASSERT(!list_empty(&dqp->q_mplist));
-
-               /*
-                * Try to grab the flush lock. If this dquot is in the process
-                * of getting flushed to disk, we don't want to reclaim it.
-                */
-               if (!xfs_dqflock_nowait(dqp))
-                       goto dqunlock;
-
-               /*
-                * We have the flush lock so we know that this is not in the
-                * process of being flushed. So, if this is dirty, flush it
-                * DELWRI so that we don't get a freelist infested with
-                * dirty dquots.
-                */
-               if (XFS_DQ_IS_DIRTY(dqp)) {
-                       int     error;
-
-                       trace_xfs_dqreclaim_dirty(dqp);
-
-                       /*
-                        * We flush it delayed write, so don't bother
-                        * releasing the freelist lock.
-                        */
-                       error = xfs_qm_dqflush(dqp, 0);
-                       if (error) {
-                               xfs_warn(mp, "%s: dquot %p flush failed",
-                                       __func__, dqp);
-                       }
-                       goto dqunlock;
-               }
-
-               /*
-                * We're trying to get the hashlock out of order. This races
-                * with dqlookup; so, we giveup and goto the next dquot if
-                * we couldn't get the hashlock. This way, we won't starve
-                * a dqlookup process that holds the hashlock that is
-                * waiting for the freelist lock.
-                */
-               if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
-                       restarts++;
-                       goto dqfunlock;
-               }
-
-               /*
-                * This races with dquot allocation code as well as dqflush_all
-                * and reclaim code. So, if we failed to grab the mplist lock,
-                * giveup everything and start over.
-                */
-               if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
-                       restarts++;
-                       startagain = 1;
-                       goto qhunlock;
-               }
-
-               ASSERT(dqp->q_nrefs == 0);
-               list_del_init(&dqp->q_mplist);
-               mp->m_quotainfo->qi_dquots--;
-               mp->m_quotainfo->qi_dqreclaims++;
-               list_del_init(&dqp->q_hashlist);
-               dqp->q_hash->qh_version++;
-               list_del_init(&dqp->q_freelist);
-               xfs_Gqm->qm_dqfrlist_cnt--;
-               dqpout = dqp;
-               mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
-qhunlock:
-               mutex_unlock(&dqp->q_hash->qh_lock);
-dqfunlock:
-               xfs_dqfunlock(dqp);
-dqunlock:
-               xfs_dqunlock(dqp);
-               if (dqpout)
-                       break;
-               if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
-                       break;
-               if (startagain) {
-                       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-                       goto again;
-               }
-       }
-       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
-       return dqpout;
-}
-
-/*
- * Traverse the freelist of dquots and attempt to reclaim a maximum of
- * 'howmany' dquots. This operation races with dqlookup(), and attempts to
- * favor the lookup function ...
- */
-STATIC int
-xfs_qm_shake_freelist(
-       int     howmany)
-{
-       int             nreclaimed = 0;
-       xfs_dquot_t     *dqp;
-
-       if (howmany <= 0)
-               return 0;
-
-       while (nreclaimed < howmany) {
-               dqp = xfs_qm_dqreclaim_one();
-               if (!dqp)
-                       return nreclaimed;
-               xfs_qm_dqdestroy(dqp);
-               nreclaimed++;
-       }
-       return nreclaimed;
-}
-
-/*
- * The kmem_shake interface is invoked when memory is running low.
- */
-/* ARGSUSED */
-STATIC int
-xfs_qm_shake(
-       struct shrinker *shrink,
-       struct shrink_control *sc)
-{
-       int     ndqused, nfree, n;
-       gfp_t gfp_mask = sc->gfp_mask;
-
-       if (!kmem_shake_allow(gfp_mask))
-               return 0;
-       if (!xfs_Gqm)
-               return 0;
-
-       nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
-       /* incore dquots in all f/s's */
-       ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
-
-       ASSERT(ndqused >= 0);
-
-       if (nfree <= ndqused && nfree < ndquot)
-               return 0;
-
-       ndqused *= xfs_Gqm->qm_dqfree_ratio;    /* target # of free dquots */
-       n = nfree - ndqused - ndquot;           /* # over target */
-
-       return xfs_qm_shake_freelist(MAX(nfree, n));
-}
-
-
-/*------------------------------------------------------------------*/
-
-/*
- * Return a new incore dquot. Depending on the number of
- * dquots in the system, we either allocate a new one on the kernel heap,
- * or reclaim a free one.
- * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
- * to reclaim an existing one from the freelist.
- */
-boolean_t
-xfs_qm_dqalloc_incore(
-       xfs_dquot_t **O_dqpp)
-{
-       xfs_dquot_t     *dqp;
-
-       /*
-        * Check against high water mark to see if we want to pop
-        * a nincompoop dquot off the freelist.
-        */
-       if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
-               /*
-                * Try to recycle a dquot from the freelist.
-                */
-               if ((dqp = xfs_qm_dqreclaim_one())) {
-                       XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
-                       /*
-                        * Just zero the core here. The rest will get
-                        * reinitialized by caller. XXX we shouldn't even
-                        * do this zero ...
-                        */
-                       memset(&dqp->q_core, 0, sizeof(dqp->q_core));
-                       *O_dqpp = dqp;
-                       return B_FALSE;
-               }
-               XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
-       }
-
-       /*
-        * Allocate a brand new dquot on the kernel heap and return it
-        * to the caller to initialize.
-        */
-       ASSERT(xfs_Gqm->qm_dqzone != NULL);
-       *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
-       atomic_inc(&xfs_Gqm->qm_totaldquots);
-
-       return B_TRUE;
-}
-
-
-/*
- * Start a transaction and write the incore superblock changes to
- * disk. flags parameter indicates which fields have changed.
- */
-int
-xfs_qm_write_sb_changes(
-       xfs_mount_t     *mp,
-       __int64_t       flags)
-{
-       xfs_trans_t     *tp;
-       int             error;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
-       if ((error = xfs_trans_reserve(tp, 0,
-                                     mp->m_sb.sb_sectsize + 128, 0,
-                                     0,
-                                     XFS_DEFAULT_LOG_COUNT))) {
-               xfs_trans_cancel(tp, 0);
-               return error;
-       }
-
-       xfs_mod_sb(tp, flags);
-       error = xfs_trans_commit(tp, 0);
-
-       return error;
-}
-
-
-/* --------------- utility functions for vnodeops ---------------- */
-
-
-/*
- * Given an inode, a uid, gid and prid make sure that we have
- * allocated relevant dquot(s) on disk, and that we won't exceed inode
- * quotas by creating this file.
- * This also attaches dquot(s) to the given inode after locking it,
- * and returns the dquots corresponding to the uid and/or gid.
- *
- * in  : inode (unlocked)
- * out : udquot, gdquot with references taken and unlocked
- */
-int
-xfs_qm_vop_dqalloc(
-       struct xfs_inode        *ip,
-       uid_t                   uid,
-       gid_t                   gid,
-       prid_t                  prid,
-       uint                    flags,
-       struct xfs_dquot        **O_udqpp,
-       struct xfs_dquot        **O_gdqpp)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-       struct xfs_dquot        *uq, *gq;
-       int                     error;
-       uint                    lockflags;
-
-       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
-               return 0;
-
-       lockflags = XFS_ILOCK_EXCL;
-       xfs_ilock(ip, lockflags);
-
-       if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
-               gid = ip->i_d.di_gid;
-
-       /*
-        * Attach the dquot(s) to this inode, doing a dquot allocation
-        * if necessary. The dquot(s) will not be locked.
-        */
-       if (XFS_NOT_DQATTACHED(mp, ip)) {
-               error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
-               if (error) {
-                       xfs_iunlock(ip, lockflags);
-                       return error;
-               }
-       }
-
-       uq = gq = NULL;
-       if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
-               if (ip->i_d.di_uid != uid) {
-                       /*
-                        * What we need is the dquot that has this uid, and
-                        * if we send the inode to dqget, the uid of the inode
-                        * takes priority over what's sent in the uid argument.
-                        * We must unlock inode here before calling dqget if
-                        * we're not sending the inode, because otherwise
-                        * we'll deadlock by doing trans_reserve while
-                        * holding ilock.
-                        */
-                       xfs_iunlock(ip, lockflags);
-                       if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
-                                                XFS_DQ_USER,
-                                                XFS_QMOPT_DQALLOC |
-                                                XFS_QMOPT_DOWARN,
-                                                &uq))) {
-                               ASSERT(error != ENOENT);
-                               return error;
-                       }
-                       /*
-                        * Get the ilock in the right order.
-                        */
-                       xfs_dqunlock(uq);
-                       lockflags = XFS_ILOCK_SHARED;
-                       xfs_ilock(ip, lockflags);
-               } else {
-                       /*
-                        * Take an extra reference, because we'll return
-                        * this to caller
-                        */
-                       ASSERT(ip->i_udquot);
-                       uq = ip->i_udquot;
-                       xfs_dqlock(uq);
-                       XFS_DQHOLD(uq);
-                       xfs_dqunlock(uq);
-               }
-       }
-       if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
-               if (ip->i_d.di_gid != gid) {
-                       xfs_iunlock(ip, lockflags);
-                       if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
-                                                XFS_DQ_GROUP,
-                                                XFS_QMOPT_DQALLOC |
-                                                XFS_QMOPT_DOWARN,
-                                                &gq))) {
-                               if (uq)
-                                       xfs_qm_dqrele(uq);
-                               ASSERT(error != ENOENT);
-                               return error;
-                       }
-                       xfs_dqunlock(gq);
-                       lockflags = XFS_ILOCK_SHARED;
-                       xfs_ilock(ip, lockflags);
-               } else {
-                       ASSERT(ip->i_gdquot);
-                       gq = ip->i_gdquot;
-                       xfs_dqlock(gq);
-                       XFS_DQHOLD(gq);
-                       xfs_dqunlock(gq);
-               }
-       } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
-               if (xfs_get_projid(ip) != prid) {
-                       xfs_iunlock(ip, lockflags);
-                       if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
-                                                XFS_DQ_PROJ,
-                                                XFS_QMOPT_DQALLOC |
-                                                XFS_QMOPT_DOWARN,
-                                                &gq))) {
-                               if (uq)
-                                       xfs_qm_dqrele(uq);
-                               ASSERT(error != ENOENT);
-                               return (error);
-                       }
-                       xfs_dqunlock(gq);
-                       lockflags = XFS_ILOCK_SHARED;
-                       xfs_ilock(ip, lockflags);
-               } else {
-                       ASSERT(ip->i_gdquot);
-                       gq = ip->i_gdquot;
-                       xfs_dqlock(gq);
-                       XFS_DQHOLD(gq);
-                       xfs_dqunlock(gq);
-               }
-       }
-       if (uq)
-               trace_xfs_dquot_dqalloc(ip);
-
-       xfs_iunlock(ip, lockflags);
-       if (O_udqpp)
-               *O_udqpp = uq;
-       else if (uq)
-               xfs_qm_dqrele(uq);
-       if (O_gdqpp)
-               *O_gdqpp = gq;
-       else if (gq)
-               xfs_qm_dqrele(gq);
-       return 0;
-}
-
-/*
- * Actually transfer ownership, and do dquot modifications.
- * These were already reserved.
- */
-xfs_dquot_t *
-xfs_qm_vop_chown(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       xfs_dquot_t     **IO_olddq,
-       xfs_dquot_t     *newdq)
-{
-       xfs_dquot_t     *prevdq;
-       uint            bfield = XFS_IS_REALTIME_INODE(ip) ?
-                                XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
-
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
-
-       /* old dquot */
-       prevdq = *IO_olddq;
-       ASSERT(prevdq);
-       ASSERT(prevdq != newdq);
-
-       xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks));
-       xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
-
-       /* the sparkling new dquot */
-       xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks);
-       xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
-
-       /*
-        * Take an extra reference, because the inode
-        * is going to keep this dquot pointer even
-        * after the trans_commit.
-        */
-       xfs_dqlock(newdq);
-       XFS_DQHOLD(newdq);
-       xfs_dqunlock(newdq);
-       *IO_olddq = newdq;
-
-       return prevdq;
-}
-
-/*
- * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
- */
-int
-xfs_qm_vop_chown_reserve(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       xfs_dquot_t     *udqp,
-       xfs_dquot_t     *gdqp,
-       uint            flags)
-{
-       xfs_mount_t     *mp = ip->i_mount;
-       uint            delblks, blkflags, prjflags = 0;
-       xfs_dquot_t     *unresudq, *unresgdq, *delblksudq, *delblksgdq;
-       int             error;
-
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-       delblks = ip->i_delayed_blks;
-       delblksudq = delblksgdq = unresudq = unresgdq = NULL;
-       blkflags = XFS_IS_REALTIME_INODE(ip) ?
-                       XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
-
-       if (XFS_IS_UQUOTA_ON(mp) && udqp &&
-           ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
-               delblksudq = udqp;
-               /*
-                * If there are delayed allocation blocks, then we have to
-                * unreserve those from the old dquot, and add them to the
-                * new dquot.
-                */
-               if (delblks) {
-                       ASSERT(ip->i_udquot);
-                       unresudq = ip->i_udquot;
-               }
-       }
-       if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
-               if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
-                    xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
-                       prjflags = XFS_QMOPT_ENOSPC;
-
-               if (prjflags ||
-                   (XFS_IS_GQUOTA_ON(ip->i_mount) &&
-                    ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
-                       delblksgdq = gdqp;
-                       if (delblks) {
-                               ASSERT(ip->i_gdquot);
-                               unresgdq = ip->i_gdquot;
-                       }
-               }
-       }
-
-       if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
-                               delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
-                               flags | blkflags | prjflags)))
-               return (error);
-
-       /*
-        * Do the delayed blks reservations/unreservations now. Since, these
-        * are done without the help of a transaction, if a reservation fails
-        * its previous reservations won't be automatically undone by trans
-        * code. So, we have to do it manually here.
-        */
-       if (delblks) {
-               /*
-                * Do the reservations first. Unreservation can't fail.
-                */
-               ASSERT(delblksudq || delblksgdq);
-               ASSERT(unresudq || unresgdq);
-               if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
-                               delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
-                               flags | blkflags | prjflags)))
-                       return (error);
-               xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
-                               unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
-                               blkflags);
-       }
-
-       return (0);
-}
-
-int
-xfs_qm_vop_rename_dqattach(
-       struct xfs_inode        **i_tab)
-{
-       struct xfs_mount        *mp = i_tab[0]->i_mount;
-       int                     i;
-
-       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
-               return 0;
-
-       for (i = 0; (i < 4 && i_tab[i]); i++) {
-               struct xfs_inode        *ip = i_tab[i];
-               int                     error;
-
-               /*
-                * Watch out for duplicate entries in the table.
-                */
-               if (i == 0 || ip != i_tab[i-1]) {
-                       if (XFS_NOT_DQATTACHED(mp, ip)) {
-                               error = xfs_qm_dqattach(ip, 0);
-                               if (error)
-                                       return error;
-                       }
-               }
-       }
-       return 0;
-}
-
-void
-xfs_qm_vop_create_dqattach(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *ip,
-       struct xfs_dquot        *udqp,
-       struct xfs_dquot        *gdqp)
-{
-       struct xfs_mount        *mp = tp->t_mountp;
-
-       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
-               return;
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
-
-       if (udqp) {
-               xfs_dqlock(udqp);
-               XFS_DQHOLD(udqp);
-               xfs_dqunlock(udqp);
-               ASSERT(ip->i_udquot == NULL);
-               ip->i_udquot = udqp;
-               ASSERT(XFS_IS_UQUOTA_ON(mp));
-               ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
-               xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
-       }
-       if (gdqp) {
-               xfs_dqlock(gdqp);
-               XFS_DQHOLD(gdqp);
-               xfs_dqunlock(gdqp);
-               ASSERT(ip->i_gdquot == NULL);
-               ip->i_gdquot = gdqp;
-               ASSERT(XFS_IS_OQUOTA_ON(mp));
-               ASSERT((XFS_IS_GQUOTA_ON(mp) ?
-                       ip->i_d.di_gid : xfs_get_projid(ip)) ==
-                               be32_to_cpu(gdqp->q_core.d_id));
-               xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
-       }
-}
-
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
deleted file mode 100644 (file)
index 43b9abe..0000000
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QM_H__
-#define __XFS_QM_H__
-
-#include "xfs_dquot_item.h"
-#include "xfs_dquot.h"
-#include "xfs_quota_priv.h"
-#include "xfs_qm_stats.h"
-
-struct xfs_qm;
-struct xfs_inode;
-
-extern uint            ndquot;
-extern struct mutex    xfs_Gqm_lock;
-extern struct xfs_qm   *xfs_Gqm;
-extern kmem_zone_t     *qm_dqzone;
-extern kmem_zone_t     *qm_dqtrxzone;
-
-/*
- * Used in xfs_qm_sync called by xfs_sync to count the max times that it can
- * iterate over the mountpt's dquot list in one call.
- */
-#define XFS_QM_SYNC_MAX_RESTARTS       7
-
-/*
- * Ditto, for xfs_qm_dqreclaim_one.
- */
-#define XFS_QM_RECLAIM_MAX_RESTARTS    4
-
-/*
- * Ideal ratio of free to in use dquots. Quota manager makes an attempt
- * to keep this balance.
- */
-#define XFS_QM_DQFREE_RATIO            2
-
-/*
- * Dquot hashtable constants/threshold values.
- */
-#define XFS_QM_HASHSIZE_LOW            (PAGE_SIZE / sizeof(xfs_dqhash_t))
-#define XFS_QM_HASHSIZE_HIGH           ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t))
-
-/*
- * This defines the unit of allocation of dquots.
- * Currently, it is just one file system block, and a 4K blk contains 30
- * (136 * 30 = 4080) dquots. It's probably not worth trying to make
- * this more dynamic.
- * XXXsup However, if this number is changed, we have to make sure that we don't
- * implicitly assume that we do allocations in chunks of a single filesystem
- * block in the dquot/xqm code.
- */
-#define XFS_DQUOT_CLUSTER_SIZE_FSB     (xfs_filblks_t)1
-
-typedef xfs_dqhash_t   xfs_dqlist_t;
-
-/*
- * Quota Manager (global) structure. Lives only in core.
- */
-typedef struct xfs_qm {
-       xfs_dqlist_t    *qm_usr_dqhtable;/* udquot hash table */
-       xfs_dqlist_t    *qm_grp_dqhtable;/* gdquot hash table */
-       uint             qm_dqhashmask;  /* # buckets in dq hashtab - 1 */
-       struct list_head qm_dqfrlist;    /* freelist of dquots */
-       struct mutex     qm_dqfrlist_lock;
-       int              qm_dqfrlist_cnt;
-       atomic_t         qm_totaldquots; /* total incore dquots */
-       uint             qm_nrefs;       /* file systems with quota on */
-       int              qm_dqfree_ratio;/* ratio of free to inuse dquots */
-       kmem_zone_t     *qm_dqzone;      /* dquot mem-alloc zone */
-       kmem_zone_t     *qm_dqtrxzone;   /* t_dqinfo of transactions */
-} xfs_qm_t;
-
-/*
- * Various quota information for individual filesystems.
- * The mount structure keeps a pointer to this.
- */
-typedef struct xfs_quotainfo {
-       xfs_inode_t     *qi_uquotaip;    /* user quota inode */
-       xfs_inode_t     *qi_gquotaip;    /* group quota inode */
-       struct list_head qi_dqlist;      /* all dquots in filesys */
-       struct mutex     qi_dqlist_lock;
-       int              qi_dquots;
-       int              qi_dqreclaims;  /* a change here indicates
-                                           a removal in the dqlist */
-       time_t           qi_btimelimit;  /* limit for blks timer */
-       time_t           qi_itimelimit;  /* limit for inodes timer */
-       time_t           qi_rtbtimelimit;/* limit for rt blks timer */
-       xfs_qwarncnt_t   qi_bwarnlimit;  /* limit for blks warnings */
-       xfs_qwarncnt_t   qi_iwarnlimit;  /* limit for inodes warnings */
-       xfs_qwarncnt_t   qi_rtbwarnlimit;/* limit for rt blks warnings */
-       struct mutex     qi_quotaofflock;/* to serialize quotaoff */
-       xfs_filblks_t    qi_dqchunklen;  /* # BBs in a chunk of dqs */
-       uint             qi_dqperchunk;  /* # ondisk dqs in above chunk */
-       xfs_qcnt_t       qi_bhardlimit;  /* default data blk hard limit */
-       xfs_qcnt_t       qi_bsoftlimit;  /* default data blk soft limit */
-       xfs_qcnt_t       qi_ihardlimit;  /* default inode count hard limit */
-       xfs_qcnt_t       qi_isoftlimit;  /* default inode count soft limit */
-       xfs_qcnt_t       qi_rtbhardlimit;/* default realtime blk hard limit */
-       xfs_qcnt_t       qi_rtbsoftlimit;/* default realtime blk soft limit */
-} xfs_quotainfo_t;
-
-
-extern void    xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
-extern int     xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
-                       xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
-extern void    xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
-extern void    xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
-
-/*
- * We keep the usr and grp dquots separately so that locking will be easier
- * to do at commit time. All transactions that we know of at this point
- * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
- */
-#define XFS_QM_TRANS_MAXDQS            2
-typedef struct xfs_dquot_acct {
-       xfs_dqtrx_t     dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
-       xfs_dqtrx_t     dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
-} xfs_dquot_acct_t;
-
-/*
- * Users are allowed to have a usage exceeding their softlimit for
- * a period this long.
- */
-#define XFS_QM_BTIMELIMIT      (7 * 24*60*60)          /* 1 week */
-#define XFS_QM_RTBTIMELIMIT    (7 * 24*60*60)          /* 1 week */
-#define XFS_QM_ITIMELIMIT      (7 * 24*60*60)          /* 1 week */
-
-#define XFS_QM_BWARNLIMIT      5
-#define XFS_QM_IWARNLIMIT      5
-#define XFS_QM_RTBWARNLIMIT    5
-
-extern void            xfs_qm_destroy_quotainfo(xfs_mount_t *);
-extern int             xfs_qm_quotacheck(xfs_mount_t *);
-extern int             xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
-
-/* dquot stuff */
-extern boolean_t       xfs_qm_dqalloc_incore(xfs_dquot_t **);
-extern int             xfs_qm_dqpurge_all(xfs_mount_t *, uint);
-extern void            xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
-
-/* quota ops */
-extern int             xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
-extern int             xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
-                                       fs_disk_quota_t *);
-extern int             xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
-                                       fs_disk_quota_t *);
-extern int             xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
-extern int             xfs_qm_scall_quotaon(xfs_mount_t *, uint);
-extern int             xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
-
-#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
deleted file mode 100644 (file)
index a0a829a..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_qm.h"
-
-
-STATIC void
-xfs_fill_statvfs_from_dquot(
-       struct kstatfs          *statp,
-       xfs_disk_dquot_t        *dp)
-{
-       __uint64_t              limit;
-
-       limit = dp->d_blk_softlimit ?
-               be64_to_cpu(dp->d_blk_softlimit) :
-               be64_to_cpu(dp->d_blk_hardlimit);
-       if (limit && statp->f_blocks > limit) {
-               statp->f_blocks = limit;
-               statp->f_bfree = statp->f_bavail =
-                       (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
-                        (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
-       }
-
-       limit = dp->d_ino_softlimit ?
-               be64_to_cpu(dp->d_ino_softlimit) :
-               be64_to_cpu(dp->d_ino_hardlimit);
-       if (limit && statp->f_files > limit) {
-               statp->f_files = limit;
-               statp->f_ffree =
-                       (statp->f_files > be64_to_cpu(dp->d_icount)) ?
-                        (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0;
-       }
-}
-
-
-/*
- * Directory tree accounting is implemented using project quotas, where
- * the project identifier is inherited from parent directories.
- * A statvfs (df, etc.) of a directory that is using project quota should
- * return a statvfs of the project, not the entire filesystem.
- * This makes such trees appear as if they are filesystems in themselves.
- */
-void
-xfs_qm_statvfs(
-       xfs_inode_t             *ip,
-       struct kstatfs          *statp)
-{
-       xfs_mount_t             *mp = ip->i_mount;
-       xfs_dquot_t             *dqp;
-
-       if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
-               xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
-               xfs_qm_dqput(dqp);
-       }
-}
-
-int
-xfs_qm_newmount(
-       xfs_mount_t     *mp,
-       uint            *needquotamount,
-       uint            *quotaflags)
-{
-       uint            quotaondisk;
-       uint            uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
-
-       quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
-                               (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
-
-       if (quotaondisk) {
-               uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT;
-               pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT;
-               gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT;
-       }
-
-       /*
-        * If the device itself is read-only, we can't allow
-        * the user to change the state of quota on the mount -
-        * this would generate a transaction on the ro device,
-        * which would lead to an I/O error and shutdown
-        */
-
-       if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
-           (!uquotaondisk &&  XFS_IS_UQUOTA_ON(mp)) ||
-            (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
-           (!pquotaondisk &&  XFS_IS_PQUOTA_ON(mp)) ||
-            (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
-           (!gquotaondisk &&  XFS_IS_OQUOTA_ON(mp)))  &&
-           xfs_dev_is_read_only(mp, "changing quota state")) {
-               xfs_warn(mp, "please mount with%s%s%s%s.",
-                       (!quotaondisk ? "out quota" : ""),
-                       (uquotaondisk ? " usrquota" : ""),
-                       (pquotaondisk ? " prjquota" : ""),
-                       (gquotaondisk ? " grpquota" : ""));
-               return XFS_ERROR(EPERM);
-       }
-
-       if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
-               /*
-                * Call mount_quotas at this point only if we won't have to do
-                * a quotacheck.
-                */
-               if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
-                       /*
-                        * If an error occurred, qm_mount_quotas code
-                        * has already disabled quotas. So, just finish
-                        * mounting, and get on with the boring life
-                        * without disk quotas.
-                        */
-                       xfs_qm_mount_quotas(mp);
-               } else {
-                       /*
-                        * Clear the quota flags, but remember them. This
-                        * is so that the quota code doesn't get invoked
-                        * before we're ready. This can happen when an
-                        * inode goes inactive and wants to free blocks,
-                        * or via xfs_log_mount_finish.
-                        */
-                       *needquotamount = B_TRUE;
-                       *quotaflags = mp->m_qflags;
-                       mp->m_qflags = 0;
-               }
-       }
-
-       return 0;
-}
-
-void __init
-xfs_qm_init(void)
-{
-       printk(KERN_INFO "SGI XFS Quota Management subsystem\n");
-       mutex_init(&xfs_Gqm_lock);
-       xfs_qm_init_procfs();
-}
-
-void __exit
-xfs_qm_exit(void)
-{
-       xfs_qm_cleanup_procfs();
-       if (qm_dqzone)
-               kmem_zone_destroy(qm_dqzone);
-       if (qm_dqtrxzone)
-               kmem_zone_destroy(qm_dqtrxzone);
-}
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c
deleted file mode 100644 (file)
index 8671a0b..0000000
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_qm.h"
-
-struct xqmstats xqmstats;
-
-static int xqm_proc_show(struct seq_file *m, void *v)
-{
-       /* maximum; incore; ratio free to inuse; freelist */
-       seq_printf(m, "%d\t%d\t%d\t%u\n",
-                       ndquot,
-                       xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
-                       xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
-                       xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
-       return 0;
-}
-
-static int xqm_proc_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, xqm_proc_show, NULL);
-}
-
-static const struct file_operations xqm_proc_fops = {
-       .owner          = THIS_MODULE,
-       .open           = xqm_proc_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-static int xqmstat_proc_show(struct seq_file *m, void *v)
-{
-       /* quota performance statistics */
-       seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
-                       xqmstats.xs_qm_dqreclaims,
-                       xqmstats.xs_qm_dqreclaim_misses,
-                       xqmstats.xs_qm_dquot_dups,
-                       xqmstats.xs_qm_dqcachemisses,
-                       xqmstats.xs_qm_dqcachehits,
-                       xqmstats.xs_qm_dqwants,
-                       xqmstats.xs_qm_dqshake_reclaims,
-                       xqmstats.xs_qm_dqinact_reclaims);
-       return 0;
-}
-
-static int xqmstat_proc_open(struct inode *inode, struct file *file)
-{
-       return single_open(file, xqmstat_proc_show, NULL);
-}
-
-static const struct file_operations xqmstat_proc_fops = {
-       .owner          = THIS_MODULE,
-       .open           = xqmstat_proc_open,
-       .read           = seq_read,
-       .llseek         = seq_lseek,
-       .release        = single_release,
-};
-
-void
-xfs_qm_init_procfs(void)
-{
-       proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
-       proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
-}
-
-void
-xfs_qm_cleanup_procfs(void)
-{
-       remove_proc_entry("fs/xfs/xqm", NULL);
-       remove_proc_entry("fs/xfs/xqmstat", NULL);
-}
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
deleted file mode 100644 (file)
index 5b964fc..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2002 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QM_STATS_H__
-#define __XFS_QM_STATS_H__
-
-#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
-
-/*
- * XQM global statistics
- */
-struct xqmstats {
-       __uint32_t              xs_qm_dqreclaims;
-       __uint32_t              xs_qm_dqreclaim_misses;
-       __uint32_t              xs_qm_dquot_dups;
-       __uint32_t              xs_qm_dqcachemisses;
-       __uint32_t              xs_qm_dqcachehits;
-       __uint32_t              xs_qm_dqwants;
-       __uint32_t              xs_qm_dqshake_reclaims;
-       __uint32_t              xs_qm_dqinact_reclaims;
-};
-
-extern struct xqmstats xqmstats;
-
-# define XQM_STATS_INC(count)  ( (count)++ )
-
-extern void xfs_qm_init_procfs(void);
-extern void xfs_qm_cleanup_procfs(void);
-
-#else
-
-# define XQM_STATS_INC(count)  do { } while (0)
-
-static inline void xfs_qm_init_procfs(void) { };
-static inline void xfs_qm_cleanup_procfs(void) { };
-
-#endif
-
-#endif /* __XFS_QM_STATS_H__ */
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
deleted file mode 100644 (file)
index 609246f..0000000
+++ /dev/null
@@ -1,906 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-
-#include <linux/capability.h>
-
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_utils.h"
-#include "xfs_qm.h"
-#include "xfs_trace.h"
-
-STATIC int     xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
-STATIC int     xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
-                                       uint);
-STATIC uint    xfs_qm_export_flags(uint);
-STATIC uint    xfs_qm_export_qtype_flags(uint);
-STATIC void    xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
-                                       fs_disk_quota_t *);
-
-
-/*
- * Turn off quota accounting and/or enforcement for all udquots and/or
- * gdquots. Called only at unmount time.
- *
- * This assumes that there are no dquots of this file system cached
- * incore, and modifies the ondisk dquot directly. Therefore, for example,
- * it is an error to call this twice, without purging the cache.
- */
-int
-xfs_qm_scall_quotaoff(
-       xfs_mount_t             *mp,
-       uint                    flags)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       uint                    dqtype;
-       int                     error;
-       uint                    inactivate_flags;
-       xfs_qoff_logitem_t      *qoffstart;
-       int                     nculprits;
-
-       /*
-        * No file system can have quotas enabled on disk but not in core.
-        * Note that quota utilities (like quotaoff) _expect_
-        * errno == EEXIST here.
-        */
-       if ((mp->m_qflags & flags) == 0)
-               return XFS_ERROR(EEXIST);
-       error = 0;
-
-       flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
-
-       /*
-        * We don't want to deal with two quotaoffs messing up each other,
-        * so we're going to serialize it. quotaoff isn't exactly a performance
-        * critical thing.
-        * If quotaoff, then we must be dealing with the root filesystem.
-        */
-       ASSERT(q);
-       mutex_lock(&q->qi_quotaofflock);
-
-       /*
-        * If we're just turning off quota enforcement, change mp and go.
-        */
-       if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
-               mp->m_qflags &= ~(flags);
-
-               spin_lock(&mp->m_sb_lock);
-               mp->m_sb.sb_qflags = mp->m_qflags;
-               spin_unlock(&mp->m_sb_lock);
-               mutex_unlock(&q->qi_quotaofflock);
-
-               /* XXX what to do if error ? Revert back to old vals incore ? */
-               error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
-               return (error);
-       }
-
-       dqtype = 0;
-       inactivate_flags = 0;
-       /*
-        * If accounting is off, we must turn enforcement off, clear the
-        * quota 'CHKD' certificate to make it known that we have to
-        * do a quotacheck the next time this quota is turned on.
-        */
-       if (flags & XFS_UQUOTA_ACCT) {
-               dqtype |= XFS_QMOPT_UQUOTA;
-               flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
-               inactivate_flags |= XFS_UQUOTA_ACTIVE;
-       }
-       if (flags & XFS_GQUOTA_ACCT) {
-               dqtype |= XFS_QMOPT_GQUOTA;
-               flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
-               inactivate_flags |= XFS_GQUOTA_ACTIVE;
-       } else if (flags & XFS_PQUOTA_ACCT) {
-               dqtype |= XFS_QMOPT_PQUOTA;
-               flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
-               inactivate_flags |= XFS_PQUOTA_ACTIVE;
-       }
-
-       /*
-        * Nothing to do?  Don't complain. This happens when we're just
-        * turning off quota enforcement.
-        */
-       if ((mp->m_qflags & flags) == 0)
-               goto out_unlock;
-
-       /*
-        * Write the LI_QUOTAOFF log record, and do SB changes atomically,
-        * and synchronously. If we fail to write, we should abort the
-        * operation as it cannot be recovered safely if we crash.
-        */
-       error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
-       if (error)
-               goto out_unlock;
-
-       /*
-        * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
-        * to take care of the race between dqget and quotaoff. We don't take
-        * any special locks to reset these bits. All processes need to check
-        * these bits *after* taking inode lock(s) to see if the particular
-        * quota type is in the process of being turned off. If *ACTIVE, it is
-        * guaranteed that all dquot structures and all quotainode ptrs will all
-        * stay valid as long as that inode is kept locked.
-        *
-        * There is no turning back after this.
-        */
-       mp->m_qflags &= ~inactivate_flags;
-
-       /*
-        * Give back all the dquot reference(s) held by inodes.
-        * Here we go thru every single incore inode in this file system, and
-        * do a dqrele on the i_udquot/i_gdquot that it may have.
-        * Essentially, as long as somebody has an inode locked, this guarantees
-        * that quotas will not be turned off. This is handy because in a
-        * transaction once we lock the inode(s) and check for quotaon, we can
-        * depend on the quota inodes (and other things) being valid as long as
-        * we keep the lock(s).
-        */
-       xfs_qm_dqrele_all_inodes(mp, flags);
-
-       /*
-        * Next we make the changes in the quota flag in the mount struct.
-        * This isn't protected by a particular lock directly, because we
-        * don't want to take a mrlock every time we depend on quotas being on.
-        */
-       mp->m_qflags &= ~(flags);
-
-       /*
-        * Go through all the dquots of this file system and purge them,
-        * according to what was turned off. We may not be able to get rid
-        * of all dquots, because dquots can have temporary references that
-        * are not attached to inodes. eg. xfs_setattr, xfs_create.
-        * So, if we couldn't purge all the dquots from the filesystem,
-        * we can't get rid of the incore data structures.
-        */
-       while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
-               delay(10 * nculprits);
-
-       /*
-        * Transactions that had started before ACTIVE state bit was cleared
-        * could have logged many dquots, so they'd have higher LSNs than
-        * the first QUOTAOFF log record does. If we happen to crash when
-        * the tail of the log has gone past the QUOTAOFF record, but
-        * before the last dquot modification, those dquots __will__
-        * recover, and that's not good.
-        *
-        * So, we have QUOTAOFF start and end logitems; the start
-        * logitem won't get overwritten until the end logitem appears...
-        */
-       error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
-       if (error) {
-               /* We're screwed now. Shutdown is the only option. */
-               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
-               goto out_unlock;
-       }
-
-       /*
-        * If quotas is completely disabled, close shop.
-        */
-       if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
-           ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
-               mutex_unlock(&q->qi_quotaofflock);
-               xfs_qm_destroy_quotainfo(mp);
-               return (0);
-       }
-
-       /*
-        * Release our quotainode references if we don't need them anymore.
-        */
-       if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
-               IRELE(q->qi_uquotaip);
-               q->qi_uquotaip = NULL;
-       }
-       if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
-               IRELE(q->qi_gquotaip);
-               q->qi_gquotaip = NULL;
-       }
-
-out_unlock:
-       mutex_unlock(&q->qi_quotaofflock);
-       return error;
-}
-
-STATIC int
-xfs_qm_scall_trunc_qfile(
-       struct xfs_mount        *mp,
-       xfs_ino_t               ino)
-{
-       struct xfs_inode        *ip;
-       struct xfs_trans        *tp;
-       int                     error;
-
-       if (ino == NULLFSINO)
-               return 0;
-
-       error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
-       if (error)
-               return error;
-
-       xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
-       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
-                                 XFS_TRANS_PERM_LOG_RES,
-                                 XFS_ITRUNCATE_LOG_COUNT);
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-               goto out_put;
-       }
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_trans_ijoin(tp, ip);
-
-       error = xfs_itruncate_data(&tp, ip, 0);
-       if (error) {
-               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
-                                    XFS_TRANS_ABORT);
-               goto out_unlock;
-       }
-
-       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
-
-out_unlock:
-       xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-out_put:
-       IRELE(ip);
-       return error;
-}
-
-int
-xfs_qm_scall_trunc_qfiles(
-       xfs_mount_t     *mp,
-       uint            flags)
-{
-       int             error = 0, error2 = 0;
-
-       if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
-               xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
-                       __func__, flags, mp->m_qflags);
-               return XFS_ERROR(EINVAL);
-       }
-
-       if (flags & XFS_DQ_USER)
-               error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
-       if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
-               error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
-
-       return error ? error : error2;
-}
-
-/*
- * Switch on (a given) quota enforcement for a filesystem.  This takes
- * effect immediately.
- * (Switching on quota accounting must be done at mount time.)
- */
-int
-xfs_qm_scall_quotaon(
-       xfs_mount_t     *mp,
-       uint            flags)
-{
-       int             error;
-       uint            qf;
-       __int64_t       sbflags;
-
-       flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
-       /*
-        * Switching on quota accounting must be done at mount time.
-        */
-       flags &= ~(XFS_ALL_QUOTA_ACCT);
-
-       sbflags = 0;
-
-       if (flags == 0) {
-               xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
-                       __func__, mp->m_qflags);
-               return XFS_ERROR(EINVAL);
-       }
-
-       /* No fs can turn on quotas with a delayed effect */
-       ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
-
-       /*
-        * Can't enforce without accounting. We check the superblock
-        * qflags here instead of m_qflags because rootfs can have
-        * quota acct on ondisk without m_qflags' knowing.
-        */
-       if (((flags & XFS_UQUOTA_ACCT) == 0 &&
-           (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
-           (flags & XFS_UQUOTA_ENFD))
-           ||
-           ((flags & XFS_PQUOTA_ACCT) == 0 &&
-           (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
-           (flags & XFS_GQUOTA_ACCT) == 0 &&
-           (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
-           (flags & XFS_OQUOTA_ENFD))) {
-               xfs_debug(mp,
-                       "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
-                       __func__, flags, mp->m_sb.sb_qflags);
-               return XFS_ERROR(EINVAL);
-       }
-       /*
-        * If everything's up to-date incore, then don't waste time.
-        */
-       if ((mp->m_qflags & flags) == flags)
-               return XFS_ERROR(EEXIST);
-
-       /*
-        * Change sb_qflags on disk but not incore mp->qflags
-        * if this is the root filesystem.
-        */
-       spin_lock(&mp->m_sb_lock);
-       qf = mp->m_sb.sb_qflags;
-       mp->m_sb.sb_qflags = qf | flags;
-       spin_unlock(&mp->m_sb_lock);
-
-       /*
-        * There's nothing to change if it's the same.
-        */
-       if ((qf & flags) == flags && sbflags == 0)
-               return XFS_ERROR(EEXIST);
-       sbflags |= XFS_SB_QFLAGS;
-
-       if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
-               return (error);
-       /*
-        * If we aren't trying to switch on quota enforcement, we are done.
-        */
-       if  (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
-            (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
-            ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
-            (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
-            ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
-            (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
-           (flags & XFS_ALL_QUOTA_ENFD) == 0)
-               return (0);
-
-       if (! XFS_IS_QUOTA_RUNNING(mp))
-               return XFS_ERROR(ESRCH);
-
-       /*
-        * Switch on quota enforcement in core.
-        */
-       mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
-       mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
-       mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
-
-       return (0);
-}
-
-
-/*
- * Return quota status information, such as uquota-off, enforcements, etc.
- */
-int
-xfs_qm_scall_getqstat(
-       struct xfs_mount        *mp,
-       struct fs_quota_stat    *out)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       struct xfs_inode        *uip, *gip;
-       boolean_t               tempuqip, tempgqip;
-
-       uip = gip = NULL;
-       tempuqip = tempgqip = B_FALSE;
-       memset(out, 0, sizeof(fs_quota_stat_t));
-
-       out->qs_version = FS_QSTAT_VERSION;
-       if (!xfs_sb_version_hasquota(&mp->m_sb)) {
-               out->qs_uquota.qfs_ino = NULLFSINO;
-               out->qs_gquota.qfs_ino = NULLFSINO;
-               return (0);
-       }
-       out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
-                                                       (XFS_ALL_QUOTA_ACCT|
-                                                        XFS_ALL_QUOTA_ENFD));
-       out->qs_pad = 0;
-       out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
-       out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
-
-       if (q) {
-               uip = q->qi_uquotaip;
-               gip = q->qi_gquotaip;
-       }
-       if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
-               if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
-                                       0, 0, &uip) == 0)
-                       tempuqip = B_TRUE;
-       }
-       if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
-               if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
-                                       0, 0, &gip) == 0)
-                       tempgqip = B_TRUE;
-       }
-       if (uip) {
-               out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
-               out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
-               if (tempuqip)
-                       IRELE(uip);
-       }
-       if (gip) {
-               out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
-               out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
-               if (tempgqip)
-                       IRELE(gip);
-       }
-       if (q) {
-               out->qs_incoredqs = q->qi_dquots;
-               out->qs_btimelimit = q->qi_btimelimit;
-               out->qs_itimelimit = q->qi_itimelimit;
-               out->qs_rtbtimelimit = q->qi_rtbtimelimit;
-               out->qs_bwarnlimit = q->qi_bwarnlimit;
-               out->qs_iwarnlimit = q->qi_iwarnlimit;
-       }
-       return 0;
-}
-
-#define XFS_DQ_MASK \
-       (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
-
-/*
- * Adjust quota limits, and start/stop timers accordingly.
- */
-int
-xfs_qm_scall_setqlim(
-       xfs_mount_t             *mp,
-       xfs_dqid_t              id,
-       uint                    type,
-       fs_disk_quota_t         *newlim)
-{
-       struct xfs_quotainfo    *q = mp->m_quotainfo;
-       xfs_disk_dquot_t        *ddq;
-       xfs_dquot_t             *dqp;
-       xfs_trans_t             *tp;
-       int                     error;
-       xfs_qcnt_t              hard, soft;
-
-       if (newlim->d_fieldmask & ~XFS_DQ_MASK)
-               return EINVAL;
-       if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
-               return 0;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
-       if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
-                                     0, 0, XFS_DEFAULT_LOG_COUNT))) {
-               xfs_trans_cancel(tp, 0);
-               return (error);
-       }
-
-       /*
-        * We don't want to race with a quotaoff so take the quotaoff lock.
-        * (We don't hold an inode lock, so there's nothing else to stop
-        * a quotaoff from happening). (XXXThis doesn't currently happen
-        * because we take the vfslock before calling xfs_qm_sysent).
-        */
-       mutex_lock(&q->qi_quotaofflock);
-
-       /*
-        * Get the dquot (locked), and join it to the transaction.
-        * Allocate the dquot if this doesn't exist.
-        */
-       if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
-               xfs_trans_cancel(tp, XFS_TRANS_ABORT);
-               ASSERT(error != ENOENT);
-               goto out_unlock;
-       }
-       xfs_trans_dqjoin(tp, dqp);
-       ddq = &dqp->q_core;
-
-       /*
-        * Make sure that hardlimits are >= soft limits before changing.
-        */
-       hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
-               (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
-                       be64_to_cpu(ddq->d_blk_hardlimit);
-       soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
-               (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
-                       be64_to_cpu(ddq->d_blk_softlimit);
-       if (hard == 0 || hard >= soft) {
-               ddq->d_blk_hardlimit = cpu_to_be64(hard);
-               ddq->d_blk_softlimit = cpu_to_be64(soft);
-               if (id == 0) {
-                       q->qi_bhardlimit = hard;
-                       q->qi_bsoftlimit = soft;
-               }
-       } else {
-               xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
-       }
-       hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
-               (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
-                       be64_to_cpu(ddq->d_rtb_hardlimit);
-       soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
-               (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
-                       be64_to_cpu(ddq->d_rtb_softlimit);
-       if (hard == 0 || hard >= soft) {
-               ddq->d_rtb_hardlimit = cpu_to_be64(hard);
-               ddq->d_rtb_softlimit = cpu_to_be64(soft);
-               if (id == 0) {
-                       q->qi_rtbhardlimit = hard;
-                       q->qi_rtbsoftlimit = soft;
-               }
-       } else {
-               xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
-       }
-
-       hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
-               (xfs_qcnt_t) newlim->d_ino_hardlimit :
-                       be64_to_cpu(ddq->d_ino_hardlimit);
-       soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
-               (xfs_qcnt_t) newlim->d_ino_softlimit :
-                       be64_to_cpu(ddq->d_ino_softlimit);
-       if (hard == 0 || hard >= soft) {
-               ddq->d_ino_hardlimit = cpu_to_be64(hard);
-               ddq->d_ino_softlimit = cpu_to_be64(soft);
-               if (id == 0) {
-                       q->qi_ihardlimit = hard;
-                       q->qi_isoftlimit = soft;
-               }
-       } else {
-               xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
-       }
-
-       /*
-        * Update warnings counter(s) if requested
-        */
-       if (newlim->d_fieldmask & FS_DQ_BWARNS)
-               ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns);
-       if (newlim->d_fieldmask & FS_DQ_IWARNS)
-               ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns);
-       if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
-               ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns);
-
-       if (id == 0) {
-               /*
-                * Timelimits for the super user set the relative time
-                * the other users can be over quota for this file system.
-                * If it is zero a default is used.  Ditto for the default
-                * soft and hard limit values (already done, above), and
-                * for warnings.
-                */
-               if (newlim->d_fieldmask & FS_DQ_BTIMER) {
-                       q->qi_btimelimit = newlim->d_btimer;
-                       ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
-               }
-               if (newlim->d_fieldmask & FS_DQ_ITIMER) {
-                       q->qi_itimelimit = newlim->d_itimer;
-                       ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
-               }
-               if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
-                       q->qi_rtbtimelimit = newlim->d_rtbtimer;
-                       ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
-               }
-               if (newlim->d_fieldmask & FS_DQ_BWARNS)
-                       q->qi_bwarnlimit = newlim->d_bwarns;
-               if (newlim->d_fieldmask & FS_DQ_IWARNS)
-                       q->qi_iwarnlimit = newlim->d_iwarns;
-               if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
-                       q->qi_rtbwarnlimit = newlim->d_rtbwarns;
-       } else {
-               /*
-                * If the user is now over quota, start the timelimit.
-                * The user will not be 'warned'.
-                * Note that we keep the timers ticking, whether enforcement
-                * is on or off. We don't really want to bother with iterating
-                * over all ondisk dquots and turning the timers on/off.
-                */
-               xfs_qm_adjust_dqtimers(mp, ddq);
-       }
-       dqp->dq_flags |= XFS_DQ_DIRTY;
-       xfs_trans_log_dquot(tp, dqp);
-
-       error = xfs_trans_commit(tp, 0);
-       xfs_qm_dqrele(dqp);
-
- out_unlock:
-       mutex_unlock(&q->qi_quotaofflock);
-       return error;
-}
-
-int
-xfs_qm_scall_getquota(
-       xfs_mount_t     *mp,
-       xfs_dqid_t      id,
-       uint            type,
-       fs_disk_quota_t *out)
-{
-       xfs_dquot_t     *dqp;
-       int             error;
-
-       /*
-        * Try to get the dquot. We don't want it allocated on disk, so
-        * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
-        * exist, we'll get ENOENT back.
-        */
-       if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
-               return (error);
-       }
-
-       /*
-        * If everything's NULL, this dquot doesn't quite exist as far as
-        * our utility programs are concerned.
-        */
-       if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
-               xfs_qm_dqput(dqp);
-               return XFS_ERROR(ENOENT);
-       }
-       /*
-        * Convert the disk dquot to the exportable format
-        */
-       xfs_qm_export_dquot(mp, &dqp->q_core, out);
-       xfs_qm_dqput(dqp);
-       return (error ? XFS_ERROR(EFAULT) : 0);
-}
-
-
-STATIC int
-xfs_qm_log_quotaoff_end(
-       xfs_mount_t             *mp,
-       xfs_qoff_logitem_t      *startqoff,
-       uint                    flags)
-{
-       xfs_trans_t             *tp;
-       int                     error;
-       xfs_qoff_logitem_t      *qoffi;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
-
-       if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
-                                     0, 0, XFS_DEFAULT_LOG_COUNT))) {
-               xfs_trans_cancel(tp, 0);
-               return (error);
-       }
-
-       qoffi = xfs_trans_get_qoff_item(tp, startqoff,
-                                       flags & XFS_ALL_QUOTA_ACCT);
-       xfs_trans_log_quotaoff_item(tp, qoffi);
-
-       /*
-        * We have to make sure that the transaction is secure on disk before we
-        * return and actually stop quota accounting. So, make it synchronous.
-        * We don't care about quotoff's performance.
-        */
-       xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
-       return (error);
-}
-
-
-STATIC int
-xfs_qm_log_quotaoff(
-       xfs_mount_t            *mp,
-       xfs_qoff_logitem_t     **qoffstartp,
-       uint                   flags)
-{
-       xfs_trans_t            *tp;
-       int                     error;
-       xfs_qoff_logitem_t     *qoffi=NULL;
-       uint                    oldsbqflag=0;
-
-       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
-       if ((error = xfs_trans_reserve(tp, 0,
-                                     sizeof(xfs_qoff_logitem_t) * 2 +
-                                     mp->m_sb.sb_sectsize + 128,
-                                     0,
-                                     0,
-                                     XFS_DEFAULT_LOG_COUNT))) {
-               goto error0;
-       }
-
-       qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
-       xfs_trans_log_quotaoff_item(tp, qoffi);
-
-       spin_lock(&mp->m_sb_lock);
-       oldsbqflag = mp->m_sb.sb_qflags;
-       mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
-       spin_unlock(&mp->m_sb_lock);
-
-       xfs_mod_sb(tp, XFS_SB_QFLAGS);
-
-       /*
-        * We have to make sure that the transaction is secure on disk before we
-        * return and actually stop quota accounting. So, make it synchronous.
-        * We don't care about quotoff's performance.
-        */
-       xfs_trans_set_sync(tp);
-       error = xfs_trans_commit(tp, 0);
-
-error0:
-       if (error) {
-               xfs_trans_cancel(tp, 0);
-               /*
-                * No one else is modifying sb_qflags, so this is OK.
-                * We still hold the quotaofflock.
-                */
-               spin_lock(&mp->m_sb_lock);
-               mp->m_sb.sb_qflags = oldsbqflag;
-               spin_unlock(&mp->m_sb_lock);
-       }
-       *qoffstartp = qoffi;
-       return (error);
-}
-
-
-/*
- * Translate an internal style on-disk-dquot to the exportable format.
- * The main differences are that the counters/limits are all in Basic
- * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
- * to be converted to the native endianness.
- */
-STATIC void
-xfs_qm_export_dquot(
-       xfs_mount_t             *mp,
-       xfs_disk_dquot_t        *src,
-       struct fs_disk_quota    *dst)
-{
-       memset(dst, 0, sizeof(*dst));
-       dst->d_version = FS_DQUOT_VERSION;  /* different from src->d_version */
-       dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags);
-       dst->d_id = be32_to_cpu(src->d_id);
-       dst->d_blk_hardlimit =
-               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit));
-       dst->d_blk_softlimit =
-               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit));
-       dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit);
-       dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit);
-       dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount));
-       dst->d_icount = be64_to_cpu(src->d_icount);
-       dst->d_btimer = be32_to_cpu(src->d_btimer);
-       dst->d_itimer = be32_to_cpu(src->d_itimer);
-       dst->d_iwarns = be16_to_cpu(src->d_iwarns);
-       dst->d_bwarns = be16_to_cpu(src->d_bwarns);
-       dst->d_rtb_hardlimit =
-               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit));
-       dst->d_rtb_softlimit =
-               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit));
-       dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount));
-       dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer);
-       dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns);
-
-       /*
-        * Internally, we don't reset all the timers when quota enforcement
-        * gets turned off. No need to confuse the user level code,
-        * so return zeroes in that case.
-        */
-       if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) ||
-           (!XFS_IS_OQUOTA_ENFORCED(mp) &&
-                       (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
-               dst->d_btimer = 0;
-               dst->d_itimer = 0;
-               dst->d_rtbtimer = 0;
-       }
-
-#ifdef DEBUG
-       if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
-            (XFS_IS_OQUOTA_ENFORCED(mp) &&
-                       (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
-           dst->d_id != 0) {
-               if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
-                   (dst->d_blk_softlimit > 0)) {
-                       ASSERT(dst->d_btimer != 0);
-               }
-               if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
-                   (dst->d_ino_softlimit > 0)) {
-                       ASSERT(dst->d_itimer != 0);
-               }
-       }
-#endif
-}
-
-STATIC uint
-xfs_qm_export_qtype_flags(
-       uint flags)
-{
-       /*
-        * Can't be more than one, or none.
-        */
-       ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
-               (FS_PROJ_QUOTA | FS_USER_QUOTA));
-       ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
-               (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
-       ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
-               (FS_USER_QUOTA | FS_GROUP_QUOTA));
-       ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
-
-       return (flags & XFS_DQ_USER) ?
-               FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
-                       FS_PROJ_QUOTA : FS_GROUP_QUOTA;
-}
-
-STATIC uint
-xfs_qm_export_flags(
-       uint flags)
-{
-       uint uflags;
-
-       uflags = 0;
-       if (flags & XFS_UQUOTA_ACCT)
-               uflags |= FS_QUOTA_UDQ_ACCT;
-       if (flags & XFS_PQUOTA_ACCT)
-               uflags |= FS_QUOTA_PDQ_ACCT;
-       if (flags & XFS_GQUOTA_ACCT)
-               uflags |= FS_QUOTA_GDQ_ACCT;
-       if (flags & XFS_UQUOTA_ENFD)
-               uflags |= FS_QUOTA_UDQ_ENFD;
-       if (flags & (XFS_OQUOTA_ENFD)) {
-               uflags |= (flags & XFS_GQUOTA_ACCT) ?
-                       FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
-       }
-       return (uflags);
-}
-
-
-STATIC int
-xfs_dqrele_inode(
-       struct xfs_inode        *ip,
-       struct xfs_perag        *pag,
-       int                     flags)
-{
-       /* skip quota inodes */
-       if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
-           ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
-               ASSERT(ip->i_udquot == NULL);
-               ASSERT(ip->i_gdquot == NULL);
-               return 0;
-       }
-
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
-               xfs_qm_dqrele(ip->i_udquot);
-               ip->i_udquot = NULL;
-       }
-       if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
-               xfs_qm_dqrele(ip->i_gdquot);
-               ip->i_gdquot = NULL;
-       }
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       return 0;
-}
-
-
-/*
- * Go thru all the inodes in the file system, releasing their dquots.
- *
- * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff.
- */
-void
-xfs_qm_dqrele_all_inodes(
-       struct xfs_mount *mp,
-       uint             flags)
-{
-       ASSERT(mp->m_quotainfo);
-       xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
-}
diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h
deleted file mode 100644 (file)
index 94a3d92..0000000
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2000-2003 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_QUOTA_PRIV_H__
-#define __XFS_QUOTA_PRIV_H__
-
-/*
- * Number of bmaps that we ask from bmapi when doing a quotacheck.
- * We make this restriction to keep the memory usage to a minimum.
- */
-#define XFS_DQITER_MAP_SIZE    10
-
-/*
- * Hash into a bucket in the dquot hash table, based on <mp, id>.
- */
-#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
-                                (__psunsigned_t)(id)) & \
-                               (xfs_Gqm->qm_dqhashmask - 1))
-#define XFS_DQ_HASH(mp, id, type)   (type == XFS_DQ_USER ? \
-                                    (xfs_Gqm->qm_usr_dqhtable + \
-                                     XFS_DQ_HASHVAL(mp, id)) : \
-                                    (xfs_Gqm->qm_grp_dqhtable + \
-                                     XFS_DQ_HASHVAL(mp, id)))
-#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
-       !dqp->q_core.d_blk_hardlimit && \
-       !dqp->q_core.d_blk_softlimit && \
-       !dqp->q_core.d_rtb_hardlimit && \
-       !dqp->q_core.d_rtb_softlimit && \
-       !dqp->q_core.d_ino_hardlimit && \
-       !dqp->q_core.d_ino_softlimit && \
-       !dqp->q_core.d_bcount && \
-       !dqp->q_core.d_rtbcount && \
-       !dqp->q_core.d_icount)
-
-#define DQFLAGTO_TYPESTR(d)    (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
-                                (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
-                                (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
-
-#endif /* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c
deleted file mode 100644 (file)
index 4d00ee6..0000000
+++ /dev/null
@@ -1,890 +0,0 @@
-/*
- * Copyright (c) 2000-2002 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_alloc.h"
-#include "xfs_quota.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_itable.h"
-#include "xfs_bmap.h"
-#include "xfs_rtalloc.h"
-#include "xfs_error.h"
-#include "xfs_attr.h"
-#include "xfs_buf_item.h"
-#include "xfs_trans_priv.h"
-#include "xfs_qm.h"
-
-STATIC void    xfs_trans_alloc_dqinfo(xfs_trans_t *);
-
-/*
- * Add the locked dquot to the transaction.
- * The dquot must be locked, and it cannot be associated with any
- * transaction.
- */
-void
-xfs_trans_dqjoin(
-       xfs_trans_t     *tp,
-       xfs_dquot_t     *dqp)
-{
-       ASSERT(dqp->q_transp != tp);
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-       ASSERT(dqp->q_logitem.qli_dquot == dqp);
-
-       /*
-        * Get a log_item_desc to point at the new item.
-        */
-       xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
-
-       /*
-        * Initialize d_transp so we can later determine if this dquot is
-        * associated with this transaction.
-        */
-       dqp->q_transp = tp;
-}
-
-
-/*
- * This is called to mark the dquot as needing
- * to be logged when the transaction is committed.  The dquot must
- * already be associated with the given transaction.
- * Note that it marks the entire transaction as dirty. In the ordinary
- * case, this gets called via xfs_trans_commit, after the transaction
- * is already dirty. However, there's nothing stop this from getting
- * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY
- * flag.
- */
-void
-xfs_trans_log_dquot(
-       xfs_trans_t     *tp,
-       xfs_dquot_t     *dqp)
-{
-       ASSERT(dqp->q_transp == tp);
-       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-       tp->t_flags |= XFS_TRANS_DIRTY;
-       dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-}
-
-/*
- * Carry forward whatever is left of the quota blk reservation to
- * the spanky new transaction
- */
-void
-xfs_trans_dup_dqinfo(
-       xfs_trans_t     *otp,
-       xfs_trans_t     *ntp)
-{
-       xfs_dqtrx_t     *oq, *nq;
-       int             i,j;
-       xfs_dqtrx_t     *oqa, *nqa;
-
-       if (!otp->t_dqinfo)
-               return;
-
-       xfs_trans_alloc_dqinfo(ntp);
-       oqa = otp->t_dqinfo->dqa_usrdquots;
-       nqa = ntp->t_dqinfo->dqa_usrdquots;
-
-       /*
-        * Because the quota blk reservation is carried forward,
-        * it is also necessary to carry forward the DQ_DIRTY flag.
-        */
-       if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
-               ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
-
-       for (j = 0; j < 2; j++) {
-               for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
-                       if (oqa[i].qt_dquot == NULL)
-                               break;
-                       oq = &oqa[i];
-                       nq = &nqa[i];
-
-                       nq->qt_dquot = oq->qt_dquot;
-                       nq->qt_bcount_delta = nq->qt_icount_delta = 0;
-                       nq->qt_rtbcount_delta = 0;
-
-                       /*
-                        * Transfer whatever is left of the reservations.
-                        */
-                       nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
-                       oq->qt_blk_res = oq->qt_blk_res_used;
-
-                       nq->qt_rtblk_res = oq->qt_rtblk_res -
-                               oq->qt_rtblk_res_used;
-                       oq->qt_rtblk_res = oq->qt_rtblk_res_used;
-
-                       nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
-                       oq->qt_ino_res = oq->qt_ino_res_used;
-
-               }
-               oqa = otp->t_dqinfo->dqa_grpdquots;
-               nqa = ntp->t_dqinfo->dqa_grpdquots;
-       }
-}
-
-/*
- * Wrap around mod_dquot to account for both user and group quotas.
- */
-void
-xfs_trans_mod_dquot_byino(
-       xfs_trans_t     *tp,
-       xfs_inode_t     *ip,
-       uint            field,
-       long            delta)
-{
-       xfs_mount_t     *mp = tp->t_mountp;
-
-       if (!XFS_IS_QUOTA_RUNNING(mp) ||
-           !XFS_IS_QUOTA_ON(mp) ||
-           ip->i_ino == mp->m_sb.sb_uquotino ||
-           ip->i_ino == mp->m_sb.sb_gquotino)
-               return;
-
-       if (tp->t_dqinfo == NULL)
-               xfs_trans_alloc_dqinfo(tp);
-
-       if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
-               (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
-       if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot)
-               (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
-}
-
-STATIC xfs_dqtrx_t *
-xfs_trans_get_dqtrx(
-       xfs_trans_t     *tp,
-       xfs_dquot_t     *dqp)
-{
-       int             i;
-       xfs_dqtrx_t     *qa;
-
-       qa = XFS_QM_ISUDQ(dqp) ?
-               tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
-
-       for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
-               if (qa[i].qt_dquot == NULL ||
-                   qa[i].qt_dquot == dqp)
-                       return &qa[i];
-       }
-
-       return NULL;
-}
-
-/*
- * Make the changes in the transaction structure.
- * The moral equivalent to xfs_trans_mod_sb().
- * We don't touch any fields in the dquot, so we don't care
- * if it's locked or not (most of the time it won't be).
- */
-void
-xfs_trans_mod_dquot(
-       xfs_trans_t     *tp,
-       xfs_dquot_t     *dqp,
-       uint            field,
-       long            delta)
-{
-       xfs_dqtrx_t     *qtrx;
-
-       ASSERT(tp);
-       ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
-       qtrx = NULL;
-
-       if (tp->t_dqinfo == NULL)
-               xfs_trans_alloc_dqinfo(tp);
-       /*
-        * Find either the first free slot or the slot that belongs
-        * to this dquot.
-        */
-       qtrx = xfs_trans_get_dqtrx(tp, dqp);
-       ASSERT(qtrx);
-       if (qtrx->qt_dquot == NULL)
-               qtrx->qt_dquot = dqp;
-
-       switch (field) {
-
-               /*
-                * regular disk blk reservation
-                */
-             case XFS_TRANS_DQ_RES_BLKS:
-               qtrx->qt_blk_res += (ulong)delta;
-               break;
-
-               /*
-                * inode reservation
-                */
-             case XFS_TRANS_DQ_RES_INOS:
-               qtrx->qt_ino_res += (ulong)delta;
-               break;
-
-               /*
-                * disk blocks used.
-                */
-             case XFS_TRANS_DQ_BCOUNT:
-               if (qtrx->qt_blk_res && delta > 0) {
-                       qtrx->qt_blk_res_used += (ulong)delta;
-                       ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
-               }
-               qtrx->qt_bcount_delta += delta;
-               break;
-
-             case XFS_TRANS_DQ_DELBCOUNT:
-               qtrx->qt_delbcnt_delta += delta;
-               break;
-
-               /*
-                * Inode Count
-                */
-             case XFS_TRANS_DQ_ICOUNT:
-               if (qtrx->qt_ino_res && delta > 0) {
-                       qtrx->qt_ino_res_used += (ulong)delta;
-                       ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
-               }
-               qtrx->qt_icount_delta += delta;
-               break;
-
-               /*
-                * rtblk reservation
-                */
-             case XFS_TRANS_DQ_RES_RTBLKS:
-               qtrx->qt_rtblk_res += (ulong)delta;
-               break;
-
-               /*
-                * rtblk count
-                */
-             case XFS_TRANS_DQ_RTBCOUNT:
-               if (qtrx->qt_rtblk_res && delta > 0) {
-                       qtrx->qt_rtblk_res_used += (ulong)delta;
-                       ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
-               }
-               qtrx->qt_rtbcount_delta += delta;
-               break;
-
-             case XFS_TRANS_DQ_DELRTBCOUNT:
-               qtrx->qt_delrtb_delta += delta;
-               break;
-
-             default:
-               ASSERT(0);
-       }
-       tp->t_flags |= XFS_TRANS_DQ_DIRTY;
-}
-
-
-/*
- * Given an array of dqtrx structures, lock all the dquots associated
- * and join them to the transaction, provided they have been modified.
- * We know that the highest number of dquots (of one type - usr OR grp),
- * involved in a transaction is 2 and that both usr and grp combined - 3.
- * So, we don't attempt to make this very generic.
- */
-STATIC void
-xfs_trans_dqlockedjoin(
-       xfs_trans_t     *tp,
-       xfs_dqtrx_t     *q)
-{
-       ASSERT(q[0].qt_dquot != NULL);
-       if (q[1].qt_dquot == NULL) {
-               xfs_dqlock(q[0].qt_dquot);
-               xfs_trans_dqjoin(tp, q[0].qt_dquot);
-       } else {
-               ASSERT(XFS_QM_TRANS_MAXDQS == 2);
-               xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
-               xfs_trans_dqjoin(tp, q[0].qt_dquot);
-               xfs_trans_dqjoin(tp, q[1].qt_dquot);
-       }
-}
-
-
-/*
- * Called by xfs_trans_commit() and similar in spirit to
- * xfs_trans_apply_sb_deltas().
- * Go thru all the dquots belonging to this transaction and modify the
- * INCORE dquot to reflect the actual usages.
- * Unreserve just the reservations done by this transaction.
- * dquot is still left locked at exit.
- */
-void
-xfs_trans_apply_dquot_deltas(
-       xfs_trans_t             *tp)
-{
-       int                     i, j;
-       xfs_dquot_t             *dqp;
-       xfs_dqtrx_t             *qtrx, *qa;
-       xfs_disk_dquot_t        *d;
-       long                    totalbdelta;
-       long                    totalrtbdelta;
-
-       if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
-               return;
-
-       ASSERT(tp->t_dqinfo);
-       qa = tp->t_dqinfo->dqa_usrdquots;
-       for (j = 0; j < 2; j++) {
-               if (qa[0].qt_dquot == NULL) {
-                       qa = tp->t_dqinfo->dqa_grpdquots;
-                       continue;
-               }
-
-               /*
-                * Lock all of the dquots and join them to the transaction.
-                */
-               xfs_trans_dqlockedjoin(tp, qa);
-
-               for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
-                       qtrx = &qa[i];
-                       /*
-                        * The array of dquots is filled
-                        * sequentially, not sparsely.
-                        */
-                       if ((dqp = qtrx->qt_dquot) == NULL)
-                               break;
-
-                       ASSERT(XFS_DQ_IS_LOCKED(dqp));
-                       ASSERT(dqp->q_transp == tp);
-
-                       /*
-                        * adjust the actual number of blocks used
-                        */
-                       d = &dqp->q_core;
-
-                       /*
-                        * The issue here is - sometimes we don't make a blkquota
-                        * reservation intentionally to be fair to users
-                        * (when the amount is small). On the other hand,
-                        * delayed allocs do make reservations, but that's
-                        * outside of a transaction, so we have no
-                        * idea how much was really reserved.
-                        * So, here we've accumulated delayed allocation blks and
-                        * non-delay blks. The assumption is that the
-                        * delayed ones are always reserved (outside of a
-                        * transaction), and the others may or may not have
-                        * quota reservations.
-                        */
-                       totalbdelta = qtrx->qt_bcount_delta +
-                               qtrx->qt_delbcnt_delta;
-                       totalrtbdelta = qtrx->qt_rtbcount_delta +
-                               qtrx->qt_delrtb_delta;
-#ifdef DEBUG
-                       if (totalbdelta < 0)
-                               ASSERT(be64_to_cpu(d->d_bcount) >=
-                                      -totalbdelta);
-
-                       if (totalrtbdelta < 0)
-                               ASSERT(be64_to_cpu(d->d_rtbcount) >=
-                                      -totalrtbdelta);
-
-                       if (qtrx->qt_icount_delta < 0)
-                               ASSERT(be64_to_cpu(d->d_icount) >=
-                                      -qtrx->qt_icount_delta);
-#endif
-                       if (totalbdelta)
-                               be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
-
-                       if (qtrx->qt_icount_delta)
-                               be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
-
-                       if (totalrtbdelta)
-                               be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
-
-                       /*
-                        * Get any default limits in use.
-                        * Start/reset the timer(s) if needed.
-                        */
-                       if (d->d_id) {
-                               xfs_qm_adjust_dqlimits(tp->t_mountp, d);
-                               xfs_qm_adjust_dqtimers(tp->t_mountp, d);
-                       }
-
-                       dqp->dq_flags |= XFS_DQ_DIRTY;
-                       /*
-                        * add this to the list of items to get logged
-                        */
-                       xfs_trans_log_dquot(tp, dqp);
-                       /*
-                        * Take off what's left of the original reservation.
-                        * In case of delayed allocations, there's no
-                        * reservation that a transaction structure knows of.
-                        */
-                       if (qtrx->qt_blk_res != 0) {
-                               if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
-                                       if (qtrx->qt_blk_res >
-                                           qtrx->qt_blk_res_used)
-                                               dqp->q_res_bcount -= (xfs_qcnt_t)
-                                                       (qtrx->qt_blk_res -
-                                                        qtrx->qt_blk_res_used);
-                                       else
-                                               dqp->q_res_bcount -= (xfs_qcnt_t)
-                                                       (qtrx->qt_blk_res_used -
-                                                        qtrx->qt_blk_res);
-                               }
-                       } else {
-                               /*
-                                * These blks were never reserved, either inside
-                                * a transaction or outside one (in a delayed
-                                * allocation). Also, this isn't always a
-                                * negative number since we sometimes
-                                * deliberately skip quota reservations.
-                                */
-                               if (qtrx->qt_bcount_delta) {
-                                       dqp->q_res_bcount +=
-                                             (xfs_qcnt_t)qtrx->qt_bcount_delta;
-                               }
-                       }
-                       /*
-                        * Adjust the RT reservation.
-                        */
-                       if (qtrx->qt_rtblk_res != 0) {
-                               if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
-                                       if (qtrx->qt_rtblk_res >
-                                           qtrx->qt_rtblk_res_used)
-                                              dqp->q_res_rtbcount -= (xfs_qcnt_t)
-                                                      (qtrx->qt_rtblk_res -
-                                                       qtrx->qt_rtblk_res_used);
-                                       else
-                                              dqp->q_res_rtbcount -= (xfs_qcnt_t)
-                                                      (qtrx->qt_rtblk_res_used -
-                                                       qtrx->qt_rtblk_res);
-                               }
-                       } else {
-                               if (qtrx->qt_rtbcount_delta)
-                                       dqp->q_res_rtbcount +=
-                                           (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
-                       }
-
-                       /*
-                        * Adjust the inode reservation.
-                        */
-                       if (qtrx->qt_ino_res != 0) {
-                               ASSERT(qtrx->qt_ino_res >=
-                                      qtrx->qt_ino_res_used);
-                               if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
-                                       dqp->q_res_icount -= (xfs_qcnt_t)
-                                               (qtrx->qt_ino_res -
-                                                qtrx->qt_ino_res_used);
-                       } else {
-                               if (qtrx->qt_icount_delta)
-                                       dqp->q_res_icount +=
-                                           (xfs_qcnt_t)qtrx->qt_icount_delta;
-                       }
-
-                       ASSERT(dqp->q_res_bcount >=
-                               be64_to_cpu(dqp->q_core.d_bcount));
-                       ASSERT(dqp->q_res_icount >=
-                               be64_to_cpu(dqp->q_core.d_icount));
-                       ASSERT(dqp->q_res_rtbcount >=
-                               be64_to_cpu(dqp->q_core.d_rtbcount));
-               }
-               /*
-                * Do the group quotas next
-                */
-               qa = tp->t_dqinfo->dqa_grpdquots;
-       }
-}
-
-/*
- * Release the reservations, and adjust the dquots accordingly.
- * This is called only when the transaction is being aborted. If by
- * any chance we have done dquot modifications incore (ie. deltas) already,
- * we simply throw those away, since that's the expected behavior
- * when a transaction is curtailed without a commit.
- */
-void
-xfs_trans_unreserve_and_mod_dquots(
-       xfs_trans_t             *tp)
-{
-       int                     i, j;
-       xfs_dquot_t             *dqp;
-       xfs_dqtrx_t             *qtrx, *qa;
-       boolean_t               locked;
-
-       if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
-               return;
-
-       qa = tp->t_dqinfo->dqa_usrdquots;
-
-       for (j = 0; j < 2; j++) {
-               for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
-                       qtrx = &qa[i];
-                       /*
-                        * We assume that the array of dquots is filled
-                        * sequentially, not sparsely.
-                        */
-                       if ((dqp = qtrx->qt_dquot) == NULL)
-                               break;
-                       /*
-                        * Unreserve the original reservation. We don't care
-                        * about the number of blocks used field, or deltas.
-                        * Also we don't bother to zero the fields.
-                        */
-                       locked = B_FALSE;
-                       if (qtrx->qt_blk_res) {
-                               xfs_dqlock(dqp);
-                               locked = B_TRUE;
-                               dqp->q_res_bcount -=
-                                       (xfs_qcnt_t)qtrx->qt_blk_res;
-                       }
-                       if (qtrx->qt_ino_res) {
-                               if (!locked) {
-                                       xfs_dqlock(dqp);
-                                       locked = B_TRUE;
-                               }
-                               dqp->q_res_icount -=
-                                       (xfs_qcnt_t)qtrx->qt_ino_res;
-                       }
-
-                       if (qtrx->qt_rtblk_res) {
-                               if (!locked) {
-                                       xfs_dqlock(dqp);
-                                       locked = B_TRUE;
-                               }
-                               dqp->q_res_rtbcount -=
-                                       (xfs_qcnt_t)qtrx->qt_rtblk_res;
-                       }
-                       if (locked)
-                               xfs_dqunlock(dqp);
-
-               }
-               qa = tp->t_dqinfo->dqa_grpdquots;
-       }
-}
-
-STATIC void
-xfs_quota_warn(
-       struct xfs_mount        *mp,
-       struct xfs_dquot        *dqp,
-       int                     type)
-{
-       /* no warnings for project quotas - we just return ENOSPC later */
-       if (dqp->dq_flags & XFS_DQ_PROJ)
-               return;
-       quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
-                          be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
-                          type);
-}
-
-/*
- * This reserves disk blocks and inodes against a dquot.
- * Flags indicate if the dquot is to be locked here and also
- * if the blk reservation is for RT or regular blocks.
- * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
- */
-STATIC int
-xfs_trans_dqresv(
-       xfs_trans_t     *tp,
-       xfs_mount_t     *mp,
-       xfs_dquot_t     *dqp,
-       long            nblks,
-       long            ninos,
-       uint            flags)
-{
-       xfs_qcnt_t      hardlimit;
-       xfs_qcnt_t      softlimit;
-       time_t          timer;
-       xfs_qwarncnt_t  warns;
-       xfs_qwarncnt_t  warnlimit;
-       xfs_qcnt_t      count;
-       xfs_qcnt_t      *resbcountp;
-       xfs_quotainfo_t *q = mp->m_quotainfo;
-
-
-       xfs_dqlock(dqp);
-
-       if (flags & XFS_TRANS_DQ_RES_BLKS) {
-               hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
-               if (!hardlimit)
-                       hardlimit = q->qi_bhardlimit;
-               softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
-               if (!softlimit)
-                       softlimit = q->qi_bsoftlimit;
-               timer = be32_to_cpu(dqp->q_core.d_btimer);
-               warns = be16_to_cpu(dqp->q_core.d_bwarns);
-               warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
-               resbcountp = &dqp->q_res_bcount;
-       } else {
-               ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
-               hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
-               if (!hardlimit)
-                       hardlimit = q->qi_rtbhardlimit;
-               softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
-               if (!softlimit)
-                       softlimit = q->qi_rtbsoftlimit;
-               timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
-               warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
-               warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
-               resbcountp = &dqp->q_res_rtbcount;
-       }
-
-       if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
-           dqp->q_core.d_id &&
-           ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
-            (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
-             (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
-               if (nblks > 0) {
-                       /*
-                        * dquot is locked already. See if we'd go over the
-                        * hardlimit or exceed the timelimit if we allocate
-                        * nblks.
-                        */
-                       if (hardlimit > 0ULL &&
-                           hardlimit <= nblks + *resbcountp) {
-                               xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
-                               goto error_return;
-                       }
-                       if (softlimit > 0ULL &&
-                           softlimit <= nblks + *resbcountp) {
-                               if ((timer != 0 && get_seconds() > timer) ||
-                                   (warns != 0 && warns >= warnlimit)) {
-                                       xfs_quota_warn(mp, dqp,
-                                                      QUOTA_NL_BSOFTLONGWARN);
-                                       goto error_return;
-                               }
-
-                               xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
-                       }
-               }
-               if (ninos > 0) {
-                       count = be64_to_cpu(dqp->q_core.d_icount);
-                       timer = be32_to_cpu(dqp->q_core.d_itimer);
-                       warns = be16_to_cpu(dqp->q_core.d_iwarns);
-                       warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
-                       hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
-                       if (!hardlimit)
-                               hardlimit = q->qi_ihardlimit;
-                       softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
-                       if (!softlimit)
-                               softlimit = q->qi_isoftlimit;
-
-                       if (hardlimit > 0ULL && count >= hardlimit) {
-                               xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
-                               goto error_return;
-                       }
-                       if (softlimit > 0ULL && count >= softlimit) {
-                               if  ((timer != 0 && get_seconds() > timer) ||
-                                    (warns != 0 && warns >= warnlimit)) {
-                                       xfs_quota_warn(mp, dqp,
-                                                      QUOTA_NL_ISOFTLONGWARN);
-                                       goto error_return;
-                               }
-                               xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
-                       }
-               }
-       }
-
-       /*
-        * Change the reservation, but not the actual usage.
-        * Note that q_res_bcount = q_core.d_bcount + resv
-        */
-       (*resbcountp) += (xfs_qcnt_t)nblks;
-       if (ninos != 0)
-               dqp->q_res_icount += (xfs_qcnt_t)ninos;
-
-       /*
-        * note the reservation amt in the trans struct too,
-        * so that the transaction knows how much was reserved by
-        * it against this particular dquot.
-        * We don't do this when we are reserving for a delayed allocation,
-        * because we don't have the luxury of a transaction envelope then.
-        */
-       if (tp) {
-               ASSERT(tp->t_dqinfo);
-               ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
-               if (nblks != 0)
-                       xfs_trans_mod_dquot(tp, dqp,
-                                           flags & XFS_QMOPT_RESBLK_MASK,
-                                           nblks);
-               if (ninos != 0)
-                       xfs_trans_mod_dquot(tp, dqp,
-                                           XFS_TRANS_DQ_RES_INOS,
-                                           ninos);
-       }
-       ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
-       ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
-       ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
-
-       xfs_dqunlock(dqp);
-       return 0;
-
-error_return:
-       xfs_dqunlock(dqp);
-       if (flags & XFS_QMOPT_ENOSPC)
-               return ENOSPC;
-       return EDQUOT;
-}
-
-
-/*
- * Given dquot(s), make disk block and/or inode reservations against them.
- * The fact that this does the reservation against both the usr and
- * grp/prj quotas is important, because this follows a both-or-nothing
- * approach.
- *
- * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
- *        XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT.  Used by pquota.
- *        XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
- *        XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
- * dquots are unlocked on return, if they were not locked by caller.
- */
-int
-xfs_trans_reserve_quota_bydquots(
-       xfs_trans_t     *tp,
-       xfs_mount_t     *mp,
-       xfs_dquot_t     *udqp,
-       xfs_dquot_t     *gdqp,
-       long            nblks,
-       long            ninos,
-       uint            flags)
-{
-       int             resvd = 0, error;
-
-       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
-               return 0;
-
-       if (tp && tp->t_dqinfo == NULL)
-               xfs_trans_alloc_dqinfo(tp);
-
-       ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
-
-       if (udqp) {
-               error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
-                                       (flags & ~XFS_QMOPT_ENOSPC));
-               if (error)
-                       return error;
-               resvd = 1;
-       }
-
-       if (gdqp) {
-               error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
-               if (error) {
-                       /*
-                        * can't do it, so backout previous reservation
-                        */
-                       if (resvd) {
-                               flags |= XFS_QMOPT_FORCE_RES;
-                               xfs_trans_dqresv(tp, mp, udqp,
-                                                -nblks, -ninos, flags);
-                       }
-                       return error;
-               }
-       }
-
-       /*
-        * Didn't change anything critical, so, no need to log
-        */
-       return 0;
-}
-
-
-/*
- * Lock the dquot and change the reservation if we can.
- * This doesn't change the actual usage, just the reservation.
- * The inode sent in is locked.
- */
-int
-xfs_trans_reserve_quota_nblks(
-       struct xfs_trans        *tp,
-       struct xfs_inode        *ip,
-       long                    nblks,
-       long                    ninos,
-       uint                    flags)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-
-       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
-               return 0;
-       if (XFS_IS_PQUOTA_ON(mp))
-               flags |= XFS_QMOPT_ENOSPC;
-
-       ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
-       ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
-
-       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-       ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
-                               XFS_TRANS_DQ_RES_RTBLKS ||
-              (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
-                               XFS_TRANS_DQ_RES_BLKS);
-
-       /*
-        * Reserve nblks against these dquots, with trans as the mediator.
-        */
-       return xfs_trans_reserve_quota_bydquots(tp, mp,
-                                               ip->i_udquot, ip->i_gdquot,
-                                               nblks, ninos, flags);
-}
-
-/*
- * This routine is called to allocate a quotaoff log item.
- */
-xfs_qoff_logitem_t *
-xfs_trans_get_qoff_item(
-       xfs_trans_t             *tp,
-       xfs_qoff_logitem_t      *startqoff,
-       uint                    flags)
-{
-       xfs_qoff_logitem_t      *q;
-
-       ASSERT(tp != NULL);
-
-       q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
-       ASSERT(q != NULL);
-
-       /*
-        * Get a log_item_desc to point at the new item.
-        */
-       xfs_trans_add_item(tp, &q->qql_item);
-       return q;
-}
-
-
-/*
- * This is called to mark the quotaoff logitem as needing
- * to be logged when the transaction is committed.  The logitem must
- * already be associated with the given transaction.
- */
-void
-xfs_trans_log_quotaoff_item(
-       xfs_trans_t             *tp,
-       xfs_qoff_logitem_t      *qlp)
-{
-       tp->t_flags |= XFS_TRANS_DIRTY;
-       qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
-}
-
-STATIC void
-xfs_trans_alloc_dqinfo(
-       xfs_trans_t     *tp)
-{
-       tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
-}
-
-void
-xfs_trans_free_dqinfo(
-       xfs_trans_t     *tp)
-{
-       if (!tp->t_dqinfo)
-               return;
-       kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo);
-       tp->t_dqinfo = NULL;
-}
diff --git a/fs/xfs/support/uuid.c b/fs/xfs/support/uuid.c
deleted file mode 100644 (file)
index b83f76b..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include <xfs.h>
-
-/* IRIX interpretation of an uuid_t */
-typedef struct {
-       __be32  uu_timelow;
-       __be16  uu_timemid;
-       __be16  uu_timehi;
-       __be16  uu_clockseq;
-       __be16  uu_node[3];
-} xfs_uu_t;
-
-/*
- * uuid_getnodeuniq - obtain the node unique fields of a UUID.
- *
- * This is not in any way a standard or condoned UUID function;
- * it just something that's needed for user-level file handles.
- */
-void
-uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
-{
-       xfs_uu_t *uup = (xfs_uu_t *)uuid;
-
-       fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) |
-                  be16_to_cpu(uup->uu_timemid);
-       fsid[1] = be32_to_cpu(uup->uu_timelow);
-}
-
-int
-uuid_is_nil(uuid_t *uuid)
-{
-       int     i;
-       char    *cp = (char *)uuid;
-
-       if (uuid == NULL)
-               return 0;
-       /* implied check of version number here... */
-       for (i = 0; i < sizeof *uuid; i++)
-               if (*cp++) return 0;    /* not nil */
-       return 1;       /* is nil */
-}
-
-int
-uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
-{
-       return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
-}
diff --git a/fs/xfs/support/uuid.h b/fs/xfs/support/uuid.h
deleted file mode 100644 (file)
index 4732d71..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef __XFS_SUPPORT_UUID_H__
-#define __XFS_SUPPORT_UUID_H__
-
-typedef struct {
-       unsigned char   __u_bits[16];
-} uuid_t;
-
-extern int uuid_is_nil(uuid_t *uuid);
-extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
-extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
-
-#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/time.h b/fs/xfs/time.h
new file mode 100644 (file)
index 0000000..387e695
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPPORT_TIME_H__
+#define __XFS_SUPPORT_TIME_H__
+
+#include <linux/sched.h>
+#include <linux/time.h>
+
+typedef struct timespec timespec_t;
+
+static inline void delay(long ticks)
+{
+       schedule_timeout_uninterruptible(ticks);
+}
+
+static inline void nanotime(struct timespec *tvp)
+{
+       *tvp = CURRENT_TIME;
+}
+
+#endif /* __XFS_SUPPORT_TIME_H__ */
diff --git a/fs/xfs/uuid.c b/fs/xfs/uuid.c
new file mode 100644 (file)
index 0000000..b83f76b
--- /dev/null
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <xfs.h>
+
+/* IRIX interpretation of an uuid_t */
+typedef struct {
+       __be32  uu_timelow;
+       __be16  uu_timemid;
+       __be16  uu_timehi;
+       __be16  uu_clockseq;
+       __be16  uu_node[3];
+} xfs_uu_t;
+
+/*
+ * uuid_getnodeuniq - obtain the node unique fields of a UUID.
+ *
+ * This is not in any way a standard or condoned UUID function;
+ * it just something that's needed for user-level file handles.
+ */
+void
+uuid_getnodeuniq(uuid_t *uuid, int fsid [2])
+{
+       xfs_uu_t *uup = (xfs_uu_t *)uuid;
+
+       fsid[0] = (be16_to_cpu(uup->uu_clockseq) << 16) |
+                  be16_to_cpu(uup->uu_timemid);
+       fsid[1] = be32_to_cpu(uup->uu_timelow);
+}
+
+int
+uuid_is_nil(uuid_t *uuid)
+{
+       int     i;
+       char    *cp = (char *)uuid;
+
+       if (uuid == NULL)
+               return 0;
+       /* implied check of version number here... */
+       for (i = 0; i < sizeof *uuid; i++)
+               if (*cp++) return 0;    /* not nil */
+       return 1;       /* is nil */
+}
+
+int
+uuid_equal(uuid_t *uuid1, uuid_t *uuid2)
+{
+       return memcmp(uuid1, uuid2, sizeof(uuid_t)) ? 0 : 1;
+}
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
new file mode 100644 (file)
index 0000000..4732d71
--- /dev/null
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPPORT_UUID_H__
+#define __XFS_SUPPORT_UUID_H__
+
+typedef struct {
+       unsigned char   __u_bits[16];
+} uuid_t;
+
+extern int uuid_is_nil(uuid_t *uuid);
+extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
+extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
+
+#endif /* __XFS_SUPPORT_UUID_H__ */
index 53ec3ea..d8b11b7 100644 (file)
@@ -24,5 +24,6 @@
 #define XFS_BUF_LOCK_TRACKING 1
 #endif
 
-#include <linux-2.6/xfs_linux.h>
+#include "xfs_linux.h"
+
 #endif /* __XFS_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
new file mode 100644 (file)
index 0000000..b6c4b37
--- /dev/null
@@ -0,0 +1,420 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_acl.h"
+#include "xfs_attr.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+#include <linux/slab.h>
+#include <linux/xattr.h>
+#include <linux/posix_acl_xattr.h>
+
+
+/*
+ * Locking scheme:
+ *  - all ACL updates are protected by inode->i_mutex, which is taken before
+ *    calling into this file.
+ */
+
+STATIC struct posix_acl *
+xfs_acl_from_disk(struct xfs_acl *aclp)
+{
+       struct posix_acl_entry *acl_e;
+       struct posix_acl *acl;
+       struct xfs_acl_entry *ace;
+       int count, i;
+
+       count = be32_to_cpu(aclp->acl_cnt);
+
+       acl = posix_acl_alloc(count, GFP_KERNEL);
+       if (!acl)
+               return ERR_PTR(-ENOMEM);
+
+       for (i = 0; i < count; i++) {
+               acl_e = &acl->a_entries[i];
+               ace = &aclp->acl_entry[i];
+
+               /*
+                * The tag is 32 bits on disk and 16 bits in core.
+                *
+                * Because every access to it goes through the core
+                * format first this is not a problem.
+                */
+               acl_e->e_tag = be32_to_cpu(ace->ae_tag);
+               acl_e->e_perm = be16_to_cpu(ace->ae_perm);
+
+               switch (acl_e->e_tag) {
+               case ACL_USER:
+               case ACL_GROUP:
+                       acl_e->e_id = be32_to_cpu(ace->ae_id);
+                       break;
+               case ACL_USER_OBJ:
+               case ACL_GROUP_OBJ:
+               case ACL_MASK:
+               case ACL_OTHER:
+                       acl_e->e_id = ACL_UNDEFINED_ID;
+                       break;
+               default:
+                       goto fail;
+               }
+       }
+       return acl;
+
+fail:
+       posix_acl_release(acl);
+       return ERR_PTR(-EINVAL);
+}
+
+STATIC void
+xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
+{
+       const struct posix_acl_entry *acl_e;
+       struct xfs_acl_entry *ace;
+       int i;
+
+       aclp->acl_cnt = cpu_to_be32(acl->a_count);
+       for (i = 0; i < acl->a_count; i++) {
+               ace = &aclp->acl_entry[i];
+               acl_e = &acl->a_entries[i];
+
+               ace->ae_tag = cpu_to_be32(acl_e->e_tag);
+               ace->ae_id = cpu_to_be32(acl_e->e_id);
+               ace->ae_perm = cpu_to_be16(acl_e->e_perm);
+       }
+}
+
+struct posix_acl *
+xfs_get_acl(struct inode *inode, int type)
+{
+       struct xfs_inode *ip = XFS_I(inode);
+       struct posix_acl *acl;
+       struct xfs_acl *xfs_acl;
+       int len = sizeof(struct xfs_acl);
+       unsigned char *ea_name;
+       int error;
+
+       acl = get_cached_acl(inode, type);
+       if (acl != ACL_NOT_CACHED)
+               return acl;
+
+       trace_xfs_get_acl(ip);
+
+       switch (type) {
+       case ACL_TYPE_ACCESS:
+               ea_name = SGI_ACL_FILE;
+               break;
+       case ACL_TYPE_DEFAULT:
+               ea_name = SGI_ACL_DEFAULT;
+               break;
+       default:
+               BUG();
+       }
+
+       /*
+        * If we have a cached ACLs value just return it, not need to
+        * go out to the disk.
+        */
+
+       xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+       if (!xfs_acl)
+               return ERR_PTR(-ENOMEM);
+
+       error = -xfs_attr_get(ip, ea_name, (unsigned char *)xfs_acl,
+                                                       &len, ATTR_ROOT);
+       if (error) {
+               /*
+                * If the attribute doesn't exist make sure we have a negative
+                * cache entry, for any other error assume it is transient and
+                * leave the cache entry as ACL_NOT_CACHED.
+                */
+               if (error == -ENOATTR) {
+                       acl = NULL;
+                       goto out_update_cache;
+               }
+               goto out;
+       }
+
+       acl = xfs_acl_from_disk(xfs_acl);
+       if (IS_ERR(acl))
+               goto out;
+
+ out_update_cache:
+       set_cached_acl(inode, type, acl);
+ out:
+       kfree(xfs_acl);
+       return acl;
+}
+
+STATIC int
+xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
+{
+       struct xfs_inode *ip = XFS_I(inode);
+       unsigned char *ea_name;
+       int error;
+
+       if (S_ISLNK(inode->i_mode))
+               return -EOPNOTSUPP;
+
+       switch (type) {
+       case ACL_TYPE_ACCESS:
+               ea_name = SGI_ACL_FILE;
+               break;
+       case ACL_TYPE_DEFAULT:
+               if (!S_ISDIR(inode->i_mode))
+                       return acl ? -EACCES : 0;
+               ea_name = SGI_ACL_DEFAULT;
+               break;
+       default:
+               return -EINVAL;
+       }
+
+       if (acl) {
+               struct xfs_acl *xfs_acl;
+               int len;
+
+               xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL);
+               if (!xfs_acl)
+                       return -ENOMEM;
+
+               xfs_acl_to_disk(xfs_acl, acl);
+               len = sizeof(struct xfs_acl) -
+                       (sizeof(struct xfs_acl_entry) *
+                        (XFS_ACL_MAX_ENTRIES - acl->a_count));
+
+               error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
+                               len, ATTR_ROOT);
+
+               kfree(xfs_acl);
+       } else {
+               /*
+                * A NULL ACL argument means we want to remove the ACL.
+                */
+               error = -xfs_attr_remove(ip, ea_name, ATTR_ROOT);
+
+               /*
+                * If the attribute didn't exist to start with that's fine.
+                */
+               if (error == -ENOATTR)
+                       error = 0;
+       }
+
+       if (!error)
+               set_cached_acl(inode, type, acl);
+       return error;
+}
+
+static int
+xfs_set_mode(struct inode *inode, umode_t mode)
+{
+       int error = 0;
+
+       if (mode != inode->i_mode) {
+               struct iattr iattr;
+
+               iattr.ia_valid = ATTR_MODE | ATTR_CTIME;
+               iattr.ia_mode = mode;
+               iattr.ia_ctime = current_fs_time(inode->i_sb);
+
+               error = -xfs_setattr_nonsize(XFS_I(inode), &iattr, XFS_ATTR_NOACL);
+       }
+
+       return error;
+}
+
+static int
+xfs_acl_exists(struct inode *inode, unsigned char *name)
+{
+       int len = sizeof(struct xfs_acl);
+
+       return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
+                           ATTR_ROOT|ATTR_KERNOVAL) == 0);
+}
+
+int
+posix_acl_access_exists(struct inode *inode)
+{
+       return xfs_acl_exists(inode, SGI_ACL_FILE);
+}
+
+int
+posix_acl_default_exists(struct inode *inode)
+{
+       if (!S_ISDIR(inode->i_mode))
+               return 0;
+       return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
+}
+
+/*
+ * No need for i_mutex because the inode is not yet exposed to the VFS.
+ */
+int
+xfs_inherit_acl(struct inode *inode, struct posix_acl *acl)
+{
+       umode_t mode = inode->i_mode;
+       int error = 0, inherit = 0;
+
+       if (S_ISDIR(inode->i_mode)) {
+               error = xfs_set_acl(inode, ACL_TYPE_DEFAULT, acl);
+               if (error)
+                       goto out;
+       }
+
+       error = posix_acl_create(&acl, GFP_KERNEL, &mode);
+       if (error < 0)
+               return error;
+
+       /*
+        * If posix_acl_create returns a positive value we need to
+        * inherit a permission that can't be represented using the Unix
+        * mode bits and we actually need to set an ACL.
+        */
+       if (error > 0)
+               inherit = 1;
+
+       error = xfs_set_mode(inode, mode);
+       if (error)
+               goto out;
+
+       if (inherit)
+               error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+
+out:
+       posix_acl_release(acl);
+       return error;
+}
+
+int
+xfs_acl_chmod(struct inode *inode)
+{
+       struct posix_acl *acl;
+       int error;
+
+       if (S_ISLNK(inode->i_mode))
+               return -EOPNOTSUPP;
+
+       acl = xfs_get_acl(inode, ACL_TYPE_ACCESS);
+       if (IS_ERR(acl) || !acl)
+               return PTR_ERR(acl);
+
+       error = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+       if (error)
+               return error;
+
+       error = xfs_set_acl(inode, ACL_TYPE_ACCESS, acl);
+       posix_acl_release(acl);
+       return error;
+}
+
+static int
+xfs_xattr_acl_get(struct dentry *dentry, const char *name,
+               void *value, size_t size, int type)
+{
+       struct posix_acl *acl;
+       int error;
+
+       acl = xfs_get_acl(dentry->d_inode, type);
+       if (IS_ERR(acl))
+               return PTR_ERR(acl);
+       if (acl == NULL)
+               return -ENODATA;
+
+       error = posix_acl_to_xattr(acl, value, size);
+       posix_acl_release(acl);
+
+       return error;
+}
+
+static int
+xfs_xattr_acl_set(struct dentry *dentry, const char *name,
+               const void *value, size_t size, int flags, int type)
+{
+       struct inode *inode = dentry->d_inode;
+       struct posix_acl *acl = NULL;
+       int error = 0;
+
+       if (flags & XATTR_CREATE)
+               return -EINVAL;
+       if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
+               return value ? -EACCES : 0;
+       if ((current_fsuid() != inode->i_uid) && !capable(CAP_FOWNER))
+               return -EPERM;
+
+       if (!value)
+               goto set_acl;
+
+       acl = posix_acl_from_xattr(value, size);
+       if (!acl) {
+               /*
+                * acl_set_file(3) may request that we set default ACLs with
+                * zero length -- defend (gracefully) against that here.
+                */
+               goto out;
+       }
+       if (IS_ERR(acl)) {
+               error = PTR_ERR(acl);
+               goto out;
+       }
+
+       error = posix_acl_valid(acl);
+       if (error)
+               goto out_release;
+
+       error = -EINVAL;
+       if (acl->a_count > XFS_ACL_MAX_ENTRIES)
+               goto out_release;
+
+       if (type == ACL_TYPE_ACCESS) {
+               umode_t mode = inode->i_mode;
+               error = posix_acl_equiv_mode(acl, &mode);
+
+               if (error <= 0) {
+                       posix_acl_release(acl);
+                       acl = NULL;
+
+                       if (error < 0)
+                               return error;
+               }
+
+               error = xfs_set_mode(inode, mode);
+               if (error)
+                       goto out_release;
+       }
+
+ set_acl:
+       error = xfs_set_acl(inode, type, acl);
+ out_release:
+       posix_acl_release(acl);
+ out:
+       return error;
+}
+
+const struct xattr_handler xfs_xattr_acl_access_handler = {
+       .prefix = POSIX_ACL_XATTR_ACCESS,
+       .flags  = ACL_TYPE_ACCESS,
+       .get    = xfs_xattr_acl_get,
+       .set    = xfs_xattr_acl_set,
+};
+
+const struct xattr_handler xfs_xattr_acl_default_handler = {
+       .prefix = POSIX_ACL_XATTR_DEFAULT,
+       .flags  = ACL_TYPE_DEFAULT,
+       .get    = xfs_xattr_acl_get,
+       .set    = xfs_xattr_acl_set,
+};
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
new file mode 100644 (file)
index 0000000..63e971e
--- /dev/null
@@ -0,0 +1,1499 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_trans.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_rw.h"
+#include "xfs_iomap.h"
+#include "xfs_vnodeops.h"
+#include "xfs_trace.h"
+#include "xfs_bmap.h"
+#include <linux/gfp.h>
+#include <linux/mpage.h>
+#include <linux/pagevec.h>
+#include <linux/writeback.h>
+
+
+/*
+ * Prime number of hash buckets since address is used as the key.
+ */
+#define NVSYNC         37
+#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
+static wait_queue_head_t xfs_ioend_wq[NVSYNC];
+
+void __init
+xfs_ioend_init(void)
+{
+       int i;
+
+       for (i = 0; i < NVSYNC; i++)
+               init_waitqueue_head(&xfs_ioend_wq[i]);
+}
+
+void
+xfs_ioend_wait(
+       xfs_inode_t     *ip)
+{
+       wait_queue_head_t *wq = to_ioend_wq(ip);
+
+       wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
+}
+
+STATIC void
+xfs_ioend_wake(
+       xfs_inode_t     *ip)
+{
+       if (atomic_dec_and_test(&ip->i_iocount))
+               wake_up(to_ioend_wq(ip));
+}
+
+void
+xfs_count_page_state(
+       struct page             *page,
+       int                     *delalloc,
+       int                     *unwritten)
+{
+       struct buffer_head      *bh, *head;
+
+       *delalloc = *unwritten = 0;
+
+       bh = head = page_buffers(page);
+       do {
+               if (buffer_unwritten(bh))
+                       (*unwritten) = 1;
+               else if (buffer_delay(bh))
+                       (*delalloc) = 1;
+       } while ((bh = bh->b_this_page) != head);
+}
+
+STATIC struct block_device *
+xfs_find_bdev_for_inode(
+       struct inode            *inode)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+
+       if (XFS_IS_REALTIME_INODE(ip))
+               return mp->m_rtdev_targp->bt_bdev;
+       else
+               return mp->m_ddev_targp->bt_bdev;
+}
+
+/*
+ * We're now finished for good with this ioend structure.
+ * Update the page state via the associated buffer_heads,
+ * release holds on the inode and bio, and finally free
+ * up memory.  Do not use the ioend after this.
+ */
+STATIC void
+xfs_destroy_ioend(
+       xfs_ioend_t             *ioend)
+{
+       struct buffer_head      *bh, *next;
+       struct xfs_inode        *ip = XFS_I(ioend->io_inode);
+
+       for (bh = ioend->io_buffer_head; bh; bh = next) {
+               next = bh->b_private;
+               bh->b_end_io(bh, !ioend->io_error);
+       }
+
+       /*
+        * Volume managers supporting multiple paths can send back ENODEV
+        * when the final path disappears.  In this case continuing to fill
+        * the page cache with dirty data which cannot be written out is
+        * evil, so prevent that.
+        */
+       if (unlikely(ioend->io_error == -ENODEV)) {
+               xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
+                                     __FILE__, __LINE__);
+       }
+
+       xfs_ioend_wake(ip);
+       mempool_free(ioend, xfs_ioend_pool);
+}
+
+/*
+ * If the end of the current ioend is beyond the current EOF,
+ * return the new EOF value, otherwise zero.
+ */
+STATIC xfs_fsize_t
+xfs_ioend_new_eof(
+       xfs_ioend_t             *ioend)
+{
+       xfs_inode_t             *ip = XFS_I(ioend->io_inode);
+       xfs_fsize_t             isize;
+       xfs_fsize_t             bsize;
+
+       bsize = ioend->io_offset + ioend->io_size;
+       isize = MAX(ip->i_size, ip->i_new_size);
+       isize = MIN(isize, bsize);
+       return isize > ip->i_d.di_size ? isize : 0;
+}
+
+/*
+ * Update on-disk file size now that data has been written to disk.  The
+ * current in-memory file size is i_size.  If a write is beyond eof i_new_size
+ * will be the intended file size until i_size is updated.  If this write does
+ * not extend all the way to the valid file size then restrict this update to
+ * the end of the write.
+ *
+ * This function does not block as blocking on the inode lock in IO completion
+ * can lead to IO completion order dependency deadlocks.. If it can't get the
+ * inode ilock it will return EAGAIN. Callers must handle this.
+ */
+STATIC int
+xfs_setfilesize(
+       xfs_ioend_t             *ioend)
+{
+       xfs_inode_t             *ip = XFS_I(ioend->io_inode);
+       xfs_fsize_t             isize;
+
+       if (unlikely(ioend->io_error))
+               return 0;
+
+       if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+               return EAGAIN;
+
+       isize = xfs_ioend_new_eof(ioend);
+       if (isize) {
+               trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+               ip->i_d.di_size = isize;
+               xfs_mark_inode_dirty(ip);
+       }
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return 0;
+}
+
+/*
+ * Schedule IO completion handling on the final put of an ioend.
+ */
+STATIC void
+xfs_finish_ioend(
+       struct xfs_ioend        *ioend)
+{
+       if (atomic_dec_and_test(&ioend->io_remaining)) {
+               if (ioend->io_type == IO_UNWRITTEN)
+                       queue_work(xfsconvertd_workqueue, &ioend->io_work);
+               else
+                       queue_work(xfsdatad_workqueue, &ioend->io_work);
+       }
+}
+
+/*
+ * IO write completion.
+ */
+STATIC void
+xfs_end_io(
+       struct work_struct *work)
+{
+       xfs_ioend_t     *ioend = container_of(work, xfs_ioend_t, io_work);
+       struct xfs_inode *ip = XFS_I(ioend->io_inode);
+       int             error = 0;
+
+       /*
+        * For unwritten extents we need to issue transactions to convert a
+        * range to normal written extens after the data I/O has finished.
+        */
+       if (ioend->io_type == IO_UNWRITTEN &&
+           likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
+
+               error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+                                                ioend->io_size);
+               if (error)
+                       ioend->io_error = error;
+       }
+
+       /*
+        * We might have to update the on-disk file size after extending
+        * writes.
+        */
+       error = xfs_setfilesize(ioend);
+       ASSERT(!error || error == EAGAIN);
+
+       /*
+        * If we didn't complete processing of the ioend, requeue it to the
+        * tail of the workqueue for another attempt later. Otherwise destroy
+        * it.
+        */
+       if (error == EAGAIN) {
+               atomic_inc(&ioend->io_remaining);
+               xfs_finish_ioend(ioend);
+               /* ensure we don't spin on blocked ioends */
+               delay(1);
+       } else {
+               if (ioend->io_iocb)
+                       aio_complete(ioend->io_iocb, ioend->io_result, 0);
+               xfs_destroy_ioend(ioend);
+       }
+}
+
+/*
+ * Call IO completion handling in caller context on the final put of an ioend.
+ */
+STATIC void
+xfs_finish_ioend_sync(
+       struct xfs_ioend        *ioend)
+{
+       if (atomic_dec_and_test(&ioend->io_remaining))
+               xfs_end_io(&ioend->io_work);
+}
+
+/*
+ * Allocate and initialise an IO completion structure.
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
+ * (vs. incore size).
+ */
+STATIC xfs_ioend_t *
+xfs_alloc_ioend(
+       struct inode            *inode,
+       unsigned int            type)
+{
+       xfs_ioend_t             *ioend;
+
+       ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
+
+       /*
+        * Set the count to 1 initially, which will prevent an I/O
+        * completion callback from happening before we have started
+        * all the I/O from calling the completion routine too early.
+        */
+       atomic_set(&ioend->io_remaining, 1);
+       ioend->io_error = 0;
+       ioend->io_list = NULL;
+       ioend->io_type = type;
+       ioend->io_inode = inode;
+       ioend->io_buffer_head = NULL;
+       ioend->io_buffer_tail = NULL;
+       atomic_inc(&XFS_I(ioend->io_inode)->i_iocount);
+       ioend->io_offset = 0;
+       ioend->io_size = 0;
+       ioend->io_iocb = NULL;
+       ioend->io_result = 0;
+
+       INIT_WORK(&ioend->io_work, xfs_end_io);
+       return ioend;
+}
+
+STATIC int
+xfs_map_blocks(
+       struct inode            *inode,
+       loff_t                  offset,
+       struct xfs_bmbt_irec    *imap,
+       int                     type,
+       int                     nonblocking)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 count = 1 << inode->i_blkbits;
+       xfs_fileoff_t           offset_fsb, end_fsb;
+       int                     error = 0;
+       int                     bmapi_flags = XFS_BMAPI_ENTIRE;
+       int                     nimaps = 1;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -XFS_ERROR(EIO);
+
+       if (type == IO_UNWRITTEN)
+               bmapi_flags |= XFS_BMAPI_IGSTATE;
+
+       if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
+               if (nonblocking)
+                       return -XFS_ERROR(EAGAIN);
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+       }
+
+       ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
+              (ip->i_df.if_flags & XFS_IFEXTENTS));
+       ASSERT(offset <= mp->m_maxioffset);
+
+       if (offset + count > mp->m_maxioffset)
+               count = mp->m_maxioffset - offset;
+       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+       error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+                         bmapi_flags,  NULL, 0, imap, &nimaps, NULL);
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+       if (error)
+               return -XFS_ERROR(error);
+
+       if (type == IO_DELALLOC &&
+           (!nimaps || isnullstartblock(imap->br_startblock))) {
+               error = xfs_iomap_write_allocate(ip, offset, count, imap);
+               if (!error)
+                       trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
+               return -XFS_ERROR(error);
+       }
+
+#ifdef DEBUG
+       if (type == IO_UNWRITTEN) {
+               ASSERT(nimaps);
+               ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+               ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+       }
+#endif
+       if (nimaps)
+               trace_xfs_map_blocks_found(ip, offset, count, type, imap);
+       return 0;
+}
+
+STATIC int
+xfs_imap_valid(
+       struct inode            *inode,
+       struct xfs_bmbt_irec    *imap,
+       xfs_off_t               offset)
+{
+       offset >>= inode->i_blkbits;
+
+       return offset >= imap->br_startoff &&
+               offset < imap->br_startoff + imap->br_blockcount;
+}
+
+/*
+ * BIO completion handler for buffered IO.
+ */
+STATIC void
+xfs_end_bio(
+       struct bio              *bio,
+       int                     error)
+{
+       xfs_ioend_t             *ioend = bio->bi_private;
+
+       ASSERT(atomic_read(&bio->bi_cnt) >= 1);
+       ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error;
+
+       /* Toss bio and pass work off to an xfsdatad thread */
+       bio->bi_private = NULL;
+       bio->bi_end_io = NULL;
+       bio_put(bio);
+
+       xfs_finish_ioend(ioend);
+}
+
+STATIC void
+xfs_submit_ioend_bio(
+       struct writeback_control *wbc,
+       xfs_ioend_t             *ioend,
+       struct bio              *bio)
+{
+       atomic_inc(&ioend->io_remaining);
+       bio->bi_private = ioend;
+       bio->bi_end_io = xfs_end_bio;
+
+       /*
+        * If the I/O is beyond EOF we mark the inode dirty immediately
+        * but don't update the inode size until I/O completion.
+        */
+       if (xfs_ioend_new_eof(ioend))
+               xfs_mark_inode_dirty(XFS_I(ioend->io_inode));
+
+       submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio);
+}
+
+STATIC struct bio *
+xfs_alloc_ioend_bio(
+       struct buffer_head      *bh)
+{
+       int                     nvecs = bio_get_nr_vecs(bh->b_bdev);
+       struct bio              *bio = bio_alloc(GFP_NOIO, nvecs);
+
+       ASSERT(bio->bi_private == NULL);
+       bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+       bio->bi_bdev = bh->b_bdev;
+       return bio;
+}
+
+STATIC void
+xfs_start_buffer_writeback(
+       struct buffer_head      *bh)
+{
+       ASSERT(buffer_mapped(bh));
+       ASSERT(buffer_locked(bh));
+       ASSERT(!buffer_delay(bh));
+       ASSERT(!buffer_unwritten(bh));
+
+       mark_buffer_async_write(bh);
+       set_buffer_uptodate(bh);
+       clear_buffer_dirty(bh);
+}
+
+STATIC void
+xfs_start_page_writeback(
+       struct page             *page,
+       int                     clear_dirty,
+       int                     buffers)
+{
+       ASSERT(PageLocked(page));
+       ASSERT(!PageWriteback(page));
+       if (clear_dirty)
+               clear_page_dirty_for_io(page);
+       set_page_writeback(page);
+       unlock_page(page);
+       /* If no buffers on the page are to be written, finish it here */
+       if (!buffers)
+               end_page_writeback(page);
+}
+
+static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
+{
+       return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+}
+
+/*
+ * Submit all of the bios for all of the ioends we have saved up, covering the
+ * initial writepage page and also any probed pages.
+ *
+ * Because we may have multiple ioends spanning a page, we need to start
+ * writeback on all the buffers before we submit them for I/O. If we mark the
+ * buffers as we got, then we can end up with a page that only has buffers
+ * marked async write and I/O complete on can occur before we mark the other
+ * buffers async write.
+ *
+ * The end result of this is that we trip a bug in end_page_writeback() because
+ * we call it twice for the one page as the code in end_buffer_async_write()
+ * assumes that all buffers on the page are started at the same time.
+ *
+ * The fix is two passes across the ioend list - one to start writeback on the
+ * buffer_heads, and then submit them for I/O on the second pass.
+ */
+STATIC void
+xfs_submit_ioend(
+       struct writeback_control *wbc,
+       xfs_ioend_t             *ioend)
+{
+       xfs_ioend_t             *head = ioend;
+       xfs_ioend_t             *next;
+       struct buffer_head      *bh;
+       struct bio              *bio;
+       sector_t                lastblock = 0;
+
+       /* Pass 1 - start writeback */
+       do {
+               next = ioend->io_list;
+               for (bh = ioend->io_buffer_head; bh; bh = bh->b_private)
+                       xfs_start_buffer_writeback(bh);
+       } while ((ioend = next) != NULL);
+
+       /* Pass 2 - submit I/O */
+       ioend = head;
+       do {
+               next = ioend->io_list;
+               bio = NULL;
+
+               for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
+
+                       if (!bio) {
+ retry:
+                               bio = xfs_alloc_ioend_bio(bh);
+                       } else if (bh->b_blocknr != lastblock + 1) {
+                               xfs_submit_ioend_bio(wbc, ioend, bio);
+                               goto retry;
+                       }
+
+                       if (bio_add_buffer(bio, bh) != bh->b_size) {
+                               xfs_submit_ioend_bio(wbc, ioend, bio);
+                               goto retry;
+                       }
+
+                       lastblock = bh->b_blocknr;
+               }
+               if (bio)
+                       xfs_submit_ioend_bio(wbc, ioend, bio);
+               xfs_finish_ioend(ioend);
+       } while ((ioend = next) != NULL);
+}
+
+/*
+ * Cancel submission of all buffer_heads so far in this endio.
+ * Toss the endio too.  Only ever called for the initial page
+ * in a writepage request, so only ever one page.
+ */
+STATIC void
+xfs_cancel_ioend(
+       xfs_ioend_t             *ioend)
+{
+       xfs_ioend_t             *next;
+       struct buffer_head      *bh, *next_bh;
+
+       do {
+               next = ioend->io_list;
+               bh = ioend->io_buffer_head;
+               do {
+                       next_bh = bh->b_private;
+                       clear_buffer_async_write(bh);
+                       unlock_buffer(bh);
+               } while ((bh = next_bh) != NULL);
+
+               xfs_ioend_wake(XFS_I(ioend->io_inode));
+               mempool_free(ioend, xfs_ioend_pool);
+       } while ((ioend = next) != NULL);
+}
+
+/*
+ * Test to see if we've been building up a completion structure for
+ * earlier buffers -- if so, we try to append to this ioend if we
+ * can, otherwise we finish off any current ioend and start another.
+ * Return true if we've finished the given ioend.
+ */
+STATIC void
+xfs_add_to_ioend(
+       struct inode            *inode,
+       struct buffer_head      *bh,
+       xfs_off_t               offset,
+       unsigned int            type,
+       xfs_ioend_t             **result,
+       int                     need_ioend)
+{
+       xfs_ioend_t             *ioend = *result;
+
+       if (!ioend || need_ioend || type != ioend->io_type) {
+               xfs_ioend_t     *previous = *result;
+
+               ioend = xfs_alloc_ioend(inode, type);
+               ioend->io_offset = offset;
+               ioend->io_buffer_head = bh;
+               ioend->io_buffer_tail = bh;
+               if (previous)
+                       previous->io_list = ioend;
+               *result = ioend;
+       } else {
+               ioend->io_buffer_tail->b_private = bh;
+               ioend->io_buffer_tail = bh;
+       }
+
+       bh->b_private = NULL;
+       ioend->io_size += bh->b_size;
+}
+
+STATIC void
+xfs_map_buffer(
+       struct inode            *inode,
+       struct buffer_head      *bh,
+       struct xfs_bmbt_irec    *imap,
+       xfs_off_t               offset)
+{
+       sector_t                bn;
+       struct xfs_mount        *m = XFS_I(inode)->i_mount;
+       xfs_off_t               iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
+       xfs_daddr_t             iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
+
+       ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+       ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+       bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
+             ((offset - iomap_offset) >> inode->i_blkbits);
+
+       ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
+
+       bh->b_blocknr = bn;
+       set_buffer_mapped(bh);
+}
+
+STATIC void
+xfs_map_at_offset(
+       struct inode            *inode,
+       struct buffer_head      *bh,
+       struct xfs_bmbt_irec    *imap,
+       xfs_off_t               offset)
+{
+       ASSERT(imap->br_startblock != HOLESTARTBLOCK);
+       ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
+
+       xfs_map_buffer(inode, bh, imap, offset);
+       set_buffer_mapped(bh);
+       clear_buffer_delay(bh);
+       clear_buffer_unwritten(bh);
+}
+
+/*
+ * Test if a given page is suitable for writing as part of an unwritten
+ * or delayed allocate extent.
+ */
+STATIC int
+xfs_is_delayed_page(
+       struct page             *page,
+       unsigned int            type)
+{
+       if (PageWriteback(page))
+               return 0;
+
+       if (page->mapping && page_has_buffers(page)) {
+               struct buffer_head      *bh, *head;
+               int                     acceptable = 0;
+
+               bh = head = page_buffers(page);
+               do {
+                       if (buffer_unwritten(bh))
+                               acceptable = (type == IO_UNWRITTEN);
+                       else if (buffer_delay(bh))
+                               acceptable = (type == IO_DELALLOC);
+                       else if (buffer_dirty(bh) && buffer_mapped(bh))
+                               acceptable = (type == IO_OVERWRITE);
+                       else
+                               break;
+               } while ((bh = bh->b_this_page) != head);
+
+               if (acceptable)
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Allocate & map buffers for page given the extent map. Write it out.
+ * except for the original page of a writepage, this is called on
+ * delalloc/unwritten pages only, for the original page it is possible
+ * that the page has no mapping at all.
+ */
+STATIC int
+xfs_convert_page(
+       struct inode            *inode,
+       struct page             *page,
+       loff_t                  tindex,
+       struct xfs_bmbt_irec    *imap,
+       xfs_ioend_t             **ioendp,
+       struct writeback_control *wbc)
+{
+       struct buffer_head      *bh, *head;
+       xfs_off_t               end_offset;
+       unsigned long           p_offset;
+       unsigned int            type;
+       int                     len, page_dirty;
+       int                     count = 0, done = 0, uptodate = 1;
+       xfs_off_t               offset = page_offset(page);
+
+       if (page->index != tindex)
+               goto fail;
+       if (!trylock_page(page))
+               goto fail;
+       if (PageWriteback(page))
+               goto fail_unlock_page;
+       if (page->mapping != inode->i_mapping)
+               goto fail_unlock_page;
+       if (!xfs_is_delayed_page(page, (*ioendp)->io_type))
+               goto fail_unlock_page;
+
+       /*
+        * page_dirty is initially a count of buffers on the page before
+        * EOF and is decremented as we move each into a cleanable state.
+        *
+        * Derivation:
+        *
+        * End offset is the highest offset that this page should represent.
+        * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
+        * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
+        * hence give us the correct page_dirty count. On any other page,
+        * it will be zero and in that case we need page_dirty to be the
+        * count of buffers on the page.
+        */
+       end_offset = min_t(unsigned long long,
+                       (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+                       i_size_read(inode));
+
+       len = 1 << inode->i_blkbits;
+       p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
+                                       PAGE_CACHE_SIZE);
+       p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
+       page_dirty = p_offset / len;
+
+       bh = head = page_buffers(page);
+       do {
+               if (offset >= end_offset)
+                       break;
+               if (!buffer_uptodate(bh))
+                       uptodate = 0;
+               if (!(PageUptodate(page) || buffer_uptodate(bh))) {
+                       done = 1;
+                       continue;
+               }
+
+               if (buffer_unwritten(bh) || buffer_delay(bh) ||
+                   buffer_mapped(bh)) {
+                       if (buffer_unwritten(bh))
+                               type = IO_UNWRITTEN;
+                       else if (buffer_delay(bh))
+                               type = IO_DELALLOC;
+                       else
+                               type = IO_OVERWRITE;
+
+                       if (!xfs_imap_valid(inode, imap, offset)) {
+                               done = 1;
+                               continue;
+                       }
+
+                       lock_buffer(bh);
+                       if (type != IO_OVERWRITE)
+                               xfs_map_at_offset(inode, bh, imap, offset);
+                       xfs_add_to_ioend(inode, bh, offset, type,
+                                        ioendp, done);
+
+                       page_dirty--;
+                       count++;
+               } else {
+                       done = 1;
+               }
+       } while (offset += len, (bh = bh->b_this_page) != head);
+
+       if (uptodate && bh == head)
+               SetPageUptodate(page);
+
+       if (count) {
+               if (--wbc->nr_to_write <= 0 &&
+                   wbc->sync_mode == WB_SYNC_NONE)
+                       done = 1;
+       }
+       xfs_start_page_writeback(page, !page_dirty, count);
+
+       return done;
+ fail_unlock_page:
+       unlock_page(page);
+ fail:
+       return 1;
+}
+
+/*
+ * Convert & write out a cluster of pages in the same extent as defined
+ * by mp and following the start page.
+ */
+STATIC void
+xfs_cluster_write(
+       struct inode            *inode,
+       pgoff_t                 tindex,
+       struct xfs_bmbt_irec    *imap,
+       xfs_ioend_t             **ioendp,
+       struct writeback_control *wbc,
+       pgoff_t                 tlast)
+{
+       struct pagevec          pvec;
+       int                     done = 0, i;
+
+       pagevec_init(&pvec, 0);
+       while (!done && tindex <= tlast) {
+               unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1);
+
+               if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len))
+                       break;
+
+               for (i = 0; i < pagevec_count(&pvec); i++) {
+                       done = xfs_convert_page(inode, pvec.pages[i], tindex++,
+                                       imap, ioendp, wbc);
+                       if (done)
+                               break;
+               }
+
+               pagevec_release(&pvec);
+               cond_resched();
+       }
+}
+
+STATIC void
+xfs_vm_invalidatepage(
+       struct page             *page,
+       unsigned long           offset)
+{
+       trace_xfs_invalidatepage(page->mapping->host, page, offset);
+       block_invalidatepage(page, offset);
+}
+
+/*
+ * If the page has delalloc buffers on it, we need to punch them out before we
+ * invalidate the page. If we don't, we leave a stale delalloc mapping on the
+ * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
+ * is done on that same region - the delalloc extent is returned when none is
+ * supposed to be there.
+ *
+ * We prevent this by truncating away the delalloc regions on the page before
+ * invalidating it. Because they are delalloc, we can do this without needing a
+ * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
+ * truncation without a transaction as there is no space left for block
+ * reservation (typically why we see a ENOSPC in writeback).
+ *
+ * This is not a performance critical path, so for now just do the punching a
+ * buffer head at a time.
+ */
+STATIC void
+xfs_aops_discard_page(
+       struct page             *page)
+{
+       struct inode            *inode = page->mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct buffer_head      *bh, *head;
+       loff_t                  offset = page_offset(page);
+
+       if (!xfs_is_delayed_page(page, IO_DELALLOC))
+               goto out_invalidate;
+
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               goto out_invalidate;
+
+       xfs_alert(ip->i_mount,
+               "page discard on page %p, inode 0x%llx, offset %llu.",
+                       page, ip->i_ino, offset);
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       bh = head = page_buffers(page);
+       do {
+               int             error;
+               xfs_fileoff_t   start_fsb;
+
+               if (!buffer_delay(bh))
+                       goto next_buffer;
+
+               start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+               error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
+               if (error) {
+                       /* something screwed, just bail */
+                       if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                               xfs_alert(ip->i_mount,
+                       "page discard unable to remove delalloc mapping.");
+                       }
+                       break;
+               }
+next_buffer:
+               offset += 1 << inode->i_blkbits;
+
+       } while ((bh = bh->b_this_page) != head);
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_invalidate:
+       xfs_vm_invalidatepage(page, 0);
+       return;
+}
+
+/*
+ * Write out a dirty page.
+ *
+ * For delalloc space on the page we need to allocate space and flush it.
+ * For unwritten space on the page we need to start the conversion to
+ * regular allocated space.
+ * For any other dirty buffer heads on the page we should flush them.
+ */
+STATIC int
+xfs_vm_writepage(
+       struct page             *page,
+       struct writeback_control *wbc)
+{
+       struct inode            *inode = page->mapping->host;
+       struct buffer_head      *bh, *head;
+       struct xfs_bmbt_irec    imap;
+       xfs_ioend_t             *ioend = NULL, *iohead = NULL;
+       loff_t                  offset;
+       unsigned int            type;
+       __uint64_t              end_offset;
+       pgoff_t                 end_index, last_index;
+       ssize_t                 len;
+       int                     err, imap_valid = 0, uptodate = 1;
+       int                     count = 0;
+       int                     nonblocking = 0;
+
+       trace_xfs_writepage(inode, page, 0);
+
+       ASSERT(page_has_buffers(page));
+
+       /*
+        * Refuse to write the page out if we are called from reclaim context.
+        *
+        * This avoids stack overflows when called from deeply used stacks in
+        * random callers for direct reclaim or memcg reclaim.  We explicitly
+        * allow reclaim from kswapd as the stack usage there is relatively low.
+        *
+        * This should really be done by the core VM, but until that happens
+        * filesystems like XFS, btrfs and ext4 have to take care of this
+        * by themselves.
+        */
+       if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
+               goto redirty;
+
+       /*
+        * Given that we do not allow direct reclaim to call us, we should
+        * never be called while in a filesystem transaction.
+        */
+       if (WARN_ON(current->flags & PF_FSTRANS))
+               goto redirty;
+
+       /* Is this page beyond the end of the file? */
+       offset = i_size_read(inode);
+       end_index = offset >> PAGE_CACHE_SHIFT;
+       last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
+       if (page->index >= end_index) {
+               if ((page->index >= end_index + 1) ||
+                   !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
+                       unlock_page(page);
+                       return 0;
+               }
+       }
+
+       end_offset = min_t(unsigned long long,
+                       (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT,
+                       offset);
+       len = 1 << inode->i_blkbits;
+
+       bh = head = page_buffers(page);
+       offset = page_offset(page);
+       type = IO_OVERWRITE;
+
+       if (wbc->sync_mode == WB_SYNC_NONE)
+               nonblocking = 1;
+
+       do {
+               int new_ioend = 0;
+
+               if (offset >= end_offset)
+                       break;
+               if (!buffer_uptodate(bh))
+                       uptodate = 0;
+
+               /*
+                * set_page_dirty dirties all buffers in a page, independent
+                * of their state.  The dirty state however is entirely
+                * meaningless for holes (!mapped && uptodate), so skip
+                * buffers covering holes here.
+                */
+               if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
+                       imap_valid = 0;
+                       continue;
+               }
+
+               if (buffer_unwritten(bh)) {
+                       if (type != IO_UNWRITTEN) {
+                               type = IO_UNWRITTEN;
+                               imap_valid = 0;
+                       }
+               } else if (buffer_delay(bh)) {
+                       if (type != IO_DELALLOC) {
+                               type = IO_DELALLOC;
+                               imap_valid = 0;
+                       }
+               } else if (buffer_uptodate(bh)) {
+                       if (type != IO_OVERWRITE) {
+                               type = IO_OVERWRITE;
+                               imap_valid = 0;
+                       }
+               } else {
+                       if (PageUptodate(page)) {
+                               ASSERT(buffer_mapped(bh));
+                               imap_valid = 0;
+                       }
+                       continue;
+               }
+
+               if (imap_valid)
+                       imap_valid = xfs_imap_valid(inode, &imap, offset);
+               if (!imap_valid) {
+                       /*
+                        * If we didn't have a valid mapping then we need to
+                        * put the new mapping into a separate ioend structure.
+                        * This ensures non-contiguous extents always have
+                        * separate ioends, which is particularly important
+                        * for unwritten extent conversion at I/O completion
+                        * time.
+                        */
+                       new_ioend = 1;
+                       err = xfs_map_blocks(inode, offset, &imap, type,
+                                            nonblocking);
+                       if (err)
+                               goto error;
+                       imap_valid = xfs_imap_valid(inode, &imap, offset);
+               }
+               if (imap_valid) {
+                       lock_buffer(bh);
+                       if (type != IO_OVERWRITE)
+                               xfs_map_at_offset(inode, bh, &imap, offset);
+                       xfs_add_to_ioend(inode, bh, offset, type, &ioend,
+                                        new_ioend);
+                       count++;
+               }
+
+               if (!iohead)
+                       iohead = ioend;
+
+       } while (offset += len, ((bh = bh->b_this_page) != head));
+
+       if (uptodate && bh == head)
+               SetPageUptodate(page);
+
+       xfs_start_page_writeback(page, 1, count);
+
+       if (ioend && imap_valid) {
+               xfs_off_t               end_index;
+
+               end_index = imap.br_startoff + imap.br_blockcount;
+
+               /* to bytes */
+               end_index <<= inode->i_blkbits;
+
+               /* to pages */
+               end_index = (end_index - 1) >> PAGE_CACHE_SHIFT;
+
+               /* check against file size */
+               if (end_index > last_index)
+                       end_index = last_index;
+
+               xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
+                                 wbc, end_index);
+       }
+
+       if (iohead)
+               xfs_submit_ioend(wbc, iohead);
+
+       return 0;
+
+error:
+       if (iohead)
+               xfs_cancel_ioend(iohead);
+
+       if (err == -EAGAIN)
+               goto redirty;
+
+       xfs_aops_discard_page(page);
+       ClearPageUptodate(page);
+       unlock_page(page);
+       return err;
+
+redirty:
+       redirty_page_for_writepage(wbc, page);
+       unlock_page(page);
+       return 0;
+}
+
+STATIC int
+xfs_vm_writepages(
+       struct address_space    *mapping,
+       struct writeback_control *wbc)
+{
+       xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+       return generic_writepages(mapping, wbc);
+}
+
+/*
+ * Called to move a page into cleanable state - and from there
+ * to be released. The page should already be clean. We always
+ * have buffer heads in this call.
+ *
+ * Returns 1 if the page is ok to release, 0 otherwise.
+ */
+STATIC int
+xfs_vm_releasepage(
+       struct page             *page,
+       gfp_t                   gfp_mask)
+{
+       int                     delalloc, unwritten;
+
+       trace_xfs_releasepage(page->mapping->host, page, 0);
+
+       xfs_count_page_state(page, &delalloc, &unwritten);
+
+       if (WARN_ON(delalloc))
+               return 0;
+       if (WARN_ON(unwritten))
+               return 0;
+
+       return try_to_free_buffers(page);
+}
+
+STATIC int
+__xfs_get_blocks(
+       struct inode            *inode,
+       sector_t                iblock,
+       struct buffer_head      *bh_result,
+       int                     create,
+       int                     direct)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       xfs_fileoff_t           offset_fsb, end_fsb;
+       int                     error = 0;
+       int                     lockmode = 0;
+       struct xfs_bmbt_irec    imap;
+       int                     nimaps = 1;
+       xfs_off_t               offset;
+       ssize_t                 size;
+       int                     new = 0;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -XFS_ERROR(EIO);
+
+       offset = (xfs_off_t)iblock << inode->i_blkbits;
+       ASSERT(bh_result->b_size >= (1 << inode->i_blkbits));
+       size = bh_result->b_size;
+
+       if (!create && direct && offset >= i_size_read(inode))
+               return 0;
+
+       if (create) {
+               lockmode = XFS_ILOCK_EXCL;
+               xfs_ilock(ip, lockmode);
+       } else {
+               lockmode = xfs_ilock_map_shared(ip);
+       }
+
+       ASSERT(offset <= mp->m_maxioffset);
+       if (offset + size > mp->m_maxioffset)
+               size = mp->m_maxioffset - offset;
+       end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
+       offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+       error = xfs_bmapi(NULL, ip, offset_fsb, end_fsb - offset_fsb,
+                         XFS_BMAPI_ENTIRE,  NULL, 0, &imap, &nimaps, NULL);
+       if (error)
+               goto out_unlock;
+
+       if (create &&
+           (!nimaps ||
+            (imap.br_startblock == HOLESTARTBLOCK ||
+             imap.br_startblock == DELAYSTARTBLOCK))) {
+               if (direct) {
+                       error = xfs_iomap_write_direct(ip, offset, size,
+                                                      &imap, nimaps);
+               } else {
+                       error = xfs_iomap_write_delay(ip, offset, size, &imap);
+               }
+               if (error)
+                       goto out_unlock;
+
+               trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap);
+       } else if (nimaps) {
+               trace_xfs_get_blocks_found(ip, offset, size, 0, &imap);
+       } else {
+               trace_xfs_get_blocks_notfound(ip, offset, size);
+               goto out_unlock;
+       }
+       xfs_iunlock(ip, lockmode);
+
+       if (imap.br_startblock != HOLESTARTBLOCK &&
+           imap.br_startblock != DELAYSTARTBLOCK) {
+               /*
+                * For unwritten extents do not report a disk address on
+                * the read case (treat as if we're reading into a hole).
+                */
+               if (create || !ISUNWRITTEN(&imap))
+                       xfs_map_buffer(inode, bh_result, &imap, offset);
+               if (create && ISUNWRITTEN(&imap)) {
+                       if (direct)
+                               bh_result->b_private = inode;
+                       set_buffer_unwritten(bh_result);
+               }
+       }
+
+       /*
+        * If this is a realtime file, data may be on a different device.
+        * to that pointed to from the buffer_head b_bdev currently.
+        */
+       bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
+
+       /*
+        * If we previously allocated a block out beyond eof and we are now
+        * coming back to use it then we will need to flag it as new even if it
+        * has a disk address.
+        *
+        * With sub-block writes into unwritten extents we also need to mark
+        * the buffer as new so that the unwritten parts of the buffer gets
+        * correctly zeroed.
+        */
+       if (create &&
+           ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
+            (offset >= i_size_read(inode)) ||
+            (new || ISUNWRITTEN(&imap))))
+               set_buffer_new(bh_result);
+
+       if (imap.br_startblock == DELAYSTARTBLOCK) {
+               BUG_ON(direct);
+               if (create) {
+                       set_buffer_uptodate(bh_result);
+                       set_buffer_mapped(bh_result);
+                       set_buffer_delay(bh_result);
+               }
+       }
+
+       /*
+        * If this is O_DIRECT or the mpage code calling tell them how large
+        * the mapping is, so that we can avoid repeated get_blocks calls.
+        */
+       if (direct || size > (1 << inode->i_blkbits)) {
+               xfs_off_t               mapping_size;
+
+               mapping_size = imap.br_startoff + imap.br_blockcount - iblock;
+               mapping_size <<= inode->i_blkbits;
+
+               ASSERT(mapping_size > 0);
+               if (mapping_size > size)
+                       mapping_size = size;
+               if (mapping_size > LONG_MAX)
+                       mapping_size = LONG_MAX;
+
+               bh_result->b_size = mapping_size;
+       }
+
+       return 0;
+
+out_unlock:
+       xfs_iunlock(ip, lockmode);
+       return -error;
+}
+
+int
+xfs_get_blocks(
+       struct inode            *inode,
+       sector_t                iblock,
+       struct buffer_head      *bh_result,
+       int                     create)
+{
+       return __xfs_get_blocks(inode, iblock, bh_result, create, 0);
+}
+
+STATIC int
+xfs_get_blocks_direct(
+       struct inode            *inode,
+       sector_t                iblock,
+       struct buffer_head      *bh_result,
+       int                     create)
+{
+       return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
+}
+
+/*
+ * Complete a direct I/O write request.
+ *
+ * If the private argument is non-NULL __xfs_get_blocks signals us that we
+ * need to issue a transaction to convert the range from unwritten to written
+ * extents.  In case this is regular synchronous I/O we just call xfs_end_io
+ * to do this and we are done.  But in case this was a successful AIO
+ * request this handler is called from interrupt context, from which we
+ * can't start transactions.  In that case offload the I/O completion to
+ * the workqueues we also use for buffered I/O completion.
+ */
+STATIC void
+xfs_end_io_direct_write(
+       struct kiocb            *iocb,
+       loff_t                  offset,
+       ssize_t                 size,
+       void                    *private,
+       int                     ret,
+       bool                    is_async)
+{
+       struct xfs_ioend        *ioend = iocb->private;
+
+       /*
+        * blockdev_direct_IO can return an error even after the I/O
+        * completion handler was called.  Thus we need to protect
+        * against double-freeing.
+        */
+       iocb->private = NULL;
+
+       ioend->io_offset = offset;
+       ioend->io_size = size;
+       if (private && size > 0)
+               ioend->io_type = IO_UNWRITTEN;
+
+       if (is_async) {
+               /*
+                * If we are converting an unwritten extent we need to delay
+                * the AIO completion until after the unwrittent extent
+                * conversion has completed, otherwise do it ASAP.
+                */
+               if (ioend->io_type == IO_UNWRITTEN) {
+                       ioend->io_iocb = iocb;
+                       ioend->io_result = ret;
+               } else {
+                       aio_complete(iocb, ret, 0);
+               }
+               xfs_finish_ioend(ioend);
+       } else {
+               xfs_finish_ioend_sync(ioend);
+       }
+
+       /* XXX: probably should move into the real I/O completion handler */
+       inode_dio_done(ioend->io_inode);
+}
+
+STATIC ssize_t
+xfs_vm_direct_IO(
+       int                     rw,
+       struct kiocb            *iocb,
+       const struct iovec      *iov,
+       loff_t                  offset,
+       unsigned long           nr_segs)
+{
+       struct inode            *inode = iocb->ki_filp->f_mapping->host;
+       struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
+       ssize_t                 ret;
+
+       if (rw & WRITE) {
+               iocb->private = xfs_alloc_ioend(inode, IO_DIRECT);
+
+               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
+                                           offset, nr_segs,
+                                           xfs_get_blocks_direct,
+                                           xfs_end_io_direct_write, NULL, 0);
+               if (ret != -EIOCBQUEUED && iocb->private)
+                       xfs_destroy_ioend(iocb->private);
+       } else {
+               ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
+                                           offset, nr_segs,
+                                           xfs_get_blocks_direct,
+                                           NULL, NULL, 0);
+       }
+
+       return ret;
+}
+
+STATIC void
+xfs_vm_write_failed(
+       struct address_space    *mapping,
+       loff_t                  to)
+{
+       struct inode            *inode = mapping->host;
+
+       if (to > inode->i_size) {
+               /*
+                * punch out the delalloc blocks we have already allocated. We
+                * don't call xfs_setattr() to do this as we may be in the
+                * middle of a multi-iovec write and so the vfs inode->i_size
+                * will not match the xfs ip->i_size and so it will zero too
+                * much. Hence we jus truncate the page cache to zero what is
+                * necessary and punch the delalloc blocks directly.
+                */
+               struct xfs_inode        *ip = XFS_I(inode);
+               xfs_fileoff_t           start_fsb;
+               xfs_fileoff_t           end_fsb;
+               int                     error;
+
+               truncate_pagecache(inode, to, inode->i_size);
+
+               /*
+                * Check if there are any blocks that are outside of i_size
+                * that need to be trimmed back.
+                */
+               start_fsb = XFS_B_TO_FSB(ip->i_mount, inode->i_size) + 1;
+               end_fsb = XFS_B_TO_FSB(ip->i_mount, to);
+               if (end_fsb <= start_fsb)
+                       return;
+
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+               error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
+                                                       end_fsb - start_fsb);
+               if (error) {
+                       /* something screwed, just bail */
+                       if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+                               xfs_alert(ip->i_mount,
+                       "xfs_vm_write_failed: unable to clean up ino %lld",
+                                               ip->i_ino);
+                       }
+               }
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       }
+}
+
+STATIC int
+xfs_vm_write_begin(
+       struct file             *file,
+       struct address_space    *mapping,
+       loff_t                  pos,
+       unsigned                len,
+       unsigned                flags,
+       struct page             **pagep,
+       void                    **fsdata)
+{
+       int                     ret;
+
+       ret = block_write_begin(mapping, pos, len, flags | AOP_FLAG_NOFS,
+                               pagep, xfs_get_blocks);
+       if (unlikely(ret))
+               xfs_vm_write_failed(mapping, pos + len);
+       return ret;
+}
+
+STATIC int
+xfs_vm_write_end(
+       struct file             *file,
+       struct address_space    *mapping,
+       loff_t                  pos,
+       unsigned                len,
+       unsigned                copied,
+       struct page             *page,
+       void                    *fsdata)
+{
+       int                     ret;
+
+       ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+       if (unlikely(ret < len))
+               xfs_vm_write_failed(mapping, pos + len);
+       return ret;
+}
+
+STATIC sector_t
+xfs_vm_bmap(
+       struct address_space    *mapping,
+       sector_t                block)
+{
+       struct inode            *inode = (struct inode *)mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+
+       trace_xfs_vm_bmap(XFS_I(inode));
+       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+       xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+       return generic_block_bmap(mapping, block, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpage(
+       struct file             *unused,
+       struct page             *page)
+{
+       return mpage_readpage(page, xfs_get_blocks);
+}
+
+STATIC int
+xfs_vm_readpages(
+       struct file             *unused,
+       struct address_space    *mapping,
+       struct list_head        *pages,
+       unsigned                nr_pages)
+{
+       return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
+}
+
+const struct address_space_operations xfs_address_space_operations = {
+       .readpage               = xfs_vm_readpage,
+       .readpages              = xfs_vm_readpages,
+       .writepage              = xfs_vm_writepage,
+       .writepages             = xfs_vm_writepages,
+       .releasepage            = xfs_vm_releasepage,
+       .invalidatepage         = xfs_vm_invalidatepage,
+       .write_begin            = xfs_vm_write_begin,
+       .write_end              = xfs_vm_write_end,
+       .bmap                   = xfs_vm_bmap,
+       .direct_IO              = xfs_vm_direct_IO,
+       .migratepage            = buffer_migrate_page,
+       .is_partially_uptodate  = block_is_partially_uptodate,
+       .error_remove_page      = generic_error_remove_page,
+};
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
new file mode 100644 (file)
index 0000000..71f721e
--- /dev/null
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2005-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_AOPS_H__
+#define __XFS_AOPS_H__
+
+extern struct workqueue_struct *xfsdatad_workqueue;
+extern struct workqueue_struct *xfsconvertd_workqueue;
+extern mempool_t *xfs_ioend_pool;
+
+/*
+ * Types of I/O for bmap clustering and I/O completion tracking.
+ */
+enum {
+       IO_DIRECT = 0,  /* special case for direct I/O ioends */
+       IO_DELALLOC,    /* mapping covers delalloc region */
+       IO_UNWRITTEN,   /* mapping covers allocated but uninitialized data */
+       IO_OVERWRITE,   /* mapping covers already allocated extent */
+};
+
+#define XFS_IO_TYPES \
+       { 0,                    "" }, \
+       { IO_DELALLOC,          "delalloc" }, \
+       { IO_UNWRITTEN,         "unwritten" }, \
+       { IO_OVERWRITE,         "overwrite" }
+
+/*
+ * xfs_ioend struct manages large extent writes for XFS.
+ * It can manage several multi-page bio's at once.
+ */
+typedef struct xfs_ioend {
+       struct xfs_ioend        *io_list;       /* next ioend in chain */
+       unsigned int            io_type;        /* delalloc / unwritten */
+       int                     io_error;       /* I/O error code */
+       atomic_t                io_remaining;   /* hold count */
+       struct inode            *io_inode;      /* file being written to */
+       struct buffer_head      *io_buffer_head;/* buffer linked list head */
+       struct buffer_head      *io_buffer_tail;/* buffer linked list tail */
+       size_t                  io_size;        /* size of the extent */
+       xfs_off_t               io_offset;      /* offset in the file */
+       struct work_struct      io_work;        /* xfsdatad work queue */
+       struct kiocb            *io_iocb;
+       int                     io_result;
+} xfs_ioend_t;
+
+extern const struct address_space_operations xfs_address_space_operations;
+extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+
+extern void xfs_ioend_init(void);
+extern void xfs_ioend_wait(struct xfs_inode *);
+
+extern void xfs_count_page_state(struct page *, int *, int *);
+
+#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
new file mode 100644 (file)
index 0000000..c57836d
--- /dev/null
@@ -0,0 +1,1876 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/gfp.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/bio.h>
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include <linux/workqueue.h>
+#include <linux/percpu.h>
+#include <linux/blkdev.h>
+#include <linux/hash.h>
+#include <linux/kthread.h>
+#include <linux/migrate.h>
+#include <linux/backing-dev.h>
+#include <linux/freezer.h>
+
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_trace.h"
+
+static kmem_zone_t *xfs_buf_zone;
+STATIC int xfsbufd(void *);
+STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
+
+static struct workqueue_struct *xfslogd_workqueue;
+struct workqueue_struct *xfsdatad_workqueue;
+struct workqueue_struct *xfsconvertd_workqueue;
+
+#ifdef XFS_BUF_LOCK_TRACKING
+# define XB_SET_OWNER(bp)      ((bp)->b_last_holder = current->pid)
+# define XB_CLEAR_OWNER(bp)    ((bp)->b_last_holder = -1)
+# define XB_GET_OWNER(bp)      ((bp)->b_last_holder)
+#else
+# define XB_SET_OWNER(bp)      do { } while (0)
+# define XB_CLEAR_OWNER(bp)    do { } while (0)
+# define XB_GET_OWNER(bp)      do { } while (0)
+#endif
+
+#define xb_to_gfp(flags) \
+       ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
+         ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
+
+#define xb_to_km(flags) \
+        (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
+
+#define xfs_buf_allocate(flags) \
+       kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
+#define xfs_buf_deallocate(bp) \
+       kmem_zone_free(xfs_buf_zone, (bp));
+
+static inline int
+xfs_buf_is_vmapped(
+       struct xfs_buf  *bp)
+{
+       /*
+        * Return true if the buffer is vmapped.
+        *
+        * The XBF_MAPPED flag is set if the buffer should be mapped, but the
+        * code is clever enough to know it doesn't have to map a single page,
+        * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
+        */
+       return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
+}
+
+static inline int
+xfs_buf_vmap_len(
+       struct xfs_buf  *bp)
+{
+       return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
+}
+
+/*
+ * xfs_buf_lru_add - add a buffer to the LRU.
+ *
+ * The LRU takes a new reference to the buffer so that it will only be freed
+ * once the shrinker takes the buffer off the LRU.
+ */
+STATIC void
+xfs_buf_lru_add(
+       struct xfs_buf  *bp)
+{
+       struct xfs_buftarg *btp = bp->b_target;
+
+       spin_lock(&btp->bt_lru_lock);
+       if (list_empty(&bp->b_lru)) {
+               atomic_inc(&bp->b_hold);
+               list_add_tail(&bp->b_lru, &btp->bt_lru);
+               btp->bt_lru_nr++;
+       }
+       spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * xfs_buf_lru_del - remove a buffer from the LRU
+ *
+ * The unlocked check is safe here because it only occurs when there are not
+ * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
+ * to optimise the shrinker removing the buffer from the LRU and calling
+ * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
+ * bt_lru_lock.
+ */
+STATIC void
+xfs_buf_lru_del(
+       struct xfs_buf  *bp)
+{
+       struct xfs_buftarg *btp = bp->b_target;
+
+       if (list_empty(&bp->b_lru))
+               return;
+
+       spin_lock(&btp->bt_lru_lock);
+       if (!list_empty(&bp->b_lru)) {
+               list_del_init(&bp->b_lru);
+               btp->bt_lru_nr--;
+       }
+       spin_unlock(&btp->bt_lru_lock);
+}
+
+/*
+ * When we mark a buffer stale, we remove the buffer from the LRU and clear the
+ * b_lru_ref count so that the buffer is freed immediately when the buffer
+ * reference count falls to zero. If the buffer is already on the LRU, we need
+ * to remove the reference that LRU holds on the buffer.
+ *
+ * This prevents build-up of stale buffers on the LRU.
+ */
+void
+xfs_buf_stale(
+       struct xfs_buf  *bp)
+{
+       bp->b_flags |= XBF_STALE;
+       atomic_set(&(bp)->b_lru_ref, 0);
+       if (!list_empty(&bp->b_lru)) {
+               struct xfs_buftarg *btp = bp->b_target;
+
+               spin_lock(&btp->bt_lru_lock);
+               if (!list_empty(&bp->b_lru)) {
+                       list_del_init(&bp->b_lru);
+                       btp->bt_lru_nr--;
+                       atomic_dec(&bp->b_hold);
+               }
+               spin_unlock(&btp->bt_lru_lock);
+       }
+       ASSERT(atomic_read(&bp->b_hold) >= 1);
+}
+
+STATIC void
+_xfs_buf_initialize(
+       xfs_buf_t               *bp,
+       xfs_buftarg_t           *target,
+       xfs_off_t               range_base,
+       size_t                  range_length,
+       xfs_buf_flags_t         flags)
+{
+       /*
+        * We don't want certain flags to appear in b_flags.
+        */
+       flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
+
+       memset(bp, 0, sizeof(xfs_buf_t));
+       atomic_set(&bp->b_hold, 1);
+       atomic_set(&bp->b_lru_ref, 1);
+       init_completion(&bp->b_iowait);
+       INIT_LIST_HEAD(&bp->b_lru);
+       INIT_LIST_HEAD(&bp->b_list);
+       RB_CLEAR_NODE(&bp->b_rbnode);
+       sema_init(&bp->b_sema, 0); /* held, no waiters */
+       XB_SET_OWNER(bp);
+       bp->b_target = target;
+       bp->b_file_offset = range_base;
+       /*
+        * Set buffer_length and count_desired to the same value initially.
+        * I/O routines should use count_desired, which will be the same in
+        * most cases but may be reset (e.g. XFS recovery).
+        */
+       bp->b_buffer_length = bp->b_count_desired = range_length;
+       bp->b_flags = flags;
+       bp->b_bn = XFS_BUF_DADDR_NULL;
+       atomic_set(&bp->b_pin_count, 0);
+       init_waitqueue_head(&bp->b_waiters);
+
+       XFS_STATS_INC(xb_create);
+
+       trace_xfs_buf_init(bp, _RET_IP_);
+}
+
+/*
+ *     Allocate a page array capable of holding a specified number
+ *     of pages, and point the page buf at it.
+ */
+STATIC int
+_xfs_buf_get_pages(
+       xfs_buf_t               *bp,
+       int                     page_count,
+       xfs_buf_flags_t         flags)
+{
+       /* Make sure that we have a page list */
+       if (bp->b_pages == NULL) {
+               bp->b_offset = xfs_buf_poff(bp->b_file_offset);
+               bp->b_page_count = page_count;
+               if (page_count <= XB_PAGES) {
+                       bp->b_pages = bp->b_page_array;
+               } else {
+                       bp->b_pages = kmem_alloc(sizeof(struct page *) *
+                                       page_count, xb_to_km(flags));
+                       if (bp->b_pages == NULL)
+                               return -ENOMEM;
+               }
+               memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
+       }
+       return 0;
+}
+
+/*
+ *     Frees b_pages if it was allocated.
+ */
+STATIC void
+_xfs_buf_free_pages(
+       xfs_buf_t       *bp)
+{
+       if (bp->b_pages != bp->b_page_array) {
+               kmem_free(bp->b_pages);
+               bp->b_pages = NULL;
+       }
+}
+
+/*
+ *     Releases the specified buffer.
+ *
+ *     The modification state of any associated pages is left unchanged.
+ *     The buffer most not be on any hash - use xfs_buf_rele instead for
+ *     hashed and refcounted buffers
+ */
+void
+xfs_buf_free(
+       xfs_buf_t               *bp)
+{
+       trace_xfs_buf_free(bp, _RET_IP_);
+
+       ASSERT(list_empty(&bp->b_lru));
+
+       if (bp->b_flags & _XBF_PAGES) {
+               uint            i;
+
+               if (xfs_buf_is_vmapped(bp))
+                       vm_unmap_ram(bp->b_addr - bp->b_offset,
+                                       bp->b_page_count);
+
+               for (i = 0; i < bp->b_page_count; i++) {
+                       struct page     *page = bp->b_pages[i];
+
+                       __free_page(page);
+               }
+       } else if (bp->b_flags & _XBF_KMEM)
+               kmem_free(bp->b_addr);
+       _xfs_buf_free_pages(bp);
+       xfs_buf_deallocate(bp);
+}
+
+/*
+ * Allocates all the pages for buffer in question and builds it's page list.
+ */
+STATIC int
+xfs_buf_allocate_memory(
+       xfs_buf_t               *bp,
+       uint                    flags)
+{
+       size_t                  size = bp->b_count_desired;
+       size_t                  nbytes, offset;
+       gfp_t                   gfp_mask = xb_to_gfp(flags);
+       unsigned short          page_count, i;
+       xfs_off_t               end;
+       int                     error;
+
+       /*
+        * for buffers that are contained within a single page, just allocate
+        * the memory from the heap - there's no need for the complexity of
+        * page arrays to keep allocation down to order 0.
+        */
+       if (bp->b_buffer_length < PAGE_SIZE) {
+               bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
+               if (!bp->b_addr) {
+                       /* low memory - use alloc_page loop instead */
+                       goto use_alloc_page;
+               }
+
+               if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
+                                                               PAGE_MASK) !=
+                   ((unsigned long)bp->b_addr & PAGE_MASK)) {
+                       /* b_addr spans two pages - use alloc_page instead */
+                       kmem_free(bp->b_addr);
+                       bp->b_addr = NULL;
+                       goto use_alloc_page;
+               }
+               bp->b_offset = offset_in_page(bp->b_addr);
+               bp->b_pages = bp->b_page_array;
+               bp->b_pages[0] = virt_to_page(bp->b_addr);
+               bp->b_page_count = 1;
+               bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
+               return 0;
+       }
+
+use_alloc_page:
+       end = bp->b_file_offset + bp->b_buffer_length;
+       page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
+       error = _xfs_buf_get_pages(bp, page_count, flags);
+       if (unlikely(error))
+               return error;
+
+       offset = bp->b_offset;
+       bp->b_flags |= _XBF_PAGES;
+
+       for (i = 0; i < bp->b_page_count; i++) {
+               struct page     *page;
+               uint            retries = 0;
+retry:
+               page = alloc_page(gfp_mask);
+               if (unlikely(page == NULL)) {
+                       if (flags & XBF_READ_AHEAD) {
+                               bp->b_page_count = i;
+                               error = ENOMEM;
+                               goto out_free_pages;
+                       }
+
+                       /*
+                        * This could deadlock.
+                        *
+                        * But until all the XFS lowlevel code is revamped to
+                        * handle buffer allocation failures we can't do much.
+                        */
+                       if (!(++retries % 100))
+                               xfs_err(NULL,
+               "possible memory allocation deadlock in %s (mode:0x%x)",
+                                       __func__, gfp_mask);
+
+                       XFS_STATS_INC(xb_page_retries);
+                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       goto retry;
+               }
+
+               XFS_STATS_INC(xb_page_found);
+
+               nbytes = min_t(size_t, size, PAGE_SIZE - offset);
+               size -= nbytes;
+               bp->b_pages[i] = page;
+               offset = 0;
+       }
+       return 0;
+
+out_free_pages:
+       for (i = 0; i < bp->b_page_count; i++)
+               __free_page(bp->b_pages[i]);
+       return error;
+}
+
+/*
+ *     Map buffer into kernel address-space if necessary.
+ */
+STATIC int
+_xfs_buf_map_pages(
+       xfs_buf_t               *bp,
+       uint                    flags)
+{
+       ASSERT(bp->b_flags & _XBF_PAGES);
+       if (bp->b_page_count == 1) {
+               /* A single page buffer is always mappable */
+               bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
+               bp->b_flags |= XBF_MAPPED;
+       } else if (flags & XBF_MAPPED) {
+               int retried = 0;
+
+               do {
+                       bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
+                                               -1, PAGE_KERNEL);
+                       if (bp->b_addr)
+                               break;
+                       vm_unmap_aliases();
+               } while (retried++ <= 1);
+
+               if (!bp->b_addr)
+                       return -ENOMEM;
+               bp->b_addr += bp->b_offset;
+               bp->b_flags |= XBF_MAPPED;
+       }
+
+       return 0;
+}
+
+/*
+ *     Finding and Reading Buffers
+ */
+
+/*
+ *     Look up, and creates if absent, a lockable buffer for
+ *     a given range of an inode.  The buffer is returned
+ *     locked.  If other overlapping buffers exist, they are
+ *     released before the new buffer is created and locked,
+ *     which may imply that this call will block until those buffers
+ *     are unlocked.  No I/O is implied by this call.
+ */
+xfs_buf_t *
+_xfs_buf_find(
+       xfs_buftarg_t           *btp,   /* block device target          */
+       xfs_off_t               ioff,   /* starting offset of range     */
+       size_t                  isize,  /* length of range              */
+       xfs_buf_flags_t         flags,
+       xfs_buf_t               *new_bp)
+{
+       xfs_off_t               range_base;
+       size_t                  range_length;
+       struct xfs_perag        *pag;
+       struct rb_node          **rbp;
+       struct rb_node          *parent;
+       xfs_buf_t               *bp;
+
+       range_base = (ioff << BBSHIFT);
+       range_length = (isize << BBSHIFT);
+
+       /* Check for IOs smaller than the sector size / not sector aligned */
+       ASSERT(!(range_length < (1 << btp->bt_sshift)));
+       ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
+
+       /* get tree root */
+       pag = xfs_perag_get(btp->bt_mount,
+                               xfs_daddr_to_agno(btp->bt_mount, ioff));
+
+       /* walk tree */
+       spin_lock(&pag->pag_buf_lock);
+       rbp = &pag->pag_buf_tree.rb_node;
+       parent = NULL;
+       bp = NULL;
+       while (*rbp) {
+               parent = *rbp;
+               bp = rb_entry(parent, struct xfs_buf, b_rbnode);
+
+               if (range_base < bp->b_file_offset)
+                       rbp = &(*rbp)->rb_left;
+               else if (range_base > bp->b_file_offset)
+                       rbp = &(*rbp)->rb_right;
+               else {
+                       /*
+                        * found a block offset match. If the range doesn't
+                        * match, the only way this is allowed is if the buffer
+                        * in the cache is stale and the transaction that made
+                        * it stale has not yet committed. i.e. we are
+                        * reallocating a busy extent. Skip this buffer and
+                        * continue searching to the right for an exact match.
+                        */
+                       if (bp->b_buffer_length != range_length) {
+                               ASSERT(bp->b_flags & XBF_STALE);
+                               rbp = &(*rbp)->rb_right;
+                               continue;
+                       }
+                       atomic_inc(&bp->b_hold);
+                       goto found;
+               }
+       }
+
+       /* No match found */
+       if (new_bp) {
+               _xfs_buf_initialize(new_bp, btp, range_base,
+                               range_length, flags);
+               rb_link_node(&new_bp->b_rbnode, parent, rbp);
+               rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
+               /* the buffer keeps the perag reference until it is freed */
+               new_bp->b_pag = pag;
+               spin_unlock(&pag->pag_buf_lock);
+       } else {
+               XFS_STATS_INC(xb_miss_locked);
+               spin_unlock(&pag->pag_buf_lock);
+               xfs_perag_put(pag);
+       }
+       return new_bp;
+
+found:
+       spin_unlock(&pag->pag_buf_lock);
+       xfs_perag_put(pag);
+
+       if (!xfs_buf_trylock(bp)) {
+               if (flags & XBF_TRYLOCK) {
+                       xfs_buf_rele(bp);
+                       XFS_STATS_INC(xb_busy_locked);
+                       return NULL;
+               }
+               xfs_buf_lock(bp);
+               XFS_STATS_INC(xb_get_locked_waited);
+       }
+
+       /*
+        * if the buffer is stale, clear all the external state associated with
+        * it. We need to keep flags such as how we allocated the buffer memory
+        * intact here.
+        */
+       if (bp->b_flags & XBF_STALE) {
+               ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
+               bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
+       }
+
+       trace_xfs_buf_find(bp, flags, _RET_IP_);
+       XFS_STATS_INC(xb_get_locked);
+       return bp;
+}
+
+/*
+ *     Assembles a buffer covering the specified range.
+ *     Storage in memory for all portions of the buffer will be allocated,
+ *     although backing storage may not be.
+ */
+xfs_buf_t *
+xfs_buf_get(
+       xfs_buftarg_t           *target,/* target for buffer            */
+       xfs_off_t               ioff,   /* starting offset of range     */
+       size_t                  isize,  /* length of range              */
+       xfs_buf_flags_t         flags)
+{
+       xfs_buf_t               *bp, *new_bp;
+       int                     error = 0;
+
+       new_bp = xfs_buf_allocate(flags);
+       if (unlikely(!new_bp))
+               return NULL;
+
+       bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
+       if (bp == new_bp) {
+               error = xfs_buf_allocate_memory(bp, flags);
+               if (error)
+                       goto no_buffer;
+       } else {
+               xfs_buf_deallocate(new_bp);
+               if (unlikely(bp == NULL))
+                       return NULL;
+       }
+
+       if (!(bp->b_flags & XBF_MAPPED)) {
+               error = _xfs_buf_map_pages(bp, flags);
+               if (unlikely(error)) {
+                       xfs_warn(target->bt_mount,
+                               "%s: failed to map pages\n", __func__);
+                       goto no_buffer;
+               }
+       }
+
+       XFS_STATS_INC(xb_get);
+
+       /*
+        * Always fill in the block number now, the mapped cases can do
+        * their own overlay of this later.
+        */
+       bp->b_bn = ioff;
+       bp->b_count_desired = bp->b_buffer_length;
+
+       trace_xfs_buf_get(bp, flags, _RET_IP_);
+       return bp;
+
+ no_buffer:
+       if (flags & (XBF_LOCK | XBF_TRYLOCK))
+               xfs_buf_unlock(bp);
+       xfs_buf_rele(bp);
+       return NULL;
+}
+
+STATIC int
+_xfs_buf_read(
+       xfs_buf_t               *bp,
+       xfs_buf_flags_t         flags)
+{
+       int                     status;
+
+       ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
+       ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+
+       bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD);
+       bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
+
+       status = xfs_buf_iorequest(bp);
+       if (status || bp->b_error || (flags & XBF_ASYNC))
+               return status;
+       return xfs_buf_iowait(bp);
+}
+
+xfs_buf_t *
+xfs_buf_read(
+       xfs_buftarg_t           *target,
+       xfs_off_t               ioff,
+       size_t                  isize,
+       xfs_buf_flags_t         flags)
+{
+       xfs_buf_t               *bp;
+
+       flags |= XBF_READ;
+
+       bp = xfs_buf_get(target, ioff, isize, flags);
+       if (bp) {
+               trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+               if (!XFS_BUF_ISDONE(bp)) {
+                       XFS_STATS_INC(xb_get_read);
+                       _xfs_buf_read(bp, flags);
+               } else if (flags & XBF_ASYNC) {
+                       /*
+                        * Read ahead call which is already satisfied,
+                        * drop the buffer
+                        */
+                       goto no_buffer;
+               } else {
+                       /* We do not want read in the flags */
+                       bp->b_flags &= ~XBF_READ;
+               }
+       }
+
+       return bp;
+
+ no_buffer:
+       if (flags & (XBF_LOCK | XBF_TRYLOCK))
+               xfs_buf_unlock(bp);
+       xfs_buf_rele(bp);
+       return NULL;
+}
+
+/*
+ *     If we are not low on memory then do the readahead in a deadlock
+ *     safe manner.
+ */
+void
+xfs_buf_readahead(
+       xfs_buftarg_t           *target,
+       xfs_off_t               ioff,
+       size_t                  isize)
+{
+       if (bdi_read_congested(target->bt_bdi))
+               return;
+
+       xfs_buf_read(target, ioff, isize,
+                    XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
+}
+
+/*
+ * Read an uncached buffer from disk. Allocates and returns a locked
+ * buffer containing the disk contents or nothing.
+ */
+struct xfs_buf *
+xfs_buf_read_uncached(
+       struct xfs_mount        *mp,
+       struct xfs_buftarg      *target,
+       xfs_daddr_t             daddr,
+       size_t                  length,
+       int                     flags)
+{
+       xfs_buf_t               *bp;
+       int                     error;
+
+       bp = xfs_buf_get_uncached(target, length, flags);
+       if (!bp)
+               return NULL;
+
+       /* set up the buffer for a read IO */
+       XFS_BUF_SET_ADDR(bp, daddr);
+       XFS_BUF_READ(bp);
+
+       xfsbdstrat(mp, bp);
+       error = xfs_buf_iowait(bp);
+       if (error || bp->b_error) {
+               xfs_buf_relse(bp);
+               return NULL;
+       }
+       return bp;
+}
+
+xfs_buf_t *
+xfs_buf_get_empty(
+       size_t                  len,
+       xfs_buftarg_t           *target)
+{
+       xfs_buf_t               *bp;
+
+       bp = xfs_buf_allocate(0);
+       if (bp)
+               _xfs_buf_initialize(bp, target, 0, len, 0);
+       return bp;
+}
+
+/*
+ * Return a buffer allocated as an empty buffer and associated to external
+ * memory via xfs_buf_associate_memory() back to it's empty state.
+ */
+void
+xfs_buf_set_empty(
+       struct xfs_buf          *bp,
+       size_t                  len)
+{
+       if (bp->b_pages)
+               _xfs_buf_free_pages(bp);
+
+       bp->b_pages = NULL;
+       bp->b_page_count = 0;
+       bp->b_addr = NULL;
+       bp->b_file_offset = 0;
+       bp->b_buffer_length = bp->b_count_desired = len;
+       bp->b_bn = XFS_BUF_DADDR_NULL;
+       bp->b_flags &= ~XBF_MAPPED;
+}
+
+static inline struct page *
+mem_to_page(
+       void                    *addr)
+{
+       if ((!is_vmalloc_addr(addr))) {
+               return virt_to_page(addr);
+       } else {
+               return vmalloc_to_page(addr);
+       }
+}
+
+int
+xfs_buf_associate_memory(
+       xfs_buf_t               *bp,
+       void                    *mem,
+       size_t                  len)
+{
+       int                     rval;
+       int                     i = 0;
+       unsigned long           pageaddr;
+       unsigned long           offset;
+       size_t                  buflen;
+       int                     page_count;
+
+       pageaddr = (unsigned long)mem & PAGE_MASK;
+       offset = (unsigned long)mem - pageaddr;
+       buflen = PAGE_ALIGN(len + offset);
+       page_count = buflen >> PAGE_SHIFT;
+
+       /* Free any previous set of page pointers */
+       if (bp->b_pages)
+               _xfs_buf_free_pages(bp);
+
+       bp->b_pages = NULL;
+       bp->b_addr = mem;
+
+       rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
+       if (rval)
+               return rval;
+
+       bp->b_offset = offset;
+
+       for (i = 0; i < bp->b_page_count; i++) {
+               bp->b_pages[i] = mem_to_page((void *)pageaddr);
+               pageaddr += PAGE_SIZE;
+       }
+
+       bp->b_count_desired = len;
+       bp->b_buffer_length = buflen;
+       bp->b_flags |= XBF_MAPPED;
+
+       return 0;
+}
+
+xfs_buf_t *
+xfs_buf_get_uncached(
+       struct xfs_buftarg      *target,
+       size_t                  len,
+       int                     flags)
+{
+       unsigned long           page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
+       int                     error, i;
+       xfs_buf_t               *bp;
+
+       bp = xfs_buf_allocate(0);
+       if (unlikely(bp == NULL))
+               goto fail;
+       _xfs_buf_initialize(bp, target, 0, len, 0);
+
+       error = _xfs_buf_get_pages(bp, page_count, 0);
+       if (error)
+               goto fail_free_buf;
+
+       for (i = 0; i < page_count; i++) {
+               bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
+               if (!bp->b_pages[i])
+                       goto fail_free_mem;
+       }
+       bp->b_flags |= _XBF_PAGES;
+
+       error = _xfs_buf_map_pages(bp, XBF_MAPPED);
+       if (unlikely(error)) {
+               xfs_warn(target->bt_mount,
+                       "%s: failed to map pages\n", __func__);
+               goto fail_free_mem;
+       }
+
+       trace_xfs_buf_get_uncached(bp, _RET_IP_);
+       return bp;
+
+ fail_free_mem:
+       while (--i >= 0)
+               __free_page(bp->b_pages[i]);
+       _xfs_buf_free_pages(bp);
+ fail_free_buf:
+       xfs_buf_deallocate(bp);
+ fail:
+       return NULL;
+}
+
+/*
+ *     Increment reference count on buffer, to hold the buffer concurrently
+ *     with another thread which may release (free) the buffer asynchronously.
+ *     Must hold the buffer already to call this function.
+ */
+void
+xfs_buf_hold(
+       xfs_buf_t               *bp)
+{
+       trace_xfs_buf_hold(bp, _RET_IP_);
+       atomic_inc(&bp->b_hold);
+}
+
+/*
+ *     Releases a hold on the specified buffer.  If the
+ *     the hold count is 1, calls xfs_buf_free.
+ */
+void
+xfs_buf_rele(
+       xfs_buf_t               *bp)
+{
+       struct xfs_perag        *pag = bp->b_pag;
+
+       trace_xfs_buf_rele(bp, _RET_IP_);
+
+       if (!pag) {
+               ASSERT(list_empty(&bp->b_lru));
+               ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
+               if (atomic_dec_and_test(&bp->b_hold))
+                       xfs_buf_free(bp);
+               return;
+       }
+
+       ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
+
+       ASSERT(atomic_read(&bp->b_hold) > 0);
+       if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
+               if (!(bp->b_flags & XBF_STALE) &&
+                          atomic_read(&bp->b_lru_ref)) {
+                       xfs_buf_lru_add(bp);
+                       spin_unlock(&pag->pag_buf_lock);
+               } else {
+                       xfs_buf_lru_del(bp);
+                       ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
+                       rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+                       spin_unlock(&pag->pag_buf_lock);
+                       xfs_perag_put(pag);
+                       xfs_buf_free(bp);
+               }
+       }
+}
+
+
+/*
+ *     Lock a buffer object, if it is not already locked.
+ *
+ *     If we come across a stale, pinned, locked buffer, we know that we are
+ *     being asked to lock a buffer that has been reallocated. Because it is
+ *     pinned, we know that the log has not been pushed to disk and hence it
+ *     will still be locked.  Rather than continuing to have trylock attempts
+ *     fail until someone else pushes the log, push it ourselves before
+ *     returning.  This means that the xfsaild will not get stuck trying
+ *     to push on stale inode buffers.
+ */
+int
+xfs_buf_trylock(
+       struct xfs_buf          *bp)
+{
+       int                     locked;
+
+       locked = down_trylock(&bp->b_sema) == 0;
+       if (locked)
+               XB_SET_OWNER(bp);
+       else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+               xfs_log_force(bp->b_target->bt_mount, 0);
+
+       trace_xfs_buf_trylock(bp, _RET_IP_);
+       return locked;
+}
+
+/*
+ *     Lock a buffer object.
+ *
+ *     If we come across a stale, pinned, locked buffer, we know that we
+ *     are being asked to lock a buffer that has been reallocated. Because
+ *     it is pinned, we know that the log has not been pushed to disk and
+ *     hence it will still be locked. Rather than sleeping until someone
+ *     else pushes the log, push it ourselves before trying to get the lock.
+ */
+void
+xfs_buf_lock(
+       struct xfs_buf          *bp)
+{
+       trace_xfs_buf_lock(bp, _RET_IP_);
+
+       if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+               xfs_log_force(bp->b_target->bt_mount, 0);
+       down(&bp->b_sema);
+       XB_SET_OWNER(bp);
+
+       trace_xfs_buf_lock_done(bp, _RET_IP_);
+}
+
+/*
+ *     Releases the lock on the buffer object.
+ *     If the buffer is marked delwri but is not queued, do so before we
+ *     unlock the buffer as we need to set flags correctly.  We also need to
+ *     take a reference for the delwri queue because the unlocker is going to
+ *     drop their's and they don't know we just queued it.
+ */
+void
+xfs_buf_unlock(
+       struct xfs_buf          *bp)
+{
+       if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
+               atomic_inc(&bp->b_hold);
+               bp->b_flags |= XBF_ASYNC;
+               xfs_buf_delwri_queue(bp, 0);
+       }
+
+       XB_CLEAR_OWNER(bp);
+       up(&bp->b_sema);
+
+       trace_xfs_buf_unlock(bp, _RET_IP_);
+}
+
+STATIC void
+xfs_buf_wait_unpin(
+       xfs_buf_t               *bp)
+{
+       DECLARE_WAITQUEUE       (wait, current);
+
+       if (atomic_read(&bp->b_pin_count) == 0)
+               return;
+
+       add_wait_queue(&bp->b_waiters, &wait);
+       for (;;) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               if (atomic_read(&bp->b_pin_count) == 0)
+                       break;
+               io_schedule();
+       }
+       remove_wait_queue(&bp->b_waiters, &wait);
+       set_current_state(TASK_RUNNING);
+}
+
+/*
+ *     Buffer Utility Routines
+ */
+
+STATIC void
+xfs_buf_iodone_work(
+       struct work_struct      *work)
+{
+       xfs_buf_t               *bp =
+               container_of(work, xfs_buf_t, b_iodone_work);
+
+       if (bp->b_iodone)
+               (*(bp->b_iodone))(bp);
+       else if (bp->b_flags & XBF_ASYNC)
+               xfs_buf_relse(bp);
+}
+
+void
+xfs_buf_ioend(
+       xfs_buf_t               *bp,
+       int                     schedule)
+{
+       trace_xfs_buf_iodone(bp, _RET_IP_);
+
+       bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
+       if (bp->b_error == 0)
+               bp->b_flags |= XBF_DONE;
+
+       if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
+               if (schedule) {
+                       INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
+                       queue_work(xfslogd_workqueue, &bp->b_iodone_work);
+               } else {
+                       xfs_buf_iodone_work(&bp->b_iodone_work);
+               }
+       } else {
+               complete(&bp->b_iowait);
+       }
+}
+
+void
+xfs_buf_ioerror(
+       xfs_buf_t               *bp,
+       int                     error)
+{
+       ASSERT(error >= 0 && error <= 0xffff);
+       bp->b_error = (unsigned short)error;
+       trace_xfs_buf_ioerror(bp, error, _RET_IP_);
+}
+
+int
+xfs_bwrite(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp)
+{
+       int                     error;
+
+       bp->b_flags |= XBF_WRITE;
+       bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
+
+       xfs_buf_delwri_dequeue(bp);
+       xfs_bdstrat_cb(bp);
+
+       error = xfs_buf_iowait(bp);
+       if (error)
+               xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+       xfs_buf_relse(bp);
+       return error;
+}
+
+void
+xfs_bdwrite(
+       void                    *mp,
+       struct xfs_buf          *bp)
+{
+       trace_xfs_buf_bdwrite(bp, _RET_IP_);
+
+       bp->b_flags &= ~XBF_READ;
+       bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
+
+       xfs_buf_delwri_queue(bp, 1);
+}
+
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+       xfs_buf_t *bp)
+{
+#ifdef XFSERRORDEBUG
+       ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+       /*
+        * No need to wait until the buffer is unpinned, we aren't flushing it.
+        */
+       xfs_buf_ioerror(bp, EIO);
+
+       /*
+        * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
+        */
+       XFS_BUF_UNREAD(bp);
+       XFS_BUF_UNDELAYWRITE(bp);
+       XFS_BUF_UNDONE(bp);
+       XFS_BUF_STALE(bp);
+
+       xfs_buf_ioend(bp, 0);
+
+       return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the xfs_buf_ioend call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+       struct xfs_buf  *bp)
+{
+       int64_t         fl = bp->b_flags;
+       /*
+        * No need to wait until the buffer is unpinned.
+        * We aren't flushing it.
+        *
+        * chunkhold expects B_DONE to be set, whether
+        * we actually finish the I/O or not. We don't want to
+        * change that interface.
+        */
+       XFS_BUF_UNREAD(bp);
+       XFS_BUF_UNDELAYWRITE(bp);
+       XFS_BUF_DONE(bp);
+       XFS_BUF_STALE(bp);
+       bp->b_iodone = NULL;
+       if (!(fl & XBF_ASYNC)) {
+               /*
+                * Mark b_error and B_ERROR _both_.
+                * Lot's of chunkcache code assumes that.
+                * There's no reason to mark error for
+                * ASYNC buffers.
+                */
+               xfs_buf_ioerror(bp, EIO);
+               XFS_BUF_FINISH_IOWAIT(bp);
+       } else {
+               xfs_buf_relse(bp);
+       }
+
+       return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+       struct xfs_buf  *bp)
+{
+       if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
+               trace_xfs_bdstrat_shut(bp, _RET_IP_);
+               /*
+                * Metadata write that didn't get logged but
+                * written delayed anyway. These aren't associated
+                * with a transaction, and can be ignored.
+                */
+               if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+                       return xfs_bioerror_relse(bp);
+               else
+                       return xfs_bioerror(bp);
+       }
+
+       xfs_buf_iorequest(bp);
+       return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem.  Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+       struct xfs_mount        *mp,
+       struct xfs_buf          *bp)
+{
+       if (XFS_FORCED_SHUTDOWN(mp)) {
+               trace_xfs_bdstrat_shut(bp, _RET_IP_);
+               xfs_bioerror_relse(bp);
+               return;
+       }
+
+       xfs_buf_iorequest(bp);
+}
+
+STATIC void
+_xfs_buf_ioend(
+       xfs_buf_t               *bp,
+       int                     schedule)
+{
+       if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
+               xfs_buf_ioend(bp, schedule);
+}
+
+STATIC void
+xfs_buf_bio_end_io(
+       struct bio              *bio,
+       int                     error)
+{
+       xfs_buf_t               *bp = (xfs_buf_t *)bio->bi_private;
+
+       xfs_buf_ioerror(bp, -error);
+
+       if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+               invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
+
+       _xfs_buf_ioend(bp, 1);
+       bio_put(bio);
+}
+
+STATIC void
+_xfs_buf_ioapply(
+       xfs_buf_t               *bp)
+{
+       int                     rw, map_i, total_nr_pages, nr_pages;
+       struct bio              *bio;
+       int                     offset = bp->b_offset;
+       int                     size = bp->b_count_desired;
+       sector_t                sector = bp->b_bn;
+
+       total_nr_pages = bp->b_page_count;
+       map_i = 0;
+
+       if (bp->b_flags & XBF_WRITE) {
+               if (bp->b_flags & XBF_SYNCIO)
+                       rw = WRITE_SYNC;
+               else
+                       rw = WRITE;
+               if (bp->b_flags & XBF_FUA)
+                       rw |= REQ_FUA;
+               if (bp->b_flags & XBF_FLUSH)
+                       rw |= REQ_FLUSH;
+       } else if (bp->b_flags & XBF_READ_AHEAD) {
+               rw = READA;
+       } else {
+               rw = READ;
+       }
+
+       /* we only use the buffer cache for meta-data */
+       rw |= REQ_META;
+
+next_chunk:
+       atomic_inc(&bp->b_io_remaining);
+       nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
+       if (nr_pages > total_nr_pages)
+               nr_pages = total_nr_pages;
+
+       bio = bio_alloc(GFP_NOIO, nr_pages);
+       bio->bi_bdev = bp->b_target->bt_bdev;
+       bio->bi_sector = sector;
+       bio->bi_end_io = xfs_buf_bio_end_io;
+       bio->bi_private = bp;
+
+
+       for (; size && nr_pages; nr_pages--, map_i++) {
+               int     rbytes, nbytes = PAGE_SIZE - offset;
+
+               if (nbytes > size)
+                       nbytes = size;
+
+               rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
+               if (rbytes < nbytes)
+                       break;
+
+               offset = 0;
+               sector += nbytes >> BBSHIFT;
+               size -= nbytes;
+               total_nr_pages--;
+       }
+
+       if (likely(bio->bi_size)) {
+               if (xfs_buf_is_vmapped(bp)) {
+                       flush_kernel_vmap_range(bp->b_addr,
+                                               xfs_buf_vmap_len(bp));
+               }
+               submit_bio(rw, bio);
+               if (size)
+                       goto next_chunk;
+       } else {
+               xfs_buf_ioerror(bp, EIO);
+               bio_put(bio);
+       }
+}
+
+int
+xfs_buf_iorequest(
+       xfs_buf_t               *bp)
+{
+       trace_xfs_buf_iorequest(bp, _RET_IP_);
+
+       if (bp->b_flags & XBF_DELWRI) {
+               xfs_buf_delwri_queue(bp, 1);
+               return 0;
+       }
+
+       if (bp->b_flags & XBF_WRITE) {
+               xfs_buf_wait_unpin(bp);
+       }
+
+       xfs_buf_hold(bp);
+
+       /* Set the count to 1 initially, this will stop an I/O
+        * completion callout which happens before we have started
+        * all the I/O from calling xfs_buf_ioend too early.
+        */
+       atomic_set(&bp->b_io_remaining, 1);
+       _xfs_buf_ioapply(bp);
+       _xfs_buf_ioend(bp, 0);
+
+       xfs_buf_rele(bp);
+       return 0;
+}
+
+/*
+ *     Waits for I/O to complete on the buffer supplied.
+ *     It returns immediately if no I/O is pending.
+ *     It returns the I/O error code, if any, or 0 if there was no error.
+ */
+int
+xfs_buf_iowait(
+       xfs_buf_t               *bp)
+{
+       trace_xfs_buf_iowait(bp, _RET_IP_);
+
+       wait_for_completion(&bp->b_iowait);
+
+       trace_xfs_buf_iowait_done(bp, _RET_IP_);
+       return bp->b_error;
+}
+
+xfs_caddr_t
+xfs_buf_offset(
+       xfs_buf_t               *bp,
+       size_t                  offset)
+{
+       struct page             *page;
+
+       if (bp->b_flags & XBF_MAPPED)
+               return bp->b_addr + offset;
+
+       offset += bp->b_offset;
+       page = bp->b_pages[offset >> PAGE_SHIFT];
+       return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
+}
+
+/*
+ *     Move data into or out of a buffer.
+ */
+void
+xfs_buf_iomove(
+       xfs_buf_t               *bp,    /* buffer to process            */
+       size_t                  boff,   /* starting buffer offset       */
+       size_t                  bsize,  /* length to copy               */
+       void                    *data,  /* data address                 */
+       xfs_buf_rw_t            mode)   /* read/write/zero flag         */
+{
+       size_t                  bend, cpoff, csize;
+       struct page             *page;
+
+       bend = boff + bsize;
+       while (boff < bend) {
+               page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
+               cpoff = xfs_buf_poff(boff + bp->b_offset);
+               csize = min_t(size_t,
+                             PAGE_SIZE-cpoff, bp->b_count_desired-boff);
+
+               ASSERT(((csize + cpoff) <= PAGE_SIZE));
+
+               switch (mode) {
+               case XBRW_ZERO:
+                       memset(page_address(page) + cpoff, 0, csize);
+                       break;
+               case XBRW_READ:
+                       memcpy(data, page_address(page) + cpoff, csize);
+                       break;
+               case XBRW_WRITE:
+                       memcpy(page_address(page) + cpoff, data, csize);
+               }
+
+               boff += csize;
+               data += csize;
+       }
+}
+
+/*
+ *     Handling of buffer targets (buftargs).
+ */
+
+/*
+ * Wait for any bufs with callbacks that have been submitted but have not yet
+ * returned. These buffers will have an elevated hold count, so wait on those
+ * while freeing all the buffers only held by the LRU.
+ */
+void
+xfs_wait_buftarg(
+       struct xfs_buftarg      *btp)
+{
+       struct xfs_buf          *bp;
+
+restart:
+       spin_lock(&btp->bt_lru_lock);
+       while (!list_empty(&btp->bt_lru)) {
+               bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+               if (atomic_read(&bp->b_hold) > 1) {
+                       spin_unlock(&btp->bt_lru_lock);
+                       delay(100);
+                       goto restart;
+               }
+               /*
+                * clear the LRU reference count so the bufer doesn't get
+                * ignored in xfs_buf_rele().
+                */
+               atomic_set(&bp->b_lru_ref, 0);
+               spin_unlock(&btp->bt_lru_lock);
+               xfs_buf_rele(bp);
+               spin_lock(&btp->bt_lru_lock);
+       }
+       spin_unlock(&btp->bt_lru_lock);
+}
+
+int
+xfs_buftarg_shrink(
+       struct shrinker         *shrink,
+       struct shrink_control   *sc)
+{
+       struct xfs_buftarg      *btp = container_of(shrink,
+                                       struct xfs_buftarg, bt_shrinker);
+       struct xfs_buf          *bp;
+       int nr_to_scan = sc->nr_to_scan;
+       LIST_HEAD(dispose);
+
+       if (!nr_to_scan)
+               return btp->bt_lru_nr;
+
+       spin_lock(&btp->bt_lru_lock);
+       while (!list_empty(&btp->bt_lru)) {
+               if (nr_to_scan-- <= 0)
+                       break;
+
+               bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
+
+               /*
+                * Decrement the b_lru_ref count unless the value is already
+                * zero. If the value is already zero, we need to reclaim the
+                * buffer, otherwise it gets another trip through the LRU.
+                */
+               if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+                       list_move_tail(&bp->b_lru, &btp->bt_lru);
+                       continue;
+               }
+
+               /*
+                * remove the buffer from the LRU now to avoid needing another
+                * lock round trip inside xfs_buf_rele().
+                */
+               list_move(&bp->b_lru, &dispose);
+               btp->bt_lru_nr--;
+       }
+       spin_unlock(&btp->bt_lru_lock);
+
+       while (!list_empty(&dispose)) {
+               bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+               list_del_init(&bp->b_lru);
+               xfs_buf_rele(bp);
+       }
+
+       return btp->bt_lru_nr;
+}
+
+void
+xfs_free_buftarg(
+       struct xfs_mount        *mp,
+       struct xfs_buftarg      *btp)
+{
+       unregister_shrinker(&btp->bt_shrinker);
+
+       xfs_flush_buftarg(btp, 1);
+       if (mp->m_flags & XFS_MOUNT_BARRIER)
+               xfs_blkdev_issue_flush(btp);
+
+       kthread_stop(btp->bt_task);
+       kmem_free(btp);
+}
+
+STATIC int
+xfs_setsize_buftarg_flags(
+       xfs_buftarg_t           *btp,
+       unsigned int            blocksize,
+       unsigned int            sectorsize,
+       int                     verbose)
+{
+       btp->bt_bsize = blocksize;
+       btp->bt_sshift = ffs(sectorsize) - 1;
+       btp->bt_smask = sectorsize - 1;
+
+       if (set_blocksize(btp->bt_bdev, sectorsize)) {
+               xfs_warn(btp->bt_mount,
+                       "Cannot set_blocksize to %u on device %s\n",
+                       sectorsize, xfs_buf_target_name(btp));
+               return EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ *     When allocating the initial buffer target we have not yet
+ *     read in the superblock, so don't know what sized sectors
+ *     are being used is at this early stage.  Play safe.
+ */
+STATIC int
+xfs_setsize_buftarg_early(
+       xfs_buftarg_t           *btp,
+       struct block_device     *bdev)
+{
+       return xfs_setsize_buftarg_flags(btp,
+                       PAGE_SIZE, bdev_logical_block_size(bdev), 0);
+}
+
+int
+xfs_setsize_buftarg(
+       xfs_buftarg_t           *btp,
+       unsigned int            blocksize,
+       unsigned int            sectorsize)
+{
+       return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
+}
+
+STATIC int
+xfs_alloc_delwrite_queue(
+       xfs_buftarg_t           *btp,
+       const char              *fsname)
+{
+       INIT_LIST_HEAD(&btp->bt_delwrite_queue);
+       spin_lock_init(&btp->bt_delwrite_lock);
+       btp->bt_flags = 0;
+       btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
+       if (IS_ERR(btp->bt_task))
+               return PTR_ERR(btp->bt_task);
+       return 0;
+}
+
+xfs_buftarg_t *
+xfs_alloc_buftarg(
+       struct xfs_mount        *mp,
+       struct block_device     *bdev,
+       int                     external,
+       const char              *fsname)
+{
+       xfs_buftarg_t           *btp;
+
+       btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
+
+       btp->bt_mount = mp;
+       btp->bt_dev =  bdev->bd_dev;
+       btp->bt_bdev = bdev;
+       btp->bt_bdi = blk_get_backing_dev_info(bdev);
+       if (!btp->bt_bdi)
+               goto error;
+
+       INIT_LIST_HEAD(&btp->bt_lru);
+       spin_lock_init(&btp->bt_lru_lock);
+       if (xfs_setsize_buftarg_early(btp, bdev))
+               goto error;
+       if (xfs_alloc_delwrite_queue(btp, fsname))
+               goto error;
+       btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+       btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+       register_shrinker(&btp->bt_shrinker);
+       return btp;
+
+error:
+       kmem_free(btp);
+       return NULL;
+}
+
+
+/*
+ *     Delayed write buffer handling
+ */
+STATIC void
+xfs_buf_delwri_queue(
+       xfs_buf_t               *bp,
+       int                     unlock)
+{
+       struct list_head        *dwq = &bp->b_target->bt_delwrite_queue;
+       spinlock_t              *dwlk = &bp->b_target->bt_delwrite_lock;
+
+       trace_xfs_buf_delwri_queue(bp, _RET_IP_);
+
+       ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
+
+       spin_lock(dwlk);
+       /* If already in the queue, dequeue and place at tail */
+       if (!list_empty(&bp->b_list)) {
+               ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+               if (unlock)
+                       atomic_dec(&bp->b_hold);
+               list_del(&bp->b_list);
+       }
+
+       if (list_empty(dwq)) {
+               /* start xfsbufd as it is about to have something to do */
+               wake_up_process(bp->b_target->bt_task);
+       }
+
+       bp->b_flags |= _XBF_DELWRI_Q;
+       list_add_tail(&bp->b_list, dwq);
+       bp->b_queuetime = jiffies;
+       spin_unlock(dwlk);
+
+       if (unlock)
+               xfs_buf_unlock(bp);
+}
+
+void
+xfs_buf_delwri_dequeue(
+       xfs_buf_t               *bp)
+{
+       spinlock_t              *dwlk = &bp->b_target->bt_delwrite_lock;
+       int                     dequeued = 0;
+
+       spin_lock(dwlk);
+       if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
+               ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+               list_del_init(&bp->b_list);
+               dequeued = 1;
+       }
+       bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
+       spin_unlock(dwlk);
+
+       if (dequeued)
+               xfs_buf_rele(bp);
+
+       trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
+}
+
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time the
+ * xfsbufd sees it is guaranteed to be considered old enough to flush.
+ */
+void
+xfs_buf_delwri_promote(
+       struct xfs_buf  *bp)
+{
+       struct xfs_buftarg *btp = bp->b_target;
+       long            age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+       ASSERT(bp->b_flags & XBF_DELWRI);
+       ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+       /*
+        * Check the buffer age before locking the delayed write queue as we
+        * don't need to promote buffers that are already past the flush age.
+        */
+       if (bp->b_queuetime < jiffies - age)
+               return;
+       bp->b_queuetime = jiffies - age;
+       spin_lock(&btp->bt_delwrite_lock);
+       list_move(&bp->b_list, &btp->bt_delwrite_queue);
+       spin_unlock(&btp->bt_delwrite_lock);
+}
+
+STATIC void
+xfs_buf_runall_queues(
+       struct workqueue_struct *queue)
+{
+       flush_workqueue(queue);
+}
+
+/*
+ * Move as many buffers as specified to the supplied list
+ * idicating if we skipped any buffers to prevent deadlocks.
+ */
+STATIC int
+xfs_buf_delwri_split(
+       xfs_buftarg_t   *target,
+       struct list_head *list,
+       unsigned long   age)
+{
+       xfs_buf_t       *bp, *n;
+       struct list_head *dwq = &target->bt_delwrite_queue;
+       spinlock_t      *dwlk = &target->bt_delwrite_lock;
+       int             skipped = 0;
+       int             force;
+
+       force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
+       INIT_LIST_HEAD(list);
+       spin_lock(dwlk);
+       list_for_each_entry_safe(bp, n, dwq, b_list) {
+               ASSERT(bp->b_flags & XBF_DELWRI);
+
+               if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
+                       if (!force &&
+                           time_before(jiffies, bp->b_queuetime + age)) {
+                               xfs_buf_unlock(bp);
+                               break;
+                       }
+
+                       bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
+                       bp->b_flags |= XBF_WRITE;
+                       list_move_tail(&bp->b_list, list);
+                       trace_xfs_buf_delwri_split(bp, _RET_IP_);
+               } else
+                       skipped++;
+       }
+       spin_unlock(dwlk);
+
+       return skipped;
+
+}
+
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+       void            *priv,
+       struct list_head *a,
+       struct list_head *b)
+{
+       struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
+       struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
+       xfs_daddr_t             diff;
+
+       diff = ap->b_bn - bp->b_bn;
+       if (diff < 0)
+               return -1;
+       if (diff > 0)
+               return 1;
+       return 0;
+}
+
+STATIC int
+xfsbufd(
+       void            *data)
+{
+       xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
+
+       current->flags |= PF_MEMALLOC;
+
+       set_freezable();
+
+       do {
+               long    age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+               long    tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+               struct list_head tmp;
+               struct blk_plug plug;
+
+               if (unlikely(freezing(current))) {
+                       set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
+                       refrigerator();
+               } else {
+                       clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
+               }
+
+               /* sleep for a long time if there is nothing to do. */
+               if (list_empty(&target->bt_delwrite_queue))
+                       tout = MAX_SCHEDULE_TIMEOUT;
+               schedule_timeout_interruptible(tout);
+
+               xfs_buf_delwri_split(target, &tmp, age);
+               list_sort(NULL, &tmp, xfs_buf_cmp);
+
+               blk_start_plug(&plug);
+               while (!list_empty(&tmp)) {
+                       struct xfs_buf *bp;
+                       bp = list_first_entry(&tmp, struct xfs_buf, b_list);
+                       list_del_init(&bp->b_list);
+                       xfs_bdstrat_cb(bp);
+               }
+               blk_finish_plug(&plug);
+       } while (!kthread_should_stop());
+
+       return 0;
+}
+
+/*
+ *     Go through all incore buffers, and release buffers if they belong to
+ *     the given device. This is used in filesystem error handling to
+ *     preserve the consistency of its metadata.
+ */
+int
+xfs_flush_buftarg(
+       xfs_buftarg_t   *target,
+       int             wait)
+{
+       xfs_buf_t       *bp;
+       int             pincount = 0;
+       LIST_HEAD(tmp_list);
+       LIST_HEAD(wait_list);
+       struct blk_plug plug;
+
+       xfs_buf_runall_queues(xfsconvertd_workqueue);
+       xfs_buf_runall_queues(xfsdatad_workqueue);
+       xfs_buf_runall_queues(xfslogd_workqueue);
+
+       set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
+       pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
+
+       /*
+        * Dropped the delayed write list lock, now walk the temporary list.
+        * All I/O is issued async and then if we need to wait for completion
+        * we do that after issuing all the IO.
+        */
+       list_sort(NULL, &tmp_list, xfs_buf_cmp);
+
+       blk_start_plug(&plug);
+       while (!list_empty(&tmp_list)) {
+               bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
+               ASSERT(target == bp->b_target);
+               list_del_init(&bp->b_list);
+               if (wait) {
+                       bp->b_flags &= ~XBF_ASYNC;
+                       list_add(&bp->b_list, &wait_list);
+               }
+               xfs_bdstrat_cb(bp);
+       }
+       blk_finish_plug(&plug);
+
+       if (wait) {
+               /* Wait for IO to complete. */
+               while (!list_empty(&wait_list)) {
+                       bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
+
+                       list_del_init(&bp->b_list);
+                       xfs_buf_iowait(bp);
+                       xfs_buf_relse(bp);
+               }
+       }
+
+       return pincount;
+}
+
+int __init
+xfs_buf_init(void)
+{
+       xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
+                                               KM_ZONE_HWALIGN, NULL);
+       if (!xfs_buf_zone)
+               goto out;
+
+       xfslogd_workqueue = alloc_workqueue("xfslogd",
+                                       WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
+       if (!xfslogd_workqueue)
+               goto out_free_buf_zone;
+
+       xfsdatad_workqueue = alloc_workqueue("xfsdatad", WQ_MEM_RECLAIM, 1);
+       if (!xfsdatad_workqueue)
+               goto out_destroy_xfslogd_workqueue;
+
+       xfsconvertd_workqueue = alloc_workqueue("xfsconvertd",
+                                               WQ_MEM_RECLAIM, 1);
+       if (!xfsconvertd_workqueue)
+               goto out_destroy_xfsdatad_workqueue;
+
+       return 0;
+
+ out_destroy_xfsdatad_workqueue:
+       destroy_workqueue(xfsdatad_workqueue);
+ out_destroy_xfslogd_workqueue:
+       destroy_workqueue(xfslogd_workqueue);
+ out_free_buf_zone:
+       kmem_zone_destroy(xfs_buf_zone);
+ out:
+       return -ENOMEM;
+}
+
+void
+xfs_buf_terminate(void)
+{
+       destroy_workqueue(xfsconvertd_workqueue);
+       destroy_workqueue(xfsdatad_workqueue);
+       destroy_workqueue(xfslogd_workqueue);
+       kmem_zone_destroy(xfs_buf_zone);
+}
+
+#ifdef CONFIG_KDB_MODULES
+struct list_head *
+xfs_get_buftarg_list(void)
+{
+       return &xfs_buftarg_list;
+}
+#endif
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
new file mode 100644 (file)
index 0000000..620972b
--- /dev/null
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_BUF_H__
+#define __XFS_BUF_H__
+
+#include <linux/list.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <asm/system.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/uio.h>
+
+/*
+ *     Base types
+ */
+
+#define XFS_BUF_DADDR_NULL     ((xfs_daddr_t) (-1LL))
+
+#define xfs_buf_ctob(pp)       ((pp) * PAGE_CACHE_SIZE)
+#define xfs_buf_btoc(dd)       (((dd) + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT)
+#define xfs_buf_btoct(dd)      ((dd) >> PAGE_CACHE_SHIFT)
+#define xfs_buf_poff(aa)       ((aa) & ~PAGE_CACHE_MASK)
+
+typedef enum {
+       XBRW_READ = 1,                  /* transfer into target memory */
+       XBRW_WRITE = 2,                 /* transfer from target memory */
+       XBRW_ZERO = 3,                  /* Zero target memory */
+} xfs_buf_rw_t;
+
+#define XBF_READ       (1 << 0) /* buffer intended for reading from device */
+#define XBF_WRITE      (1 << 1) /* buffer intended for writing to device */
+#define XBF_READ_AHEAD (1 << 2) /* asynchronous read-ahead */
+#define XBF_MAPPED     (1 << 3) /* buffer mapped (b_addr valid) */
+#define XBF_ASYNC      (1 << 4) /* initiator will not wait for completion */
+#define XBF_DONE       (1 << 5) /* all pages in the buffer uptodate */
+#define XBF_DELWRI     (1 << 6) /* buffer has dirty pages */
+#define XBF_STALE      (1 << 7) /* buffer has been staled, do not find it */
+
+/* I/O hints for the BIO layer */
+#define XBF_SYNCIO     (1 << 10)/* treat this buffer as synchronous I/O */
+#define XBF_FUA                (1 << 11)/* force cache write through mode */
+#define XBF_FLUSH      (1 << 12)/* flush the disk cache before a write */
+
+/* flags used only as arguments to access routines */
+#define XBF_LOCK       (1 << 15)/* lock requested */
+#define XBF_TRYLOCK    (1 << 16)/* lock requested, but do not wait */
+#define XBF_DONT_BLOCK (1 << 17)/* do not block in current thread */
+
+/* flags used only internally */
+#define _XBF_PAGES     (1 << 20)/* backed by refcounted pages */
+#define _XBF_KMEM      (1 << 21)/* backed by heap memory */
+#define _XBF_DELWRI_Q  (1 << 22)/* buffer on delwri queue */
+
+typedef unsigned int xfs_buf_flags_t;
+
+#define XFS_BUF_FLAGS \
+       { XBF_READ,             "READ" }, \
+       { XBF_WRITE,            "WRITE" }, \
+       { XBF_READ_AHEAD,       "READ_AHEAD" }, \
+       { XBF_MAPPED,           "MAPPED" }, \
+       { XBF_ASYNC,            "ASYNC" }, \
+       { XBF_DONE,             "DONE" }, \
+       { XBF_DELWRI,           "DELWRI" }, \
+       { XBF_STALE,            "STALE" }, \
+       { XBF_SYNCIO,           "SYNCIO" }, \
+       { XBF_FUA,              "FUA" }, \
+       { XBF_FLUSH,            "FLUSH" }, \
+       { XBF_LOCK,             "LOCK" },       /* should never be set */\
+       { XBF_TRYLOCK,          "TRYLOCK" },    /* ditto */\
+       { XBF_DONT_BLOCK,       "DONT_BLOCK" }, /* ditto */\
+       { _XBF_PAGES,           "PAGES" }, \
+       { _XBF_KMEM,            "KMEM" }, \
+       { _XBF_DELWRI_Q,        "DELWRI_Q" }
+
+typedef enum {
+       XBT_FORCE_SLEEP = 0,
+       XBT_FORCE_FLUSH = 1,
+} xfs_buftarg_flags_t;
+
+typedef struct xfs_buftarg {
+       dev_t                   bt_dev;
+       struct block_device     *bt_bdev;
+       struct backing_dev_info *bt_bdi;
+       struct xfs_mount        *bt_mount;
+       unsigned int            bt_bsize;
+       unsigned int            bt_sshift;
+       size_t                  bt_smask;
+
+       /* per device delwri queue */
+       struct task_struct      *bt_task;
+       struct list_head        bt_delwrite_queue;
+       spinlock_t              bt_delwrite_lock;
+       unsigned long           bt_flags;
+
+       /* LRU control structures */
+       struct shrinker         bt_shrinker;
+       struct list_head        bt_lru;
+       spinlock_t              bt_lru_lock;
+       unsigned int            bt_lru_nr;
+} xfs_buftarg_t;
+
+struct xfs_buf;
+typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
+
+#define XB_PAGES       2
+
+typedef struct xfs_buf {
+       /*
+        * first cacheline holds all the fields needed for an uncontended cache
+        * hit to be fully processed. The semaphore straddles the cacheline
+        * boundary, but the counter and lock sits on the first cacheline,
+        * which is the only bit that is touched if we hit the semaphore
+        * fast-path on locking.
+        */
+       struct rb_node          b_rbnode;       /* rbtree node */
+       xfs_off_t               b_file_offset;  /* offset in file */
+       size_t                  b_buffer_length;/* size of buffer in bytes */
+       atomic_t                b_hold;         /* reference count */
+       atomic_t                b_lru_ref;      /* lru reclaim ref count */
+       xfs_buf_flags_t         b_flags;        /* status flags */
+       struct semaphore        b_sema;         /* semaphore for lockables */
+
+       struct list_head        b_lru;          /* lru list */
+       wait_queue_head_t       b_waiters;      /* unpin waiters */
+       struct list_head        b_list;
+       struct xfs_perag        *b_pag;         /* contains rbtree root */
+       xfs_buftarg_t           *b_target;      /* buffer target (device) */
+       xfs_daddr_t             b_bn;           /* block number for I/O */
+       size_t                  b_count_desired;/* desired transfer size */
+       void                    *b_addr;        /* virtual address of buffer */
+       struct work_struct      b_iodone_work;
+       xfs_buf_iodone_t        b_iodone;       /* I/O completion function */
+       struct completion       b_iowait;       /* queue for I/O waiters */
+       void                    *b_fspriv;
+       struct xfs_trans        *b_transp;
+       struct page             **b_pages;      /* array of page pointers */
+       struct page             *b_page_array[XB_PAGES]; /* inline pages */
+       unsigned long           b_queuetime;    /* time buffer was queued */
+       atomic_t                b_pin_count;    /* pin count */
+       atomic_t                b_io_remaining; /* #outstanding I/O requests */
+       unsigned int            b_page_count;   /* size of page array */
+       unsigned int            b_offset;       /* page offset in first page */
+       unsigned short          b_error;        /* error code on I/O */
+#ifdef XFS_BUF_LOCK_TRACKING
+       int                     b_last_holder;
+#endif
+} xfs_buf_t;
+
+
+/* Finding and Reading Buffers */
+extern xfs_buf_t *_xfs_buf_find(xfs_buftarg_t *, xfs_off_t, size_t,
+                               xfs_buf_flags_t, xfs_buf_t *);
+#define xfs_incore(buftarg,blkno,len,lockit) \
+       _xfs_buf_find(buftarg, blkno ,len, lockit, NULL)
+
+extern xfs_buf_t *xfs_buf_get(xfs_buftarg_t *, xfs_off_t, size_t,
+                               xfs_buf_flags_t);
+extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t,
+                               xfs_buf_flags_t);
+
+extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *);
+extern void xfs_buf_set_empty(struct xfs_buf *bp, size_t len);
+extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int);
+extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t);
+extern void xfs_buf_hold(xfs_buf_t *);
+extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t);
+struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp,
+                               struct xfs_buftarg *target,
+                               xfs_daddr_t daddr, size_t length, int flags);
+
+/* Releasing Buffers */
+extern void xfs_buf_free(xfs_buf_t *);
+extern void xfs_buf_rele(xfs_buf_t *);
+
+/* Locking and Unlocking Buffers */
+extern int xfs_buf_trylock(xfs_buf_t *);
+extern void xfs_buf_lock(xfs_buf_t *);
+extern void xfs_buf_unlock(xfs_buf_t *);
+#define xfs_buf_islocked(bp) \
+       ((bp)->b_sema.count <= 0)
+
+/* Buffer Read and Write Routines */
+extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp);
+extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
+
+extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
+extern int xfs_bdstrat_cb(struct xfs_buf *);
+
+extern void xfs_buf_ioend(xfs_buf_t *, int);
+extern void xfs_buf_ioerror(xfs_buf_t *, int);
+extern int xfs_buf_iorequest(xfs_buf_t *);
+extern int xfs_buf_iowait(xfs_buf_t *);
+extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *,
+                               xfs_buf_rw_t);
+#define xfs_buf_zero(bp, off, len) \
+           xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO)
+
+static inline int xfs_buf_geterror(xfs_buf_t *bp)
+{
+       return bp ? bp->b_error : ENOMEM;
+}
+
+/* Buffer Utility Routines */
+extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
+
+/* Delayed Write Buffer Routines */
+extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
+extern void xfs_buf_delwri_promote(xfs_buf_t *);
+
+/* Buffer Daemon Setup Routines */
+extern int xfs_buf_init(void);
+extern void xfs_buf_terminate(void);
+
+static inline const char *
+xfs_buf_target_name(struct xfs_buftarg *target)
+{
+       static char __b[BDEVNAME_SIZE];
+
+       return bdevname(target->bt_bdev, __b);
+}
+
+
+#define XFS_BUF_ZEROFLAGS(bp) \
+       ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
+                           XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
+
+void xfs_buf_stale(struct xfs_buf *bp);
+#define XFS_BUF_STALE(bp)      xfs_buf_stale(bp);
+#define XFS_BUF_UNSTALE(bp)    ((bp)->b_flags &= ~XBF_STALE)
+#define XFS_BUF_ISSTALE(bp)    ((bp)->b_flags & XBF_STALE)
+#define XFS_BUF_SUPER_STALE(bp)        do {                            \
+                                       XFS_BUF_STALE(bp);      \
+                                       xfs_buf_delwri_dequeue(bp);     \
+                                       XFS_BUF_DONE(bp);       \
+                               } while (0)
+
+#define XFS_BUF_DELAYWRITE(bp)         ((bp)->b_flags |= XBF_DELWRI)
+#define XFS_BUF_UNDELAYWRITE(bp)       xfs_buf_delwri_dequeue(bp)
+#define XFS_BUF_ISDELAYWRITE(bp)       ((bp)->b_flags & XBF_DELWRI)
+
+#define XFS_BUF_DONE(bp)       ((bp)->b_flags |= XBF_DONE)
+#define XFS_BUF_UNDONE(bp)     ((bp)->b_flags &= ~XBF_DONE)
+#define XFS_BUF_ISDONE(bp)     ((bp)->b_flags & XBF_DONE)
+
+#define XFS_BUF_ASYNC(bp)      ((bp)->b_flags |= XBF_ASYNC)
+#define XFS_BUF_UNASYNC(bp)    ((bp)->b_flags &= ~XBF_ASYNC)
+#define XFS_BUF_ISASYNC(bp)    ((bp)->b_flags & XBF_ASYNC)
+
+#define XFS_BUF_READ(bp)       ((bp)->b_flags |= XBF_READ)
+#define XFS_BUF_UNREAD(bp)     ((bp)->b_flags &= ~XBF_READ)
+#define XFS_BUF_ISREAD(bp)     ((bp)->b_flags & XBF_READ)
+
+#define XFS_BUF_WRITE(bp)      ((bp)->b_flags |= XBF_WRITE)
+#define XFS_BUF_UNWRITE(bp)    ((bp)->b_flags &= ~XBF_WRITE)
+#define XFS_BUF_ISWRITE(bp)    ((bp)->b_flags & XBF_WRITE)
+
+#define XFS_BUF_ADDR(bp)               ((bp)->b_bn)
+#define XFS_BUF_SET_ADDR(bp, bno)      ((bp)->b_bn = (xfs_daddr_t)(bno))
+#define XFS_BUF_OFFSET(bp)             ((bp)->b_file_offset)
+#define XFS_BUF_SET_OFFSET(bp, off)    ((bp)->b_file_offset = (off))
+#define XFS_BUF_COUNT(bp)              ((bp)->b_count_desired)
+#define XFS_BUF_SET_COUNT(bp, cnt)     ((bp)->b_count_desired = (cnt))
+#define XFS_BUF_SIZE(bp)               ((bp)->b_buffer_length)
+#define XFS_BUF_SET_SIZE(bp, cnt)      ((bp)->b_buffer_length = (cnt))
+
+static inline void
+xfs_buf_set_ref(
+       struct xfs_buf  *bp,
+       int             lru_ref)
+{
+       atomic_set(&bp->b_lru_ref, lru_ref);
+}
+#define XFS_BUF_SET_VTYPE_REF(bp, type, ref)   xfs_buf_set_ref(bp, ref)
+#define XFS_BUF_SET_VTYPE(bp, type)            do { } while (0)
+
+static inline int xfs_buf_ispinned(struct xfs_buf *bp)
+{
+       return atomic_read(&bp->b_pin_count);
+}
+
+#define XFS_BUF_FINISH_IOWAIT(bp)      complete(&bp->b_iowait);
+
+static inline void xfs_buf_relse(xfs_buf_t *bp)
+{
+       xfs_buf_unlock(bp);
+       xfs_buf_rele(bp);
+}
+
+/*
+ *     Handling of buftargs.
+ */
+extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
+                       struct block_device *, int, const char *);
+extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
+extern void xfs_wait_buftarg(xfs_buftarg_t *);
+extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
+extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
+
+#ifdef CONFIG_KDB_MODULES
+extern struct list_head *xfs_get_buftarg_list(void);
+#endif
+
+#define xfs_getsize_buftarg(buftarg)   block_size((buftarg)->bt_bdev)
+#define xfs_readonly_buftarg(buftarg)  bdev_read_only((buftarg)->bt_bdev)
+
+#define xfs_binval(buftarg)            xfs_flush_buftarg(buftarg, 1)
+#define XFS_bflush(buftarg)            xfs_flush_buftarg(buftarg, 1)
+
+#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
new file mode 100644 (file)
index 0000000..244e797
--- /dev/null
@@ -0,0 +1,222 @@
+/*
+ * Copyright (C) 2010 Red Hat, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_discard.h"
+#include "xfs_trace.h"
+
+STATIC int
+xfs_trim_extents(
+       struct xfs_mount        *mp,
+       xfs_agnumber_t          agno,
+       xfs_fsblock_t           start,
+       xfs_fsblock_t           len,
+       xfs_fsblock_t           minlen,
+       __uint64_t              *blocks_trimmed)
+{
+       struct block_device     *bdev = mp->m_ddev_targp->bt_bdev;
+       struct xfs_btree_cur    *cur;
+       struct xfs_buf          *agbp;
+       struct xfs_perag        *pag;
+       int                     error;
+       int                     i;
+
+       pag = xfs_perag_get(mp, agno);
+
+       error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+       if (error || !agbp)
+               goto out_put_perag;
+
+       cur = xfs_allocbt_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_CNT);
+
+       /*
+        * Force out the log.  This means any transactions that might have freed
+        * space before we took the AGF buffer lock are now on disk, and the
+        * volatile disk cache is flushed.
+        */
+       xfs_log_force(mp, XFS_LOG_SYNC);
+
+       /*
+        * Look up the longest btree in the AGF and start with it.
+        */
+       error = xfs_alloc_lookup_le(cur, 0,
+                                   XFS_BUF_TO_AGF(agbp)->agf_longest, &i);
+       if (error)
+               goto out_del_cursor;
+
+       /*
+        * Loop until we are done with all extents that are large
+        * enough to be worth discarding.
+        */
+       while (i) {
+               xfs_agblock_t fbno;
+               xfs_extlen_t flen;
+
+               error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
+               if (error)
+                       goto out_del_cursor;
+               XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor);
+               ASSERT(flen <= XFS_BUF_TO_AGF(agbp)->agf_longest);
+
+               /*
+                * Too small?  Give up.
+                */
+               if (flen < minlen) {
+                       trace_xfs_discard_toosmall(mp, agno, fbno, flen);
+                       goto out_del_cursor;
+               }
+
+               /*
+                * If the extent is entirely outside of the range we are
+                * supposed to discard skip it.  Do not bother to trim
+                * down partially overlapping ranges for now.
+                */
+               if (XFS_AGB_TO_FSB(mp, agno, fbno) + flen < start ||
+                   XFS_AGB_TO_FSB(mp, agno, fbno) >= start + len) {
+                       trace_xfs_discard_exclude(mp, agno, fbno, flen);
+                       goto next_extent;
+               }
+
+               /*
+                * If any blocks in the range are still busy, skip the
+                * discard and try again the next time.
+                */
+               if (xfs_alloc_busy_search(mp, agno, fbno, flen)) {
+                       trace_xfs_discard_busy(mp, agno, fbno, flen);
+                       goto next_extent;
+               }
+
+               trace_xfs_discard_extent(mp, agno, fbno, flen);
+               error = -blkdev_issue_discard(bdev,
+                               XFS_AGB_TO_DADDR(mp, agno, fbno),
+                               XFS_FSB_TO_BB(mp, flen),
+                               GFP_NOFS, 0);
+               if (error)
+                       goto out_del_cursor;
+               *blocks_trimmed += flen;
+
+next_extent:
+               error = xfs_btree_decrement(cur, 0, &i);
+               if (error)
+                       goto out_del_cursor;
+       }
+
+out_del_cursor:
+       xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+       xfs_buf_relse(agbp);
+out_put_perag:
+       xfs_perag_put(pag);
+       return error;
+}
+
+int
+xfs_ioc_trim(
+       struct xfs_mount                *mp,
+       struct fstrim_range __user      *urange)
+{
+       struct request_queue    *q = mp->m_ddev_targp->bt_bdev->bd_disk->queue;
+       unsigned int            granularity = q->limits.discard_granularity;
+       struct fstrim_range     range;
+       xfs_fsblock_t           start, len, minlen;
+       xfs_agnumber_t          start_agno, end_agno, agno;
+       __uint64_t              blocks_trimmed = 0;
+       int                     error, last_error = 0;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+       if (!blk_queue_discard(q))
+               return -XFS_ERROR(EOPNOTSUPP);
+       if (copy_from_user(&range, urange, sizeof(range)))
+               return -XFS_ERROR(EFAULT);
+
+       /*
+        * Truncating down the len isn't actually quite correct, but using
+        * XFS_B_TO_FSB would mean we trivially get overflows for values
+        * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
+        * used by the fstrim application.  In the end it really doesn't
+        * matter as trimming blocks is an advisory interface.
+        */
+       start = XFS_B_TO_FSBT(mp, range.start);
+       len = XFS_B_TO_FSBT(mp, range.len);
+       minlen = XFS_B_TO_FSB(mp, max_t(u64, granularity, range.minlen));
+
+       start_agno = XFS_FSB_TO_AGNO(mp, start);
+       if (start_agno >= mp->m_sb.sb_agcount)
+               return -XFS_ERROR(EINVAL);
+
+       end_agno = XFS_FSB_TO_AGNO(mp, start + len);
+       if (end_agno >= mp->m_sb.sb_agcount)
+               end_agno = mp->m_sb.sb_agcount - 1;
+
+       for (agno = start_agno; agno <= end_agno; agno++) {
+               error = -xfs_trim_extents(mp, agno, start, len, minlen,
+                                         &blocks_trimmed);
+               if (error)
+                       last_error = error;
+       }
+
+       if (last_error)
+               return last_error;
+
+       range.len = XFS_FSB_TO_B(mp, blocks_trimmed);
+       if (copy_to_user(urange, &range, sizeof(range)))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+int
+xfs_discard_extents(
+       struct xfs_mount        *mp,
+       struct list_head        *list)
+{
+       struct xfs_busy_extent  *busyp;
+       int                     error = 0;
+
+       list_for_each_entry(busyp, list, list) {
+               trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+                                        busyp->length);
+
+               error = -blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+                               XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+                               XFS_FSB_TO_BB(mp, busyp->length),
+                               GFP_NOFS, 0);
+               if (error && error != EOPNOTSUPP) {
+                       xfs_info(mp,
+        "discard failed for extent [0x%llu,%u], error %d",
+                                (unsigned long long)busyp->bno,
+                                busyp->length,
+                                error);
+                       return error;
+               }
+       }
+
+       return 0;
+}
diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h
new file mode 100644 (file)
index 0000000..344879a
--- /dev/null
@@ -0,0 +1,10 @@
+#ifndef XFS_DISCARD_H
+#define XFS_DISCARD_H 1
+
+struct fstrim_range;
+struct list_head;
+
+extern int     xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *);
+extern int     xfs_discard_extents(struct xfs_mount *, struct list_head *);
+
+#endif /* XFS_DISCARD_H */
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
new file mode 100644 (file)
index 0000000..db62959
--- /dev/null
@@ -0,0 +1,1454 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+
+
+/*
+   LOCK ORDER
+
+   inode lock              (ilock)
+   dquot hash-chain lock    (hashlock)
+   xqm dquot freelist lock  (freelistlock
+   mount's dquot list lock  (mplistlock)
+   user dquot lock - lock ordering among dquots is based on the uid or gid
+   group dquot lock - similar to udquots. Between the two dquots, the udquot
+                     has to be locked first.
+   pin lock - the dquot lock must be held to take this lock.
+   flush lock - ditto.
+*/
+
+#ifdef DEBUG
+xfs_buftarg_t *xfs_dqerror_target;
+int xfs_do_dqerror;
+int xfs_dqreq_num;
+int xfs_dqerror_mod = 33;
+#endif
+
+static struct lock_class_key xfs_dquot_other_class;
+
+/*
+ * Allocate and initialize a dquot. We don't always allocate fresh memory;
+ * we try to reclaim a free dquot if the number of incore dquots are above
+ * a threshold.
+ * The only field inside the core that gets initialized at this point
+ * is the d_id field. The idea is to fill in the entire q_core
+ * when we read in the on disk dquot.
+ */
+STATIC xfs_dquot_t *
+xfs_qm_dqinit(
+       xfs_mount_t  *mp,
+       xfs_dqid_t   id,
+       uint         type)
+{
+       xfs_dquot_t     *dqp;
+       boolean_t       brandnewdquot;
+
+       brandnewdquot = xfs_qm_dqalloc_incore(&dqp);
+       dqp->dq_flags = type;
+       dqp->q_core.d_id = cpu_to_be32(id);
+       dqp->q_mount = mp;
+
+       /*
+        * No need to re-initialize these if this is a reclaimed dquot.
+        */
+       if (brandnewdquot) {
+               INIT_LIST_HEAD(&dqp->q_freelist);
+               mutex_init(&dqp->q_qlock);
+               init_waitqueue_head(&dqp->q_pinwait);
+
+               /*
+                * Because we want to use a counting completion, complete
+                * the flush completion once to allow a single access to
+                * the flush completion without blocking.
+                */
+               init_completion(&dqp->q_flush);
+               complete(&dqp->q_flush);
+
+               trace_xfs_dqinit(dqp);
+       } else {
+               /*
+                * Only the q_core portion was zeroed in dqreclaim_one().
+                * So, we need to reset others.
+                */
+               dqp->q_nrefs = 0;
+               dqp->q_blkno = 0;
+               INIT_LIST_HEAD(&dqp->q_mplist);
+               INIT_LIST_HEAD(&dqp->q_hashlist);
+               dqp->q_bufoffset = 0;
+               dqp->q_fileoffset = 0;
+               dqp->q_transp = NULL;
+               dqp->q_gdquot = NULL;
+               dqp->q_res_bcount = 0;
+               dqp->q_res_icount = 0;
+               dqp->q_res_rtbcount = 0;
+               atomic_set(&dqp->q_pincount, 0);
+               dqp->q_hash = NULL;
+               ASSERT(list_empty(&dqp->q_freelist));
+
+               trace_xfs_dqreuse(dqp);
+       }
+
+       /*
+        * In either case we need to make sure group quotas have a different
+        * lock class than user quotas, to make sure lockdep knows we can
+        * locks of one of each at the same time.
+        */
+       if (!(type & XFS_DQ_USER))
+               lockdep_set_class(&dqp->q_qlock, &xfs_dquot_other_class);
+
+       /*
+        * log item gets initialized later
+        */
+       return (dqp);
+}
+
+/*
+ * This is called to free all the memory associated with a dquot
+ */
+void
+xfs_qm_dqdestroy(
+       xfs_dquot_t     *dqp)
+{
+       ASSERT(list_empty(&dqp->q_freelist));
+
+       mutex_destroy(&dqp->q_qlock);
+       kmem_zone_free(xfs_Gqm->qm_dqzone, dqp);
+
+       atomic_dec(&xfs_Gqm->qm_totaldquots);
+}
+
+/*
+ * This is what a 'fresh' dquot inside a dquot chunk looks like on disk.
+ */
+STATIC void
+xfs_qm_dqinit_core(
+       xfs_dqid_t      id,
+       uint            type,
+       xfs_dqblk_t     *d)
+{
+       /*
+        * Caller has zero'd the entire dquot 'chunk' already.
+        */
+       d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+       d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
+       d->dd_diskdq.d_id = cpu_to_be32(id);
+       d->dd_diskdq.d_flags = type;
+}
+
+/*
+ * If default limits are in force, push them into the dquot now.
+ * We overwrite the dquot limits only if they are zero and this
+ * is not the root dquot.
+ */
+void
+xfs_qm_adjust_dqlimits(
+       xfs_mount_t             *mp,
+       xfs_disk_dquot_t        *d)
+{
+       xfs_quotainfo_t         *q = mp->m_quotainfo;
+
+       ASSERT(d->d_id);
+
+       if (q->qi_bsoftlimit && !d->d_blk_softlimit)
+               d->d_blk_softlimit = cpu_to_be64(q->qi_bsoftlimit);
+       if (q->qi_bhardlimit && !d->d_blk_hardlimit)
+               d->d_blk_hardlimit = cpu_to_be64(q->qi_bhardlimit);
+       if (q->qi_isoftlimit && !d->d_ino_softlimit)
+               d->d_ino_softlimit = cpu_to_be64(q->qi_isoftlimit);
+       if (q->qi_ihardlimit && !d->d_ino_hardlimit)
+               d->d_ino_hardlimit = cpu_to_be64(q->qi_ihardlimit);
+       if (q->qi_rtbsoftlimit && !d->d_rtb_softlimit)
+               d->d_rtb_softlimit = cpu_to_be64(q->qi_rtbsoftlimit);
+       if (q->qi_rtbhardlimit && !d->d_rtb_hardlimit)
+               d->d_rtb_hardlimit = cpu_to_be64(q->qi_rtbhardlimit);
+}
+
+/*
+ * Check the limits and timers of a dquot and start or reset timers
+ * if necessary.
+ * This gets called even when quota enforcement is OFF, which makes our
+ * life a little less complicated. (We just don't reject any quota
+ * reservations in that case, when enforcement is off).
+ * We also return 0 as the values of the timers in Q_GETQUOTA calls, when
+ * enforcement's off.
+ * In contrast, warnings are a little different in that they don't
+ * 'automatically' get started when limits get exceeded.  They do
+ * get reset to zero, however, when we find the count to be under
+ * the soft limit (they are only ever set non-zero via userspace).
+ */
+void
+xfs_qm_adjust_dqtimers(
+       xfs_mount_t             *mp,
+       xfs_disk_dquot_t        *d)
+{
+       ASSERT(d->d_id);
+
+#ifdef DEBUG
+       if (d->d_blk_hardlimit)
+               ASSERT(be64_to_cpu(d->d_blk_softlimit) <=
+                      be64_to_cpu(d->d_blk_hardlimit));
+       if (d->d_ino_hardlimit)
+               ASSERT(be64_to_cpu(d->d_ino_softlimit) <=
+                      be64_to_cpu(d->d_ino_hardlimit));
+       if (d->d_rtb_hardlimit)
+               ASSERT(be64_to_cpu(d->d_rtb_softlimit) <=
+                      be64_to_cpu(d->d_rtb_hardlimit));
+#endif
+
+       if (!d->d_btimer) {
+               if ((d->d_blk_softlimit &&
+                    (be64_to_cpu(d->d_bcount) >=
+                     be64_to_cpu(d->d_blk_softlimit))) ||
+                   (d->d_blk_hardlimit &&
+                    (be64_to_cpu(d->d_bcount) >=
+                     be64_to_cpu(d->d_blk_hardlimit)))) {
+                       d->d_btimer = cpu_to_be32(get_seconds() +
+                                       mp->m_quotainfo->qi_btimelimit);
+               } else {
+                       d->d_bwarns = 0;
+               }
+       } else {
+               if ((!d->d_blk_softlimit ||
+                    (be64_to_cpu(d->d_bcount) <
+                     be64_to_cpu(d->d_blk_softlimit))) &&
+                   (!d->d_blk_hardlimit ||
+                   (be64_to_cpu(d->d_bcount) <
+                    be64_to_cpu(d->d_blk_hardlimit)))) {
+                       d->d_btimer = 0;
+               }
+       }
+
+       if (!d->d_itimer) {
+               if ((d->d_ino_softlimit &&
+                    (be64_to_cpu(d->d_icount) >=
+                     be64_to_cpu(d->d_ino_softlimit))) ||
+                   (d->d_ino_hardlimit &&
+                    (be64_to_cpu(d->d_icount) >=
+                     be64_to_cpu(d->d_ino_hardlimit)))) {
+                       d->d_itimer = cpu_to_be32(get_seconds() +
+                                       mp->m_quotainfo->qi_itimelimit);
+               } else {
+                       d->d_iwarns = 0;
+               }
+       } else {
+               if ((!d->d_ino_softlimit ||
+                    (be64_to_cpu(d->d_icount) <
+                     be64_to_cpu(d->d_ino_softlimit)))  &&
+                   (!d->d_ino_hardlimit ||
+                    (be64_to_cpu(d->d_icount) <
+                     be64_to_cpu(d->d_ino_hardlimit)))) {
+                       d->d_itimer = 0;
+               }
+       }
+
+       if (!d->d_rtbtimer) {
+               if ((d->d_rtb_softlimit &&
+                    (be64_to_cpu(d->d_rtbcount) >=
+                     be64_to_cpu(d->d_rtb_softlimit))) ||
+                   (d->d_rtb_hardlimit &&
+                    (be64_to_cpu(d->d_rtbcount) >=
+                     be64_to_cpu(d->d_rtb_hardlimit)))) {
+                       d->d_rtbtimer = cpu_to_be32(get_seconds() +
+                                       mp->m_quotainfo->qi_rtbtimelimit);
+               } else {
+                       d->d_rtbwarns = 0;
+               }
+       } else {
+               if ((!d->d_rtb_softlimit ||
+                    (be64_to_cpu(d->d_rtbcount) <
+                     be64_to_cpu(d->d_rtb_softlimit))) &&
+                   (!d->d_rtb_hardlimit ||
+                    (be64_to_cpu(d->d_rtbcount) <
+                     be64_to_cpu(d->d_rtb_hardlimit)))) {
+                       d->d_rtbtimer = 0;
+               }
+       }
+}
+
+/*
+ * initialize a buffer full of dquots and log the whole thing
+ */
+STATIC void
+xfs_qm_init_dquot_blk(
+       xfs_trans_t     *tp,
+       xfs_mount_t     *mp,
+       xfs_dqid_t      id,
+       uint            type,
+       xfs_buf_t       *bp)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       xfs_dqblk_t     *d;
+       int             curid, i;
+
+       ASSERT(tp);
+       ASSERT(xfs_buf_islocked(bp));
+
+       d = bp->b_addr;
+
+       /*
+        * ID of the first dquot in the block - id's are zero based.
+        */
+       curid = id - (id % q->qi_dqperchunk);
+       ASSERT(curid >= 0);
+       memset(d, 0, BBTOB(q->qi_dqchunklen));
+       for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
+               xfs_qm_dqinit_core(curid, type, d);
+       xfs_trans_dquot_buf(tp, bp,
+                           (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
+                           ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
+                            XFS_BLF_GDQUOT_BUF)));
+       xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
+}
+
+
+
+/*
+ * Allocate a block and fill it with dquots.
+ * This is called when the bmapi finds a hole.
+ */
+STATIC int
+xfs_qm_dqalloc(
+       xfs_trans_t     **tpp,
+       xfs_mount_t     *mp,
+       xfs_dquot_t     *dqp,
+       xfs_inode_t     *quotip,
+       xfs_fileoff_t   offset_fsb,
+       xfs_buf_t       **O_bpp)
+{
+       xfs_fsblock_t   firstblock;
+       xfs_bmap_free_t flist;
+       xfs_bmbt_irec_t map;
+       int             nmaps, error, committed;
+       xfs_buf_t       *bp;
+       xfs_trans_t     *tp = *tpp;
+
+       ASSERT(tp != NULL);
+
+       trace_xfs_dqalloc(dqp);
+
+       /*
+        * Initialize the bmap freelist prior to calling bmapi code.
+        */
+       xfs_bmap_init(&flist, &firstblock);
+       xfs_ilock(quotip, XFS_ILOCK_EXCL);
+       /*
+        * Return if this type of quotas is turned off while we didn't
+        * have an inode lock
+        */
+       if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+               xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+               return (ESRCH);
+       }
+
+       xfs_trans_ijoin_ref(tp, quotip, XFS_ILOCK_EXCL);
+       nmaps = 1;
+       if ((error = xfs_bmapi(tp, quotip,
+                             offset_fsb, XFS_DQUOT_CLUSTER_SIZE_FSB,
+                             XFS_BMAPI_METADATA | XFS_BMAPI_WRITE,
+                             &firstblock,
+                             XFS_QM_DQALLOC_SPACE_RES(mp),
+                             &map, &nmaps, &flist))) {
+               goto error0;
+       }
+       ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
+       ASSERT(nmaps == 1);
+       ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
+              (map.br_startblock != HOLESTARTBLOCK));
+
+       /*
+        * Keep track of the blkno to save a lookup later
+        */
+       dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+       /* now we can just get the buffer (there's nothing to read yet) */
+       bp = xfs_trans_get_buf(tp, mp->m_ddev_targp,
+                              dqp->q_blkno,
+                              mp->m_quotainfo->qi_dqchunklen,
+                              0);
+       if (!bp || (error = xfs_buf_geterror(bp)))
+               goto error1;
+       /*
+        * Make a chunk of dquots out of this buffer and log
+        * the entire thing.
+        */
+       xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id),
+                             dqp->dq_flags & XFS_DQ_ALLTYPES, bp);
+
+       /*
+        * xfs_bmap_finish() may commit the current transaction and
+        * start a second transaction if the freelist is not empty.
+        *
+        * Since we still want to modify this buffer, we need to
+        * ensure that the buffer is not released on commit of
+        * the first transaction and ensure the buffer is added to the
+        * second transaction.
+        *
+        * If there is only one transaction then don't stop the buffer
+        * from being released when it commits later on.
+        */
+
+       xfs_trans_bhold(tp, bp);
+
+       if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
+               goto error1;
+       }
+
+       if (committed) {
+               tp = *tpp;
+               xfs_trans_bjoin(tp, bp);
+       } else {
+               xfs_trans_bhold_release(tp, bp);
+       }
+
+       *O_bpp = bp;
+       return 0;
+
+      error1:
+       xfs_bmap_cancel(&flist);
+      error0:
+       xfs_iunlock(quotip, XFS_ILOCK_EXCL);
+
+       return (error);
+}
+
+/*
+ * Maps a dquot to the buffer containing its on-disk version.
+ * This returns a ptr to the buffer containing the on-disk dquot
+ * in the bpp param, and a ptr to the on-disk dquot within that buffer
+ */
+STATIC int
+xfs_qm_dqtobp(
+       xfs_trans_t             **tpp,
+       xfs_dquot_t             *dqp,
+       xfs_disk_dquot_t        **O_ddpp,
+       xfs_buf_t               **O_bpp,
+       uint                    flags)
+{
+       xfs_bmbt_irec_t map;
+       int             nmaps = 1, error;
+       xfs_buf_t       *bp;
+       xfs_inode_t     *quotip = XFS_DQ_TO_QIP(dqp);
+       xfs_mount_t     *mp = dqp->q_mount;
+       xfs_disk_dquot_t *ddq;
+       xfs_dqid_t      id = be32_to_cpu(dqp->q_core.d_id);
+       xfs_trans_t     *tp = (tpp ? *tpp : NULL);
+
+       dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk;
+
+       xfs_ilock(quotip, XFS_ILOCK_SHARED);
+       if (XFS_IS_THIS_QUOTA_OFF(dqp)) {
+               /*
+                * Return if this type of quotas is turned off while we
+                * didn't have the quota inode lock.
+                */
+               xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+               return ESRCH;
+       }
+
+       /*
+        * Find the block map; no allocations yet
+        */
+       error = xfs_bmapi(NULL, quotip, dqp->q_fileoffset,
+                         XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA,
+                         NULL, 0, &map, &nmaps, NULL);
+
+       xfs_iunlock(quotip, XFS_ILOCK_SHARED);
+       if (error)
+               return error;
+
+       ASSERT(nmaps == 1);
+       ASSERT(map.br_blockcount == 1);
+
+       /*
+        * Offset of dquot in the (fixed sized) dquot chunk.
+        */
+       dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) *
+               sizeof(xfs_dqblk_t);
+
+       ASSERT(map.br_startblock != DELAYSTARTBLOCK);
+       if (map.br_startblock == HOLESTARTBLOCK) {
+               /*
+                * We don't allocate unless we're asked to
+                */
+               if (!(flags & XFS_QMOPT_DQALLOC))
+                       return ENOENT;
+
+               ASSERT(tp);
+               error = xfs_qm_dqalloc(tpp, mp, dqp, quotip,
+                                       dqp->q_fileoffset, &bp);
+               if (error)
+                       return error;
+               tp = *tpp;
+       } else {
+               trace_xfs_dqtobp_read(dqp);
+
+               /*
+                * store the blkno etc so that we don't have to do the
+                * mapping all the time
+                */
+               dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock);
+
+               error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                                          dqp->q_blkno,
+                                          mp->m_quotainfo->qi_dqchunklen,
+                                          0, &bp);
+               if (error || !bp)
+                       return XFS_ERROR(error);
+       }
+
+       ASSERT(xfs_buf_islocked(bp));
+
+       /*
+        * calculate the location of the dquot inside the buffer.
+        */
+       ddq = bp->b_addr + dqp->q_bufoffset;
+
+       /*
+        * A simple sanity check in case we got a corrupted dquot...
+        */
+       error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
+                          flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
+                          "dqtobp");
+       if (error) {
+               if (!(flags & XFS_QMOPT_DQREPAIR)) {
+                       xfs_trans_brelse(tp, bp);
+                       return XFS_ERROR(EIO);
+               }
+       }
+
+       *O_bpp = bp;
+       *O_ddpp = ddq;
+
+       return (0);
+}
+
+
+/*
+ * Read in the ondisk dquot using dqtobp() then copy it to an incore version,
+ * and release the buffer immediately.
+ *
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqread(
+       xfs_trans_t     **tpp,
+       xfs_dqid_t      id,
+       xfs_dquot_t     *dqp,   /* dquot to get filled in */
+       uint            flags)
+{
+       xfs_disk_dquot_t *ddqp;
+       xfs_buf_t        *bp;
+       int              error;
+       xfs_trans_t      *tp;
+
+       ASSERT(tpp);
+
+       trace_xfs_dqread(dqp);
+
+       /*
+        * get a pointer to the on-disk dquot and the buffer containing it
+        * dqp already knows its own type (GROUP/USER).
+        */
+       if ((error = xfs_qm_dqtobp(tpp, dqp, &ddqp, &bp, flags))) {
+               return (error);
+       }
+       tp = *tpp;
+
+       /* copy everything from disk dquot to the incore dquot */
+       memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
+       ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
+       xfs_qm_dquot_logitem_init(dqp);
+
+       /*
+        * Reservation counters are defined as reservation plus current usage
+        * to avoid having to add every time.
+        */
+       dqp->q_res_bcount = be64_to_cpu(ddqp->d_bcount);
+       dqp->q_res_icount = be64_to_cpu(ddqp->d_icount);
+       dqp->q_res_rtbcount = be64_to_cpu(ddqp->d_rtbcount);
+
+       /* Mark the buf so that this will stay incore a little longer */
+       XFS_BUF_SET_VTYPE_REF(bp, B_FS_DQUOT, XFS_DQUOT_REF);
+
+       /*
+        * We got the buffer with a xfs_trans_read_buf() (in dqtobp())
+        * So we need to release with xfs_trans_brelse().
+        * The strategy here is identical to that of inodes; we lock
+        * the dquot in xfs_qm_dqget() before making it accessible to
+        * others. This is because dquots, like inodes, need a good level of
+        * concurrency, and we don't want to take locks on the entire buffers
+        * for dquot accesses.
+        * Note also that the dquot buffer may even be dirty at this point, if
+        * this particular dquot was repaired. We still aren't afraid to
+        * brelse it because we have the changes incore.
+        */
+       ASSERT(xfs_buf_islocked(bp));
+       xfs_trans_brelse(tp, bp);
+
+       return (error);
+}
+
+
+/*
+ * allocate an incore dquot from the kernel heap,
+ * and fill its core with quota information kept on disk.
+ * If XFS_QMOPT_DQALLOC is set, it'll allocate a dquot on disk
+ * if it wasn't already allocated.
+ */
+STATIC int
+xfs_qm_idtodq(
+       xfs_mount_t     *mp,
+       xfs_dqid_t      id,      /* gid or uid, depending on type */
+       uint            type,    /* UDQUOT or GDQUOT */
+       uint            flags,   /* DQALLOC, DQREPAIR */
+       xfs_dquot_t     **O_dqpp)/* OUT : incore dquot, not locked */
+{
+       xfs_dquot_t     *dqp;
+       int             error;
+       xfs_trans_t     *tp;
+       int             cancelflags=0;
+
+       dqp = xfs_qm_dqinit(mp, id, type);
+       tp = NULL;
+       if (flags & XFS_QMOPT_DQALLOC) {
+               tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC);
+               error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+                               XFS_WRITE_LOG_RES(mp) +
+                               BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 +
+                               128,
+                               0,
+                               XFS_TRANS_PERM_LOG_RES,
+                               XFS_WRITE_LOG_COUNT);
+               if (error) {
+                       cancelflags = 0;
+                       goto error0;
+               }
+               cancelflags = XFS_TRANS_RELEASE_LOG_RES;
+       }
+
+       /*
+        * Read it from disk; xfs_dqread() takes care of
+        * all the necessary initialization of dquot's fields (locks, etc)
+        */
+       if ((error = xfs_qm_dqread(&tp, id, dqp, flags))) {
+               /*
+                * This can happen if quotas got turned off (ESRCH),
+                * or if the dquot didn't exist on disk and we ask to
+                * allocate (ENOENT).
+                */
+               trace_xfs_dqread_fail(dqp);
+               cancelflags |= XFS_TRANS_ABORT;
+               goto error0;
+       }
+       if (tp) {
+               if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES)))
+                       goto error1;
+       }
+
+       *O_dqpp = dqp;
+       return (0);
+
+ error0:
+       ASSERT(error);
+       if (tp)
+               xfs_trans_cancel(tp, cancelflags);
+ error1:
+       xfs_qm_dqdestroy(dqp);
+       *O_dqpp = NULL;
+       return (error);
+}
+
+/*
+ * Lookup a dquot in the incore dquot hashtable. We keep two separate
+ * hashtables for user and group dquots; and, these are global tables
+ * inside the XQM, not per-filesystem tables.
+ * The hash chain must be locked by caller, and it is left locked
+ * on return. Returning dquot is locked.
+ */
+STATIC int
+xfs_qm_dqlookup(
+       xfs_mount_t             *mp,
+       xfs_dqid_t              id,
+       xfs_dqhash_t            *qh,
+       xfs_dquot_t             **O_dqpp)
+{
+       xfs_dquot_t             *dqp;
+       uint                    flist_locked;
+
+       ASSERT(mutex_is_locked(&qh->qh_lock));
+
+       flist_locked = B_FALSE;
+
+       /*
+        * Traverse the hashchain looking for a match
+        */
+       list_for_each_entry(dqp, &qh->qh_list, q_hashlist) {
+               /*
+                * We already have the hashlock. We don't need the
+                * dqlock to look at the id field of the dquot, since the
+                * id can't be modified without the hashlock anyway.
+                */
+               if (be32_to_cpu(dqp->q_core.d_id) == id && dqp->q_mount == mp) {
+                       trace_xfs_dqlookup_found(dqp);
+
+                       /*
+                        * All in core dquots must be on the dqlist of mp
+                        */
+                       ASSERT(!list_empty(&dqp->q_mplist));
+
+                       xfs_dqlock(dqp);
+                       if (dqp->q_nrefs == 0) {
+                               ASSERT(!list_empty(&dqp->q_freelist));
+                               if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
+                                       trace_xfs_dqlookup_want(dqp);
+
+                                       /*
+                                        * We may have raced with dqreclaim_one()
+                                        * (and lost). So, flag that we don't
+                                        * want the dquot to be reclaimed.
+                                        */
+                                       dqp->dq_flags |= XFS_DQ_WANT;
+                                       xfs_dqunlock(dqp);
+                                       mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+                                       xfs_dqlock(dqp);
+                                       dqp->dq_flags &= ~(XFS_DQ_WANT);
+                               }
+                               flist_locked = B_TRUE;
+                       }
+
+                       /*
+                        * id couldn't have changed; we had the hashlock all
+                        * along
+                        */
+                       ASSERT(be32_to_cpu(dqp->q_core.d_id) == id);
+
+                       if (flist_locked) {
+                               if (dqp->q_nrefs != 0) {
+                                       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+                                       flist_locked = B_FALSE;
+                               } else {
+                                       /* take it off the freelist */
+                                       trace_xfs_dqlookup_freelist(dqp);
+                                       list_del_init(&dqp->q_freelist);
+                                       xfs_Gqm->qm_dqfrlist_cnt--;
+                               }
+                       }
+
+                       XFS_DQHOLD(dqp);
+
+                       if (flist_locked)
+                               mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+                       /*
+                        * move the dquot to the front of the hashchain
+                        */
+                       ASSERT(mutex_is_locked(&qh->qh_lock));
+                       list_move(&dqp->q_hashlist, &qh->qh_list);
+                       trace_xfs_dqlookup_done(dqp);
+                       *O_dqpp = dqp;
+                       return 0;
+               }
+       }
+
+       *O_dqpp = NULL;
+       ASSERT(mutex_is_locked(&qh->qh_lock));
+       return (1);
+}
+
+/*
+ * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a
+ * a locked dquot, doing an allocation (if requested) as needed.
+ * When both an inode and an id are given, the inode's id takes precedence.
+ * That is, if the id changes while we don't hold the ilock inside this
+ * function, the new dquot is returned, not necessarily the one requested
+ * in the id argument.
+ */
+int
+xfs_qm_dqget(
+       xfs_mount_t     *mp,
+       xfs_inode_t     *ip,      /* locked inode (optional) */
+       xfs_dqid_t      id,       /* uid/projid/gid depending on type */
+       uint            type,     /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */
+       uint            flags,    /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */
+       xfs_dquot_t     **O_dqpp) /* OUT : locked incore dquot */
+{
+       xfs_dquot_t     *dqp;
+       xfs_dqhash_t    *h;
+       uint            version;
+       int             error;
+
+       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+       if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) ||
+           (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) ||
+           (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) {
+               return (ESRCH);
+       }
+       h = XFS_DQ_HASH(mp, id, type);
+
+#ifdef DEBUG
+       if (xfs_do_dqerror) {
+               if ((xfs_dqerror_target == mp->m_ddev_targp) &&
+                   (xfs_dqreq_num++ % xfs_dqerror_mod) == 0) {
+                       xfs_debug(mp, "Returning error in dqget");
+                       return (EIO);
+               }
+       }
+#endif
+
+ again:
+
+#ifdef DEBUG
+       ASSERT(type == XFS_DQ_USER ||
+              type == XFS_DQ_PROJ ||
+              type == XFS_DQ_GROUP);
+       if (ip) {
+               ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+               if (type == XFS_DQ_USER)
+                       ASSERT(ip->i_udquot == NULL);
+               else
+                       ASSERT(ip->i_gdquot == NULL);
+       }
+#endif
+       mutex_lock(&h->qh_lock);
+
+       /*
+        * Look in the cache (hashtable).
+        * The chain is kept locked during lookup.
+        */
+       if (xfs_qm_dqlookup(mp, id, h, O_dqpp) == 0) {
+               XQM_STATS_INC(xqmstats.xs_qm_dqcachehits);
+               /*
+                * The dquot was found, moved to the front of the chain,
+                * taken off the freelist if it was on it, and locked
+                * at this point. Just unlock the hashchain and return.
+                */
+               ASSERT(*O_dqpp);
+               ASSERT(XFS_DQ_IS_LOCKED(*O_dqpp));
+               mutex_unlock(&h->qh_lock);
+               trace_xfs_dqget_hit(*O_dqpp);
+               return (0);     /* success */
+       }
+       XQM_STATS_INC(xqmstats.xs_qm_dqcachemisses);
+
+       /*
+        * Dquot cache miss. We don't want to keep the inode lock across
+        * a (potential) disk read. Also we don't want to deal with the lock
+        * ordering between quotainode and this inode. OTOH, dropping the inode
+        * lock here means dealing with a chown that can happen before
+        * we re-acquire the lock.
+        */
+       if (ip)
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       /*
+        * Save the hashchain version stamp, and unlock the chain, so that
+        * we don't keep the lock across a disk read
+        */
+       version = h->qh_version;
+       mutex_unlock(&h->qh_lock);
+
+       /*
+        * Allocate the dquot on the kernel heap, and read the ondisk
+        * portion off the disk. Also, do all the necessary initialization
+        * This can return ENOENT if dquot didn't exist on disk and we didn't
+        * ask it to allocate; ESRCH if quotas got turned off suddenly.
+        */
+       if ((error = xfs_qm_idtodq(mp, id, type,
+                                 flags & (XFS_QMOPT_DQALLOC|XFS_QMOPT_DQREPAIR|
+                                          XFS_QMOPT_DOWARN),
+                                 &dqp))) {
+               if (ip)
+                       xfs_ilock(ip, XFS_ILOCK_EXCL);
+               return (error);
+       }
+
+       /*
+        * See if this is mount code calling to look at the overall quota limits
+        * which are stored in the id == 0 user or group's dquot.
+        * Since we may not have done a quotacheck by this point, just return
+        * the dquot without attaching it to any hashtables, lists, etc, or even
+        * taking a reference.
+        * The caller must dqdestroy this once done.
+        */
+       if (flags & XFS_QMOPT_DQSUSER) {
+               ASSERT(id == 0);
+               ASSERT(! ip);
+               goto dqret;
+       }
+
+       /*
+        * Dquot lock comes after hashlock in the lock ordering
+        */
+       if (ip) {
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+               /*
+                * A dquot could be attached to this inode by now, since
+                * we had dropped the ilock.
+                */
+               if (type == XFS_DQ_USER) {
+                       if (!XFS_IS_UQUOTA_ON(mp)) {
+                               /* inode stays locked on return */
+                               xfs_qm_dqdestroy(dqp);
+                               return XFS_ERROR(ESRCH);
+                       }
+                       if (ip->i_udquot) {
+                               xfs_qm_dqdestroy(dqp);
+                               dqp = ip->i_udquot;
+                               xfs_dqlock(dqp);
+                               goto dqret;
+                       }
+               } else {
+                       if (!XFS_IS_OQUOTA_ON(mp)) {
+                               /* inode stays locked on return */
+                               xfs_qm_dqdestroy(dqp);
+                               return XFS_ERROR(ESRCH);
+                       }
+                       if (ip->i_gdquot) {
+                               xfs_qm_dqdestroy(dqp);
+                               dqp = ip->i_gdquot;
+                               xfs_dqlock(dqp);
+                               goto dqret;
+                       }
+               }
+       }
+
+       /*
+        * Hashlock comes after ilock in lock order
+        */
+       mutex_lock(&h->qh_lock);
+       if (version != h->qh_version) {
+               xfs_dquot_t *tmpdqp;
+               /*
+                * Now, see if somebody else put the dquot in the
+                * hashtable before us. This can happen because we didn't
+                * keep the hashchain lock. We don't have to worry about
+                * lock order between the two dquots here since dqp isn't
+                * on any findable lists yet.
+                */
+               if (xfs_qm_dqlookup(mp, id, h, &tmpdqp) == 0) {
+                       /*
+                        * Duplicate found. Just throw away the new dquot
+                        * and start over.
+                        */
+                       xfs_qm_dqput(tmpdqp);
+                       mutex_unlock(&h->qh_lock);
+                       xfs_qm_dqdestroy(dqp);
+                       XQM_STATS_INC(xqmstats.xs_qm_dquot_dups);
+                       goto again;
+               }
+       }
+
+       /*
+        * Put the dquot at the beginning of the hash-chain and mp's list
+        * LOCK ORDER: hashlock, freelistlock, mplistlock, udqlock, gdqlock ..
+        */
+       ASSERT(mutex_is_locked(&h->qh_lock));
+       dqp->q_hash = h;
+       list_add(&dqp->q_hashlist, &h->qh_list);
+       h->qh_version++;
+
+       /*
+        * Attach this dquot to this filesystem's list of all dquots,
+        * kept inside the mount structure in m_quotainfo field
+        */
+       mutex_lock(&mp->m_quotainfo->qi_dqlist_lock);
+
+       /*
+        * We return a locked dquot to the caller, with a reference taken
+        */
+       xfs_dqlock(dqp);
+       dqp->q_nrefs = 1;
+
+       list_add(&dqp->q_mplist, &mp->m_quotainfo->qi_dqlist);
+       mp->m_quotainfo->qi_dquots++;
+       mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+       mutex_unlock(&h->qh_lock);
+ dqret:
+       ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       trace_xfs_dqget_miss(dqp);
+       *O_dqpp = dqp;
+       return (0);
+}
+
+
+/*
+ * Release a reference to the dquot (decrement ref-count)
+ * and unlock it. If there is a group quota attached to this
+ * dquot, carefully release that too without tripping over
+ * deadlocks'n'stuff.
+ */
+void
+xfs_qm_dqput(
+       xfs_dquot_t     *dqp)
+{
+       xfs_dquot_t     *gdqp;
+
+       ASSERT(dqp->q_nrefs > 0);
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+       trace_xfs_dqput(dqp);
+
+       if (dqp->q_nrefs != 1) {
+               dqp->q_nrefs--;
+               xfs_dqunlock(dqp);
+               return;
+       }
+
+       /*
+        * drop the dqlock and acquire the freelist and dqlock
+        * in the right order; but try to get it out-of-order first
+        */
+       if (!mutex_trylock(&xfs_Gqm->qm_dqfrlist_lock)) {
+               trace_xfs_dqput_wait(dqp);
+               xfs_dqunlock(dqp);
+               mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+               xfs_dqlock(dqp);
+       }
+
+       while (1) {
+               gdqp = NULL;
+
+               /* We can't depend on nrefs being == 1 here */
+               if (--dqp->q_nrefs == 0) {
+                       trace_xfs_dqput_free(dqp);
+
+                       list_add_tail(&dqp->q_freelist, &xfs_Gqm->qm_dqfrlist);
+                       xfs_Gqm->qm_dqfrlist_cnt++;
+
+                       /*
+                        * If we just added a udquot to the freelist, then
+                        * we want to release the gdquot reference that
+                        * it (probably) has. Otherwise it'll keep the
+                        * gdquot from getting reclaimed.
+                        */
+                       if ((gdqp = dqp->q_gdquot)) {
+                               /*
+                                * Avoid a recursive dqput call
+                                */
+                               xfs_dqlock(gdqp);
+                               dqp->q_gdquot = NULL;
+                       }
+               }
+               xfs_dqunlock(dqp);
+
+               /*
+                * If we had a group quota inside the user quota as a hint,
+                * release it now.
+                */
+               if (! gdqp)
+                       break;
+               dqp = gdqp;
+       }
+       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+}
+
+/*
+ * Release a dquot. Flush it if dirty, then dqput() it.
+ * dquot must not be locked.
+ */
+void
+xfs_qm_dqrele(
+       xfs_dquot_t     *dqp)
+{
+       if (!dqp)
+               return;
+
+       trace_xfs_dqrele(dqp);
+
+       xfs_dqlock(dqp);
+       /*
+        * We don't care to flush it if the dquot is dirty here.
+        * That will create stutters that we want to avoid.
+        * Instead we do a delayed write when we try to reclaim
+        * a dirty dquot. Also xfs_sync will take part of the burden...
+        */
+       xfs_qm_dqput(dqp);
+}
+
+/*
+ * This is the dquot flushing I/O completion routine.  It is called
+ * from interrupt level when the buffer containing the dquot is
+ * flushed to disk.  It is responsible for removing the dquot logitem
+ * from the AIL if it has not been re-logged, and unlocking the dquot's
+ * flush lock. This behavior is very similar to that of inodes..
+ */
+STATIC void
+xfs_qm_dqflush_done(
+       struct xfs_buf          *bp,
+       struct xfs_log_item     *lip)
+{
+       xfs_dq_logitem_t        *qip = (struct xfs_dq_logitem *)lip;
+       xfs_dquot_t             *dqp = qip->qli_dquot;
+       struct xfs_ail          *ailp = lip->li_ailp;
+
+       /*
+        * We only want to pull the item from the AIL if its
+        * location in the log has not changed since we started the flush.
+        * Thus, we only bother if the dquot's lsn has
+        * not changed. First we check the lsn outside the lock
+        * since it's cheaper, and then we recheck while
+        * holding the lock before removing the dquot from the AIL.
+        */
+       if ((lip->li_flags & XFS_LI_IN_AIL) &&
+           lip->li_lsn == qip->qli_flush_lsn) {
+
+               /* xfs_trans_ail_delete() drops the AIL lock. */
+               spin_lock(&ailp->xa_lock);
+               if (lip->li_lsn == qip->qli_flush_lsn)
+                       xfs_trans_ail_delete(ailp, lip);
+               else
+                       spin_unlock(&ailp->xa_lock);
+       }
+
+       /*
+        * Release the dq's flush lock since we're done with it.
+        */
+       xfs_dqfunlock(dqp);
+}
+
+/*
+ * Write a modified dquot to disk.
+ * The dquot must be locked and the flush lock too taken by caller.
+ * The flush lock will not be unlocked until the dquot reaches the disk,
+ * but the dquot is free to be unlocked and modified by the caller
+ * in the interim. Dquot is still locked on return. This behavior is
+ * identical to that of inodes.
+ */
+int
+xfs_qm_dqflush(
+       xfs_dquot_t             *dqp,
+       uint                    flags)
+{
+       struct xfs_mount        *mp = dqp->q_mount;
+       struct xfs_buf          *bp;
+       struct xfs_disk_dquot   *ddqp;
+       int                     error;
+
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+       ASSERT(!completion_done(&dqp->q_flush));
+
+       trace_xfs_dqflush(dqp);
+
+       /*
+        * If not dirty, or it's pinned and we are not supposed to block, nada.
+        */
+       if (!XFS_DQ_IS_DIRTY(dqp) ||
+           (!(flags & SYNC_WAIT) && atomic_read(&dqp->q_pincount) > 0)) {
+               xfs_dqfunlock(dqp);
+               return 0;
+       }
+       xfs_qm_dqunpin_wait(dqp);
+
+       /*
+        * This may have been unpinned because the filesystem is shutting
+        * down forcibly. If that's the case we must not write this dquot
+        * to disk, because the log record didn't make it to disk!
+        */
+       if (XFS_FORCED_SHUTDOWN(mp)) {
+               dqp->dq_flags &= ~XFS_DQ_DIRTY;
+               xfs_dqfunlock(dqp);
+               return XFS_ERROR(EIO);
+       }
+
+       /*
+        * Get the buffer containing the on-disk dquot
+        */
+       error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
+                                  mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+       if (error) {
+               ASSERT(error != ENOENT);
+               xfs_dqfunlock(dqp);
+               return error;
+       }
+
+       /*
+        * Calculate the location of the dquot inside the buffer.
+        */
+       ddqp = bp->b_addr + dqp->q_bufoffset;
+
+       /*
+        * A simple sanity check in case we got a corrupted dquot..
+        */
+       error = xfs_qm_dqcheck(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0,
+                          XFS_QMOPT_DOWARN, "dqflush (incore copy)");
+       if (error) {
+               xfs_buf_relse(bp);
+               xfs_dqfunlock(dqp);
+               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               return XFS_ERROR(EIO);
+       }
+
+       /* This is the only portion of data that needs to persist */
+       memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
+
+       /*
+        * Clear the dirty field and remember the flush lsn for later use.
+        */
+       dqp->dq_flags &= ~XFS_DQ_DIRTY;
+
+       xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
+                                       &dqp->q_logitem.qli_item.li_lsn);
+
+       /*
+        * Attach an iodone routine so that we can remove this dquot from the
+        * AIL and release the flush lock once the dquot is synced to disk.
+        */
+       xfs_buf_attach_iodone(bp, xfs_qm_dqflush_done,
+                                 &dqp->q_logitem.qli_item);
+
+       /*
+        * If the buffer is pinned then push on the log so we won't
+        * get stuck waiting in the write for too long.
+        */
+       if (xfs_buf_ispinned(bp)) {
+               trace_xfs_dqflush_force(dqp);
+               xfs_log_force(mp, 0);
+       }
+
+       if (flags & SYNC_WAIT)
+               error = xfs_bwrite(mp, bp);
+       else
+               xfs_bdwrite(mp, bp);
+
+       trace_xfs_dqflush_done(dqp);
+
+       /*
+        * dqp is still locked, but caller is free to unlock it now.
+        */
+       return error;
+
+}
+
+int
+xfs_qm_dqlock_nowait(
+       xfs_dquot_t *dqp)
+{
+       return mutex_trylock(&dqp->q_qlock);
+}
+
+void
+xfs_dqlock(
+       xfs_dquot_t *dqp)
+{
+       mutex_lock(&dqp->q_qlock);
+}
+
+void
+xfs_dqunlock(
+       xfs_dquot_t *dqp)
+{
+       mutex_unlock(&(dqp->q_qlock));
+       if (dqp->q_logitem.qli_dquot == dqp) {
+               /* Once was dqp->q_mount, but might just have been cleared */
+               xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
+                                       (xfs_log_item_t*)&(dqp->q_logitem));
+       }
+}
+
+
+void
+xfs_dqunlock_nonotify(
+       xfs_dquot_t *dqp)
+{
+       mutex_unlock(&(dqp->q_qlock));
+}
+
+/*
+ * Lock two xfs_dquot structures.
+ *
+ * To avoid deadlocks we always lock the quota structure with
+ * the lowerd id first.
+ */
+void
+xfs_dqlock2(
+       xfs_dquot_t     *d1,
+       xfs_dquot_t     *d2)
+{
+       if (d1 && d2) {
+               ASSERT(d1 != d2);
+               if (be32_to_cpu(d1->q_core.d_id) >
+                   be32_to_cpu(d2->q_core.d_id)) {
+                       mutex_lock(&d2->q_qlock);
+                       mutex_lock_nested(&d1->q_qlock, XFS_QLOCK_NESTED);
+               } else {
+                       mutex_lock(&d1->q_qlock);
+                       mutex_lock_nested(&d2->q_qlock, XFS_QLOCK_NESTED);
+               }
+       } else if (d1) {
+               mutex_lock(&d1->q_qlock);
+       } else if (d2) {
+               mutex_lock(&d2->q_qlock);
+       }
+}
+
+
+/*
+ * Take a dquot out of the mount's dqlist as well as the hashlist.
+ * This is called via unmount as well as quotaoff, and the purge
+ * will always succeed unless there are soft (temp) references
+ * outstanding.
+ *
+ * This returns 0 if it was purged, 1 if it wasn't. It's not an error code
+ * that we're returning! XXXsup - not cool.
+ */
+/* ARGSUSED */
+int
+xfs_qm_dqpurge(
+       xfs_dquot_t     *dqp)
+{
+       xfs_dqhash_t    *qh = dqp->q_hash;
+       xfs_mount_t     *mp = dqp->q_mount;
+
+       ASSERT(mutex_is_locked(&mp->m_quotainfo->qi_dqlist_lock));
+       ASSERT(mutex_is_locked(&dqp->q_hash->qh_lock));
+
+       xfs_dqlock(dqp);
+       /*
+        * We really can't afford to purge a dquot that is
+        * referenced, because these are hard refs.
+        * It shouldn't happen in general because we went thru _all_ inodes in
+        * dqrele_all_inodes before calling this and didn't let the mountlock go.
+        * However it is possible that we have dquots with temporary
+        * references that are not attached to an inode. e.g. see xfs_setattr().
+        */
+       if (dqp->q_nrefs != 0) {
+               xfs_dqunlock(dqp);
+               mutex_unlock(&dqp->q_hash->qh_lock);
+               return (1);
+       }
+
+       ASSERT(!list_empty(&dqp->q_freelist));
+
+       /*
+        * If we're turning off quotas, we have to make sure that, for
+        * example, we don't delete quota disk blocks while dquots are
+        * in the process of getting written to those disk blocks.
+        * This dquot might well be on AIL, and we can't leave it there
+        * if we're turning off quotas. Basically, we need this flush
+        * lock, and are willing to block on it.
+        */
+       if (!xfs_dqflock_nowait(dqp)) {
+               /*
+                * Block on the flush lock after nudging dquot buffer,
+                * if it is incore.
+                */
+               xfs_qm_dqflock_pushbuf_wait(dqp);
+       }
+
+       /*
+        * XXXIf we're turning this type of quotas off, we don't care
+        * about the dirty metadata sitting in this dquot. OTOH, if
+        * we're unmounting, we do care, so we flush it and wait.
+        */
+       if (XFS_DQ_IS_DIRTY(dqp)) {
+               int     error;
+
+               /* dqflush unlocks dqflock */
+               /*
+                * Given that dqpurge is a very rare occurrence, it is OK
+                * that we're holding the hashlist and mplist locks
+                * across the disk write. But, ... XXXsup
+                *
+                * We don't care about getting disk errors here. We need
+                * to purge this dquot anyway, so we go ahead regardless.
+                */
+               error = xfs_qm_dqflush(dqp, SYNC_WAIT);
+               if (error)
+                       xfs_warn(mp, "%s: dquot %p flush failed",
+                               __func__, dqp);
+               xfs_dqflock(dqp);
+       }
+       ASSERT(atomic_read(&dqp->q_pincount) == 0);
+       ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
+              !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
+
+       list_del_init(&dqp->q_hashlist);
+       qh->qh_version++;
+       list_del_init(&dqp->q_mplist);
+       mp->m_quotainfo->qi_dqreclaims++;
+       mp->m_quotainfo->qi_dquots--;
+       /*
+        * XXX Move this to the front of the freelist, if we can get the
+        * freelist lock.
+        */
+       ASSERT(!list_empty(&dqp->q_freelist));
+
+       dqp->q_mount = NULL;
+       dqp->q_hash = NULL;
+       dqp->dq_flags = XFS_DQ_INACTIVE;
+       memset(&dqp->q_core, 0, sizeof(dqp->q_core));
+       xfs_dqfunlock(dqp);
+       xfs_dqunlock(dqp);
+       mutex_unlock(&qh->qh_lock);
+       return (0);
+}
+
+
+/*
+ * Give the buffer a little push if it is incore and
+ * wait on the flush lock.
+ */
+void
+xfs_qm_dqflock_pushbuf_wait(
+       xfs_dquot_t     *dqp)
+{
+       xfs_mount_t     *mp = dqp->q_mount;
+       xfs_buf_t       *bp;
+
+       /*
+        * Check to see if the dquot has been flushed delayed
+        * write.  If so, grab its buffer and send it
+        * out immediately.  We'll be able to acquire
+        * the flush lock when the I/O completes.
+        */
+       bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
+                       mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
+       if (!bp)
+               goto out_lock;
+
+       if (XFS_BUF_ISDELAYWRITE(bp)) {
+               if (xfs_buf_ispinned(bp))
+                       xfs_log_force(mp, 0);
+               xfs_buf_delwri_promote(bp);
+               wake_up_process(bp->b_target->bt_task);
+       }
+       xfs_buf_relse(bp);
+out_lock:
+       xfs_dqflock(dqp);
+}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
new file mode 100644 (file)
index 0000000..34b7e94
--- /dev/null
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DQUOT_H__
+#define __XFS_DQUOT_H__
+
+/*
+ * Dquots are structures that hold quota information about a user or a group,
+ * much like inodes are for files. In fact, dquots share many characteristics
+ * with inodes. However, dquots can also be a centralized resource, relative
+ * to a collection of inodes. In this respect, dquots share some characteristics
+ * of the superblock.
+ * XFS dquots exploit both those in its algorithms. They make every attempt
+ * to not be a bottleneck when quotas are on and have minimal impact, if any,
+ * when quotas are off.
+ */
+
+/*
+ * The hash chain headers (hash buckets)
+ */
+typedef struct xfs_dqhash {
+       struct list_head  qh_list;
+       struct mutex      qh_lock;
+       uint              qh_version;   /* ever increasing version */
+       uint              qh_nelems;    /* number of dquots on the list */
+} xfs_dqhash_t;
+
+struct xfs_mount;
+struct xfs_trans;
+
+/*
+ * The incore dquot structure
+ */
+typedef struct xfs_dquot {
+       uint             dq_flags;      /* various flags (XFS_DQ_*) */
+       struct list_head q_freelist;    /* global free list of dquots */
+       struct list_head q_mplist;      /* mount's list of dquots */
+       struct list_head q_hashlist;    /* gloabl hash list of dquots */
+       xfs_dqhash_t    *q_hash;        /* the hashchain header */
+       struct xfs_mount*q_mount;       /* filesystem this relates to */
+       struct xfs_trans*q_transp;      /* trans this belongs to currently */
+       uint             q_nrefs;       /* # active refs from inodes */
+       xfs_daddr_t      q_blkno;       /* blkno of dquot buffer */
+       int              q_bufoffset;   /* off of dq in buffer (# dquots) */
+       xfs_fileoff_t    q_fileoffset;  /* offset in quotas file */
+
+       struct xfs_dquot*q_gdquot;      /* group dquot, hint only */
+       xfs_disk_dquot_t q_core;        /* actual usage & quotas */
+       xfs_dq_logitem_t q_logitem;     /* dquot log item */
+       xfs_qcnt_t       q_res_bcount;  /* total regular nblks used+reserved */
+       xfs_qcnt_t       q_res_icount;  /* total inos allocd+reserved */
+       xfs_qcnt_t       q_res_rtbcount;/* total realtime blks used+reserved */
+       struct mutex     q_qlock;       /* quota lock */
+       struct completion q_flush;      /* flush completion queue */
+       atomic_t          q_pincount;   /* dquot pin count */
+       wait_queue_head_t q_pinwait;    /* dquot pinning wait queue */
+} xfs_dquot_t;
+
+/*
+ * Lock hierarchy for q_qlock:
+ *     XFS_QLOCK_NORMAL is the implicit default,
+ *     XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ */
+enum {
+       XFS_QLOCK_NORMAL = 0,
+       XFS_QLOCK_NESTED,
+};
+
+#define XFS_DQHOLD(dqp)                ((dqp)->q_nrefs++)
+
+/*
+ * Manage the q_flush completion queue embedded in the dquot.  This completion
+ * queue synchronizes processes attempting to flush the in-core dquot back to
+ * disk.
+ */
+static inline void xfs_dqflock(xfs_dquot_t *dqp)
+{
+       wait_for_completion(&dqp->q_flush);
+}
+
+static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
+{
+       return try_wait_for_completion(&dqp->q_flush);
+}
+
+static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+{
+       complete(&dqp->q_flush);
+}
+
+#define XFS_DQ_IS_LOCKED(dqp)  (mutex_is_locked(&((dqp)->q_qlock)))
+#define XFS_DQ_IS_DIRTY(dqp)   ((dqp)->dq_flags & XFS_DQ_DIRTY)
+#define XFS_QM_ISUDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_USER)
+#define XFS_QM_ISPDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_PROJ)
+#define XFS_QM_ISGDQ(dqp)      ((dqp)->dq_flags & XFS_DQ_GROUP)
+#define XFS_DQ_TO_QINF(dqp)    ((dqp)->q_mount->m_quotainfo)
+#define XFS_DQ_TO_QIP(dqp)     (XFS_QM_ISUDQ(dqp) ? \
+                                XFS_DQ_TO_QINF(dqp)->qi_uquotaip : \
+                                XFS_DQ_TO_QINF(dqp)->qi_gquotaip)
+
+#define XFS_IS_THIS_QUOTA_OFF(d) (! (XFS_QM_ISUDQ(d) ? \
+                                    (XFS_IS_UQUOTA_ON((d)->q_mount)) : \
+                                    (XFS_IS_OQUOTA_ON((d)->q_mount))))
+
+extern void            xfs_qm_dqdestroy(xfs_dquot_t *);
+extern int             xfs_qm_dqflush(xfs_dquot_t *, uint);
+extern int             xfs_qm_dqpurge(xfs_dquot_t *);
+extern void            xfs_qm_dqunpin_wait(xfs_dquot_t *);
+extern int             xfs_qm_dqlock_nowait(xfs_dquot_t *);
+extern void            xfs_qm_dqflock_pushbuf_wait(xfs_dquot_t *dqp);
+extern void            xfs_qm_adjust_dqtimers(xfs_mount_t *,
+                                       xfs_disk_dquot_t *);
+extern void            xfs_qm_adjust_dqlimits(xfs_mount_t *,
+                                       xfs_disk_dquot_t *);
+extern int             xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
+                                       xfs_dqid_t, uint, uint, xfs_dquot_t **);
+extern void            xfs_qm_dqput(xfs_dquot_t *);
+extern void            xfs_dqlock(xfs_dquot_t *);
+extern void            xfs_dqlock2(xfs_dquot_t *, xfs_dquot_t *);
+extern void            xfs_dqunlock(xfs_dquot_t *);
+extern void            xfs_dqunlock_nonotify(xfs_dquot_t *);
+
+#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
new file mode 100644 (file)
index 0000000..9e0e2fa
--- /dev/null
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+
+static inline struct xfs_dq_logitem *DQUOT_ITEM(struct xfs_log_item *lip)
+{
+       return container_of(lip, struct xfs_dq_logitem, qli_item);
+}
+
+/*
+ * returns the number of iovecs needed to log the given dquot item.
+ */
+STATIC uint
+xfs_qm_dquot_logitem_size(
+       struct xfs_log_item     *lip)
+{
+       /*
+        * we need only two iovecs, one for the format, one for the real thing
+        */
+       return 2;
+}
+
+/*
+ * fills in the vector of log iovecs for the given dquot log item.
+ */
+STATIC void
+xfs_qm_dquot_logitem_format(
+       struct xfs_log_item     *lip,
+       struct xfs_log_iovec    *logvec)
+{
+       struct xfs_dq_logitem   *qlip = DQUOT_ITEM(lip);
+
+       logvec->i_addr = &qlip->qli_format;
+       logvec->i_len  = sizeof(xfs_dq_logformat_t);
+       logvec->i_type = XLOG_REG_TYPE_QFORMAT;
+       logvec++;
+       logvec->i_addr = &qlip->qli_dquot->q_core;
+       logvec->i_len  = sizeof(xfs_disk_dquot_t);
+       logvec->i_type = XLOG_REG_TYPE_DQUOT;
+
+       ASSERT(2 == lip->li_desc->lid_size);
+       qlip->qli_format.qlf_size = 2;
+
+}
+
+/*
+ * Increment the pin count of the given dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pin(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+       atomic_inc(&dqp->q_pincount);
+}
+
+/*
+ * Decrement the pin count of the given dquot, and wake up
+ * anyone in xfs_dqwait_unpin() if the count goes to 0.         The
+ * dquot must have been previously pinned with a call to
+ * xfs_qm_dquot_logitem_pin().
+ */
+STATIC void
+xfs_qm_dquot_logitem_unpin(
+       struct xfs_log_item     *lip,
+       int                     remove)
+{
+       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+       ASSERT(atomic_read(&dqp->q_pincount) > 0);
+       if (atomic_dec_and_test(&dqp->q_pincount))
+               wake_up(&dqp->q_pinwait);
+}
+
+/*
+ * Given the logitem, this writes the corresponding dquot entry to disk
+ * asynchronously. This is called with the dquot entry securely locked;
+ * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
+ * at the end.
+ */
+STATIC void
+xfs_qm_dquot_logitem_push(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
+       int                     error;
+
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+       ASSERT(!completion_done(&dqp->q_flush));
+
+       /*
+        * Since we were able to lock the dquot's flush lock and
+        * we found it on the AIL, the dquot must be dirty.  This
+        * is because the dquot is removed from the AIL while still
+        * holding the flush lock in xfs_dqflush_done().  Thus, if
+        * we found it in the AIL and were able to obtain the flush
+        * lock without sleeping, then there must not have been
+        * anyone in the process of flushing the dquot.
+        */
+       error = xfs_qm_dqflush(dqp, 0);
+       if (error)
+               xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
+                       __func__, error, dqp);
+       xfs_dqunlock(dqp);
+}
+
+STATIC xfs_lsn_t
+xfs_qm_dquot_logitem_committed(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+       /*
+        * We always re-log the entire dquot when it becomes dirty,
+        * so, the latest copy _is_ the only one that matters.
+        */
+       return lsn;
+}
+
+/*
+ * This is called to wait for the given dquot to be unpinned.
+ * Most of these pin/unpin routines are plagiarized from inode code.
+ */
+void
+xfs_qm_dqunpin_wait(
+       struct xfs_dquot        *dqp)
+{
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+       if (atomic_read(&dqp->q_pincount) == 0)
+               return;
+
+       /*
+        * Give the log a push so we don't wait here too long.
+        */
+       xfs_log_force(dqp->q_mount, 0);
+       wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
+}
+
+/*
+ * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
+ * the dquot is locked by us, but the flush lock isn't. So, here we are
+ * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
+ * If so, we want to push it out to help us take this item off the AIL as soon
+ * as possible.
+ *
+ * We must not be holding the AIL lock at this point. Calling incore() to
+ * search the buffer cache can be a time consuming thing, and AIL lock is a
+ * spinlock.
+ */
+STATIC void
+xfs_qm_dquot_logitem_pushbuf(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_dq_logitem   *qlip = DQUOT_ITEM(lip);
+       struct xfs_dquot        *dqp = qlip->qli_dquot;
+       struct xfs_buf          *bp;
+
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+       /*
+        * If flushlock isn't locked anymore, chances are that the
+        * inode flush completed and the inode was taken off the AIL.
+        * So, just get out.
+        */
+       if (completion_done(&dqp->q_flush) ||
+           !(lip->li_flags & XFS_LI_IN_AIL)) {
+               xfs_dqunlock(dqp);
+               return;
+       }
+
+       bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
+                       dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
+       xfs_dqunlock(dqp);
+       if (!bp)
+               return;
+       if (XFS_BUF_ISDELAYWRITE(bp))
+               xfs_buf_delwri_promote(bp);
+       xfs_buf_relse(bp);
+}
+
+/*
+ * This is called to attempt to lock the dquot associated with this
+ * dquot log item.  Don't sleep on the dquot lock or the flush lock.
+ * If the flush lock is already held, indicating that the dquot has
+ * been or is in the process of being flushed, then see if we can
+ * find the dquot's buffer in the buffer cache without sleeping.  If
+ * we can and it is marked delayed write, then we want to send it out.
+ * We delay doing so until the push routine, though, to avoid sleeping
+ * in any device strategy routines.
+ */
+STATIC uint
+xfs_qm_dquot_logitem_trylock(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+       if (atomic_read(&dqp->q_pincount) > 0)
+               return XFS_ITEM_PINNED;
+
+       if (!xfs_qm_dqlock_nowait(dqp))
+               return XFS_ITEM_LOCKED;
+
+       if (!xfs_dqflock_nowait(dqp)) {
+               /*
+                * dquot has already been flushed to the backing buffer,
+                * leave it locked, pushbuf routine will unlock it.
+                */
+               return XFS_ITEM_PUSHBUF;
+       }
+
+       ASSERT(lip->li_flags & XFS_LI_IN_AIL);
+       return XFS_ITEM_SUCCESS;
+}
+
+/*
+ * Unlock the dquot associated with the log item.
+ * Clear the fields of the dquot and dquot log item that
+ * are specific to the current transaction.  If the
+ * hold flags is set, do not unlock the dquot.
+ */
+STATIC void
+xfs_qm_dquot_logitem_unlock(
+       struct xfs_log_item     *lip)
+{
+       struct xfs_dquot        *dqp = DQUOT_ITEM(lip)->qli_dquot;
+
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+       /*
+        * Clear the transaction pointer in the dquot
+        */
+       dqp->q_transp = NULL;
+
+       /*
+        * dquots are never 'held' from getting unlocked at the end of
+        * a transaction.  Their locking and unlocking is hidden inside the
+        * transaction layer, within trans_commit. Hence, no LI_HOLD flag
+        * for the logitem.
+        */
+       xfs_dqunlock(dqp);
+}
+
+/*
+ * this needs to stamp an lsn into the dquot, I think.
+ * rpc's that look at user dquot's would then have to
+ * push on the dependency recorded in the dquot
+ */
+STATIC void
+xfs_qm_dquot_logitem_committing(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+}
+
+/*
+ * This is the ops vector for dquots
+ */
+static struct xfs_item_ops xfs_dquot_item_ops = {
+       .iop_size       = xfs_qm_dquot_logitem_size,
+       .iop_format     = xfs_qm_dquot_logitem_format,
+       .iop_pin        = xfs_qm_dquot_logitem_pin,
+       .iop_unpin      = xfs_qm_dquot_logitem_unpin,
+       .iop_trylock    = xfs_qm_dquot_logitem_trylock,
+       .iop_unlock     = xfs_qm_dquot_logitem_unlock,
+       .iop_committed  = xfs_qm_dquot_logitem_committed,
+       .iop_push       = xfs_qm_dquot_logitem_push,
+       .iop_pushbuf    = xfs_qm_dquot_logitem_pushbuf,
+       .iop_committing = xfs_qm_dquot_logitem_committing
+};
+
+/*
+ * Initialize the dquot log item for a newly allocated dquot.
+ * The dquot isn't locked at this point, but it isn't on any of the lists
+ * either, so we don't care.
+ */
+void
+xfs_qm_dquot_logitem_init(
+       struct xfs_dquot        *dqp)
+{
+       struct xfs_dq_logitem   *lp = &dqp->q_logitem;
+
+       xfs_log_item_init(dqp->q_mount, &lp->qli_item, XFS_LI_DQUOT,
+                                       &xfs_dquot_item_ops);
+       lp->qli_dquot = dqp;
+       lp->qli_format.qlf_type = XFS_LI_DQUOT;
+       lp->qli_format.qlf_id = be32_to_cpu(dqp->q_core.d_id);
+       lp->qli_format.qlf_blkno = dqp->q_blkno;
+       lp->qli_format.qlf_len = 1;
+       /*
+        * This is just the offset of this dquot within its buffer
+        * (which is currently 1 FSB and probably won't change).
+        * Hence 32 bits for this offset should be just fine.
+        * Alternatively, we can store (bufoffset / sizeof(xfs_dqblk_t))
+        * here, and recompute it at recovery time.
+        */
+       lp->qli_format.qlf_boffset = (__uint32_t)dqp->q_bufoffset;
+}
+
+/*------------------  QUOTAOFF LOG ITEMS  -------------------*/
+
+static inline struct xfs_qoff_logitem *QOFF_ITEM(struct xfs_log_item *lip)
+{
+       return container_of(lip, struct xfs_qoff_logitem, qql_item);
+}
+
+
+/*
+ * This returns the number of iovecs needed to log the given quotaoff item.
+ * We only need 1 iovec for an quotaoff item.  It just logs the
+ * quotaoff_log_format structure.
+ */
+STATIC uint
+xfs_qm_qoff_logitem_size(
+       struct xfs_log_item     *lip)
+{
+       return 1;
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given quotaoff log item. We use only 1 iovec, and we point that
+ * at the quotaoff_log_format structure embedded in the quotaoff item.
+ * It is at this point that we assert that all of the extent
+ * slots in the quotaoff item have been filled.
+ */
+STATIC void
+xfs_qm_qoff_logitem_format(
+       struct xfs_log_item     *lip,
+       struct xfs_log_iovec    *log_vector)
+{
+       struct xfs_qoff_logitem *qflip = QOFF_ITEM(lip);
+
+       ASSERT(qflip->qql_format.qf_type == XFS_LI_QUOTAOFF);
+
+       log_vector->i_addr = &qflip->qql_format;
+       log_vector->i_len = sizeof(xfs_qoff_logitem_t);
+       log_vector->i_type = XLOG_REG_TYPE_QUOTAOFF;
+       qflip->qql_format.qf_size = 1;
+}
+
+/*
+ * Pinning has no meaning for an quotaoff item, so just return.
+ */
+STATIC void
+xfs_qm_qoff_logitem_pin(
+       struct xfs_log_item     *lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an quotaoff item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_qm_qoff_logitem_unpin(
+       struct xfs_log_item     *lip,
+       int                     remove)
+{
+}
+
+/*
+ * Quotaoff items have no locking, so just return success.
+ */
+STATIC uint
+xfs_qm_qoff_logitem_trylock(
+       struct xfs_log_item     *lip)
+{
+       return XFS_ITEM_LOCKED;
+}
+
+/*
+ * Quotaoff items have no locking or pushing, so return failure
+ * so that the caller doesn't bother with us.
+ */
+STATIC void
+xfs_qm_qoff_logitem_unlock(
+       struct xfs_log_item     *lip)
+{
+}
+
+/*
+ * The quotaoff-start-item is logged only once and cannot be moved in the log,
+ * so simply return the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_qm_qoff_logitem_committed(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+       return lsn;
+}
+
+/*
+ * There isn't much you can do to push on an quotaoff item.  It is simply
+ * stuck waiting for the log to be flushed to disk.
+ */
+STATIC void
+xfs_qm_qoff_logitem_push(
+       struct xfs_log_item     *lip)
+{
+}
+
+
+STATIC xfs_lsn_t
+xfs_qm_qoffend_logitem_committed(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               lsn)
+{
+       struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
+       struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
+       struct xfs_ail          *ailp = qfs->qql_item.li_ailp;
+
+       /*
+        * Delete the qoff-start logitem from the AIL.
+        * xfs_trans_ail_delete() drops the AIL lock.
+        */
+       spin_lock(&ailp->xa_lock);
+       xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
+
+       kmem_free(qfs);
+       kmem_free(qfe);
+       return (xfs_lsn_t)-1;
+}
+
+/*
+ * XXX rcc - don't know quite what to do with this.  I think we can
+ * just ignore it.  The only time that isn't the case is if we allow
+ * the client to somehow see that quotas have been turned off in which
+ * we can't allow that to get back until the quotaoff hits the disk.
+ * So how would that happen?  Also, do we need different routines for
+ * quotaoff start and quotaoff end?  I suspect the answer is yes but
+ * to be sure, I need to look at the recovery code and see how quota off
+ * recovery is handled (do we roll forward or back or do something else).
+ * If we roll forwards or backwards, then we need two separate routines,
+ * one that does nothing and one that stamps in the lsn that matters
+ * (truly makes the quotaoff irrevocable).  If we do something else,
+ * then maybe we don't need two.
+ */
+STATIC void
+xfs_qm_qoff_logitem_committing(
+       struct xfs_log_item     *lip,
+       xfs_lsn_t               commit_lsn)
+{
+}
+
+static struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
+       .iop_size       = xfs_qm_qoff_logitem_size,
+       .iop_format     = xfs_qm_qoff_logitem_format,
+       .iop_pin        = xfs_qm_qoff_logitem_pin,
+       .iop_unpin      = xfs_qm_qoff_logitem_unpin,
+       .iop_trylock    = xfs_qm_qoff_logitem_trylock,
+       .iop_unlock     = xfs_qm_qoff_logitem_unlock,
+       .iop_committed  = xfs_qm_qoffend_logitem_committed,
+       .iop_push       = xfs_qm_qoff_logitem_push,
+       .iop_committing = xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * This is the ops vector shared by all quotaoff-start log items.
+ */
+static struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
+       .iop_size       = xfs_qm_qoff_logitem_size,
+       .iop_format     = xfs_qm_qoff_logitem_format,
+       .iop_pin        = xfs_qm_qoff_logitem_pin,
+       .iop_unpin      = xfs_qm_qoff_logitem_unpin,
+       .iop_trylock    = xfs_qm_qoff_logitem_trylock,
+       .iop_unlock     = xfs_qm_qoff_logitem_unlock,
+       .iop_committed  = xfs_qm_qoff_logitem_committed,
+       .iop_push       = xfs_qm_qoff_logitem_push,
+       .iop_committing = xfs_qm_qoff_logitem_committing
+};
+
+/*
+ * Allocate and initialize an quotaoff item of the correct quota type(s).
+ */
+struct xfs_qoff_logitem *
+xfs_qm_qoff_logitem_init(
+       struct xfs_mount        *mp,
+       struct xfs_qoff_logitem *start,
+       uint                    flags)
+{
+       struct xfs_qoff_logitem *qf;
+
+       qf = kmem_zalloc(sizeof(struct xfs_qoff_logitem), KM_SLEEP);
+
+       xfs_log_item_init(mp, &qf->qql_item, XFS_LI_QUOTAOFF, start ?
+                       &xfs_qm_qoffend_logitem_ops : &xfs_qm_qoff_logitem_ops);
+       qf->qql_item.li_mountp = mp;
+       qf->qql_format.qf_type = XFS_LI_QUOTAOFF;
+       qf->qql_format.qf_flags = flags;
+       qf->qql_start_lip = start;
+       return qf;
+}
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
new file mode 100644 (file)
index 0000000..5acae2a
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_DQUOT_ITEM_H__
+#define __XFS_DQUOT_ITEM_H__
+
+struct xfs_dquot;
+struct xfs_trans;
+struct xfs_mount;
+struct xfs_qoff_logitem;
+
+typedef struct xfs_dq_logitem {
+       xfs_log_item_t           qli_item;         /* common portion */
+       struct xfs_dquot        *qli_dquot;        /* dquot ptr */
+       xfs_lsn_t                qli_flush_lsn;    /* lsn at last flush */
+       xfs_dq_logformat_t       qli_format;       /* logged structure */
+} xfs_dq_logitem_t;
+
+typedef struct xfs_qoff_logitem {
+       xfs_log_item_t           qql_item;      /* common portion */
+       struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
+       xfs_qoff_logformat_t     qql_format;    /* logged structure */
+} xfs_qoff_logitem_t;
+
+
+extern void               xfs_qm_dquot_logitem_init(struct xfs_dquot *);
+extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
+                                       struct xfs_qoff_logitem *, uint);
+extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
+                                       struct xfs_qoff_logitem *, uint);
+extern void               xfs_trans_log_quotaoff_item(struct xfs_trans *,
+                                       struct xfs_qoff_logitem *);
+
+#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
new file mode 100644 (file)
index 0000000..75e5d32
--- /dev/null
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_types.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_mount.h"
+#include "xfs_export.h"
+#include "xfs_vnodeops.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+
+/*
+ * Note that we only accept fileids which are long enough rather than allow
+ * the parent generation number to default to zero.  XFS considers zero a
+ * valid generation number not an invalid/wildcard value.
+ */
+static int xfs_fileid_length(int fileid_type)
+{
+       switch (fileid_type) {
+       case FILEID_INO32_GEN:
+               return 2;
+       case FILEID_INO32_GEN_PARENT:
+               return 4;
+       case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+               return 3;
+       case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+               return 6;
+       }
+       return 255; /* invalid */
+}
+
+STATIC int
+xfs_fs_encode_fh(
+       struct dentry           *dentry,
+       __u32                   *fh,
+       int                     *max_len,
+       int                     connectable)
+{
+       struct fid              *fid = (struct fid *)fh;
+       struct xfs_fid64        *fid64 = (struct xfs_fid64 *)fh;
+       struct inode            *inode = dentry->d_inode;
+       int                     fileid_type;
+       int                     len;
+
+       /* Directories don't need their parent encoded, they have ".." */
+       if (S_ISDIR(inode->i_mode) || !connectable)
+               fileid_type = FILEID_INO32_GEN;
+       else
+               fileid_type = FILEID_INO32_GEN_PARENT;
+
+       /*
+        * If the the filesystem may contain 64bit inode numbers, we need
+        * to use larger file handles that can represent them.
+        *
+        * While we only allocate inodes that do not fit into 32 bits any
+        * large enough filesystem may contain them, thus the slightly
+        * confusing looking conditional below.
+        */
+       if (!(XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_SMALL_INUMS) ||
+           (XFS_M(inode->i_sb)->m_flags & XFS_MOUNT_32BITINODES))
+               fileid_type |= XFS_FILEID_TYPE_64FLAG;
+
+       /*
+        * Only encode if there is enough space given.  In practice
+        * this means we can't export a filesystem with 64bit inodes
+        * over NFSv2 with the subtree_check export option; the other
+        * seven combinations work.  The real answer is "don't use v2".
+        */
+       len = xfs_fileid_length(fileid_type);
+       if (*max_len < len) {
+               *max_len = len;
+               return 255;
+       }
+       *max_len = len;
+
+       switch (fileid_type) {
+       case FILEID_INO32_GEN_PARENT:
+               spin_lock(&dentry->d_lock);
+               fid->i32.parent_ino = dentry->d_parent->d_inode->i_ino;
+               fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
+               spin_unlock(&dentry->d_lock);
+               /*FALLTHRU*/
+       case FILEID_INO32_GEN:
+               fid->i32.ino = inode->i_ino;
+               fid->i32.gen = inode->i_generation;
+               break;
+       case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+               spin_lock(&dentry->d_lock);
+               fid64->parent_ino = dentry->d_parent->d_inode->i_ino;
+               fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
+               spin_unlock(&dentry->d_lock);
+               /*FALLTHRU*/
+       case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+               fid64->ino = inode->i_ino;
+               fid64->gen = inode->i_generation;
+               break;
+       }
+
+       return fileid_type;
+}
+
+STATIC struct inode *
+xfs_nfs_get_inode(
+       struct super_block      *sb,
+       u64                     ino,
+       u32                     generation)
+ {
+       xfs_mount_t             *mp = XFS_M(sb);
+       xfs_inode_t             *ip;
+       int                     error;
+
+       /*
+        * NFS can sometimes send requests for ino 0.  Fail them gracefully.
+        */
+       if (ino == 0)
+               return ERR_PTR(-ESTALE);
+
+       /*
+        * The XFS_IGET_UNTRUSTED means that an invalid inode number is just
+        * fine and not an indication of a corrupted filesystem as clients can
+        * send invalid file handles and we have to handle it gracefully..
+        */
+       error = xfs_iget(mp, NULL, ino, XFS_IGET_UNTRUSTED, 0, &ip);
+       if (error) {
+               /*
+                * EINVAL means the inode cluster doesn't exist anymore.
+                * This implies the filehandle is stale, so we should
+                * translate it here.
+                * We don't use ESTALE directly down the chain to not
+                * confuse applications using bulkstat that expect EINVAL.
+                */
+               if (error == EINVAL || error == ENOENT)
+                       error = ESTALE;
+               return ERR_PTR(-error);
+       }
+
+       if (ip->i_d.di_gen != generation) {
+               IRELE(ip);
+               return ERR_PTR(-ESTALE);
+       }
+
+       return VFS_I(ip);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_dentry(struct super_block *sb, struct fid *fid,
+                int fh_len, int fileid_type)
+{
+       struct xfs_fid64        *fid64 = (struct xfs_fid64 *)fid;
+       struct inode            *inode = NULL;
+
+       if (fh_len < xfs_fileid_length(fileid_type))
+               return NULL;
+
+       switch (fileid_type) {
+       case FILEID_INO32_GEN_PARENT:
+       case FILEID_INO32_GEN:
+               inode = xfs_nfs_get_inode(sb, fid->i32.ino, fid->i32.gen);
+               break;
+       case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+       case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
+               inode = xfs_nfs_get_inode(sb, fid64->ino, fid64->gen);
+               break;
+       }
+
+       return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_fh_to_parent(struct super_block *sb, struct fid *fid,
+                int fh_len, int fileid_type)
+{
+       struct xfs_fid64        *fid64 = (struct xfs_fid64 *)fid;
+       struct inode            *inode = NULL;
+
+       switch (fileid_type) {
+       case FILEID_INO32_GEN_PARENT:
+               inode = xfs_nfs_get_inode(sb, fid->i32.parent_ino,
+                                             fid->i32.parent_gen);
+               break;
+       case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
+               inode = xfs_nfs_get_inode(sb, fid64->parent_ino,
+                                             fid64->parent_gen);
+               break;
+       }
+
+       return d_obtain_alias(inode);
+}
+
+STATIC struct dentry *
+xfs_fs_get_parent(
+       struct dentry           *child)
+{
+       int                     error;
+       struct xfs_inode        *cip;
+
+       error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL);
+       if (unlikely(error))
+               return ERR_PTR(-error);
+
+       return d_obtain_alias(VFS_I(cip));
+}
+
+STATIC int
+xfs_fs_nfs_commit_metadata(
+       struct inode            *inode)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     error = 0;
+
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       if (xfs_ipincount(ip)) {
+               error = _xfs_log_force_lsn(mp, ip->i_itemp->ili_last_lsn,
+                               XFS_LOG_SYNC, NULL);
+       }
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+       return error;
+}
+
+const struct export_operations xfs_export_operations = {
+       .encode_fh              = xfs_fs_encode_fh,
+       .fh_to_dentry           = xfs_fs_fh_to_dentry,
+       .fh_to_parent           = xfs_fs_fh_to_parent,
+       .get_parent             = xfs_fs_get_parent,
+       .commit_metadata        = xfs_fs_nfs_commit_metadata,
+};
diff --git a/fs/xfs/xfs_export.h b/fs/xfs/xfs_export.h
new file mode 100644 (file)
index 0000000..3272b6a
--- /dev/null
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_EXPORT_H__
+#define __XFS_EXPORT_H__
+
+/*
+ * Common defines for code related to exporting XFS filesystems over NFS.
+ *
+ * The NFS fileid goes out on the wire as an array of
+ * 32bit unsigned ints in host order.  There are 5 possible
+ * formats.
+ *
+ * (1) fileid_type=0x00
+ *     (no fileid data; handled by the generic code)
+ *
+ * (2) fileid_type=0x01
+ *     inode-num
+ *     generation
+ *
+ * (3) fileid_type=0x02
+ *     inode-num
+ *     generation
+ *     parent-inode-num
+ *     parent-generation
+ *
+ * (4) fileid_type=0x81
+ *     inode-num-lo32
+ *     inode-num-hi32
+ *     generation
+ *
+ * (5) fileid_type=0x82
+ *     inode-num-lo32
+ *     inode-num-hi32
+ *     generation
+ *     parent-inode-num-lo32
+ *     parent-inode-num-hi32
+ *     parent-generation
+ *
+ * Note, the NFS filehandle also includes an fsid portion which
+ * may have an inode number in it.  That number is hardcoded to
+ * 32bits and there is no way for XFS to intercept it.  In
+ * practice this means when exporting an XFS filesystem with 64bit
+ * inodes you should either export the mountpoint (rather than
+ * a subdirectory) or use the "fsid" export option.
+ */
+
+struct xfs_fid64 {
+       u64 ino;
+       u32 gen;
+       u64 parent_ino;
+       u32 parent_gen;
+} __attribute__((packed));
+
+/* This flag goes on the wire.  Don't play with it. */
+#define XFS_FILEID_TYPE_64FLAG 0x80    /* NFS fileid has 64bit inodes */
+
+#endif /* __XFS_EXPORT_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
new file mode 100644 (file)
index 0000000..7f7b424
--- /dev/null
@@ -0,0 +1,1096 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_trans.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_error.h"
+#include "xfs_vnodeops.h"
+#include "xfs_da_btree.h"
+#include "xfs_ioctl.h"
+#include "xfs_trace.h"
+
+#include <linux/dcache.h>
+#include <linux/falloc.h>
+
+static const struct vm_operations_struct xfs_file_vm_ops;
+
+/*
+ * Locking primitives for read and write IO paths to ensure we consistently use
+ * and order the inode->i_mutex, ip->i_lock and ip->i_iolock.
+ */
+static inline void
+xfs_rw_ilock(
+       struct xfs_inode        *ip,
+       int                     type)
+{
+       if (type & XFS_IOLOCK_EXCL)
+               mutex_lock(&VFS_I(ip)->i_mutex);
+       xfs_ilock(ip, type);
+}
+
+static inline void
+xfs_rw_iunlock(
+       struct xfs_inode        *ip,
+       int                     type)
+{
+       xfs_iunlock(ip, type);
+       if (type & XFS_IOLOCK_EXCL)
+               mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+static inline void
+xfs_rw_ilock_demote(
+       struct xfs_inode        *ip,
+       int                     type)
+{
+       xfs_ilock_demote(ip, type);
+       if (type & XFS_IOLOCK_EXCL)
+               mutex_unlock(&VFS_I(ip)->i_mutex);
+}
+
+/*
+ *     xfs_iozero
+ *
+ *     xfs_iozero clears the specified range of buffer supplied,
+ *     and marks all the affected blocks as valid and modified.  If
+ *     an affected block is not allocated, it will be allocated.  If
+ *     an affected block is not completely overwritten, and is not
+ *     valid before the operation, it will be read from disk before
+ *     being partially zeroed.
+ */
+STATIC int
+xfs_iozero(
+       struct xfs_inode        *ip,    /* inode                        */
+       loff_t                  pos,    /* offset in file               */
+       size_t                  count)  /* size of data to zero         */
+{
+       struct page             *page;
+       struct address_space    *mapping;
+       int                     status;
+
+       mapping = VFS_I(ip)->i_mapping;
+       do {
+               unsigned offset, bytes;
+               void *fsdata;
+
+               offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+               bytes = PAGE_CACHE_SIZE - offset;
+               if (bytes > count)
+                       bytes = count;
+
+               status = pagecache_write_begin(NULL, mapping, pos, bytes,
+                                       AOP_FLAG_UNINTERRUPTIBLE,
+                                       &page, &fsdata);
+               if (status)
+                       break;
+
+               zero_user(page, offset, bytes);
+
+               status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
+                                       page, fsdata);
+               WARN_ON(status <= 0); /* can't return less than zero! */
+               pos += bytes;
+               count -= bytes;
+               status = 0;
+       } while (count);
+
+       return (-status);
+}
+
+STATIC int
+xfs_file_fsync(
+       struct file             *file,
+       loff_t                  start,
+       loff_t                  end,
+       int                     datasync)
+{
+       struct inode            *inode = file->f_mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp;
+       int                     error = 0;
+       int                     log_flushed = 0;
+
+       trace_xfs_file_fsync(ip);
+
+       error = filemap_write_and_wait_range(inode->i_mapping, start, end);
+       if (error)
+               return error;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -XFS_ERROR(EIO);
+
+       xfs_iflags_clear(ip, XFS_ITRUNCATED);
+
+       xfs_ilock(ip, XFS_IOLOCK_SHARED);
+       xfs_ioend_wait(ip);
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+       if (mp->m_flags & XFS_MOUNT_BARRIER) {
+               /*
+                * If we have an RT and/or log subvolume we need to make sure
+                * to flush the write cache the device used for file data
+                * first.  This is to ensure newly written file data make
+                * it to disk before logging the new inode size in case of
+                * an extending write.
+                */
+               if (XFS_IS_REALTIME_INODE(ip))
+                       xfs_blkdev_issue_flush(mp->m_rtdev_targp);
+               else if (mp->m_logdev_targp != mp->m_ddev_targp)
+                       xfs_blkdev_issue_flush(mp->m_ddev_targp);
+       }
+
+       /*
+        * We always need to make sure that the required inode state is safe on
+        * disk.  The inode might be clean but we still might need to force the
+        * log because of committed transactions that haven't hit the disk yet.
+        * Likewise, there could be unflushed non-transactional changes to the
+        * inode core that have to go to disk and this requires us to issue
+        * a synchronous transaction to capture these changes correctly.
+        *
+        * This code relies on the assumption that if the i_update_core field
+        * of the inode is clear and the inode is unpinned then it is clean
+        * and no action is required.
+        */
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+       /*
+        * First check if the VFS inode is marked dirty.  All the dirtying
+        * of non-transactional updates no goes through mark_inode_dirty*,
+        * which allows us to distinguish beteeen pure timestamp updates
+        * and i_size updates which need to be caught for fdatasync.
+        * After that also theck for the dirty state in the XFS inode, which
+        * might gets cleared when the inode gets written out via the AIL
+        * or xfs_iflush_cluster.
+        */
+       if (((inode->i_state & I_DIRTY_DATASYNC) ||
+           ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+           ip->i_update_core) {
+               /*
+                * Kick off a transaction to log the inode core to get the
+                * updates.  The sync transaction will also force the log.
+                */
+               xfs_iunlock(ip, XFS_ILOCK_SHARED);
+               tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+               error = xfs_trans_reserve(tp, 0,
+                               XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+               if (error) {
+                       xfs_trans_cancel(tp, 0);
+                       return -error;
+               }
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+               /*
+                * Note - it's possible that we might have pushed ourselves out
+                * of the way during trans_reserve which would flush the inode.
+                * But there's no guarantee that the inode buffer has actually
+                * gone out yet (it's delwri).  Plus the buffer could be pinned
+                * anyway if it's part of an inode in another recent
+                * transaction.  So we play it safe and fire off the
+                * transaction anyway.
+                */
+               xfs_trans_ijoin(tp, ip);
+               xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+               xfs_trans_set_sync(tp);
+               error = _xfs_trans_commit(tp, 0, &log_flushed);
+
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       } else {
+               /*
+                * Timestamps/size haven't changed since last inode flush or
+                * inode transaction commit.  That means either nothing got
+                * written or a transaction committed which caught the updates.
+                * If the latter happened and the transaction hasn't hit the
+                * disk yet, the inode will be still be pinned.  If it is,
+                * force the log.
+                */
+               if (xfs_ipincount(ip)) {
+                       error = _xfs_log_force_lsn(mp,
+                                       ip->i_itemp->ili_last_lsn,
+                                       XFS_LOG_SYNC, &log_flushed);
+               }
+               xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       }
+
+       /*
+        * If we only have a single device, and the log force about was
+        * a no-op we might have to flush the data device cache here.
+        * This can only happen for fdatasync/O_DSYNC if we were overwriting
+        * an already allocated file and thus do not have any metadata to
+        * commit.
+        */
+       if ((mp->m_flags & XFS_MOUNT_BARRIER) &&
+           mp->m_logdev_targp == mp->m_ddev_targp &&
+           !XFS_IS_REALTIME_INODE(ip) &&
+           !log_flushed)
+               xfs_blkdev_issue_flush(mp->m_ddev_targp);
+
+       return -error;
+}
+
+STATIC ssize_t
+xfs_file_aio_read(
+       struct kiocb            *iocb,
+       const struct iovec      *iovp,
+       unsigned long           nr_segs,
+       loff_t                  pos)
+{
+       struct file             *file = iocb->ki_filp;
+       struct inode            *inode = file->f_mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       size_t                  size = 0;
+       ssize_t                 ret = 0;
+       int                     ioflags = 0;
+       xfs_fsize_t             n;
+       unsigned long           seg;
+
+       XFS_STATS_INC(xs_read_calls);
+
+       BUG_ON(iocb->ki_pos != pos);
+
+       if (unlikely(file->f_flags & O_DIRECT))
+               ioflags |= IO_ISDIRECT;
+       if (file->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
+
+       /* START copy & waste from filemap.c */
+       for (seg = 0; seg < nr_segs; seg++) {
+               const struct iovec *iv = &iovp[seg];
+
+               /*
+                * If any segment has a negative length, or the cumulative
+                * length ever wraps negative then return -EINVAL.
+                */
+               size += iv->iov_len;
+               if (unlikely((ssize_t)(size|iv->iov_len) < 0))
+                       return XFS_ERROR(-EINVAL);
+       }
+       /* END copy & waste from filemap.c */
+
+       if (unlikely(ioflags & IO_ISDIRECT)) {
+               xfs_buftarg_t   *target =
+                       XFS_IS_REALTIME_INODE(ip) ?
+                               mp->m_rtdev_targp : mp->m_ddev_targp;
+               if ((iocb->ki_pos & target->bt_smask) ||
+                   (size & target->bt_smask)) {
+                       if (iocb->ki_pos == ip->i_size)
+                               return 0;
+                       return -XFS_ERROR(EINVAL);
+               }
+       }
+
+       n = XFS_MAXIOFFSET(mp) - iocb->ki_pos;
+       if (n <= 0 || size == 0)
+               return 0;
+
+       if (n < size)
+               size = n;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -EIO;
+
+       if (unlikely(ioflags & IO_ISDIRECT)) {
+               xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
+
+               if (inode->i_mapping->nrpages) {
+                       ret = -xfs_flushinval_pages(ip,
+                                       (iocb->ki_pos & PAGE_CACHE_MASK),
+                                       -1, FI_REMAPF_LOCKED);
+                       if (ret) {
+                               xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+                               return ret;
+                       }
+               }
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+       } else
+               xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+
+       trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+
+       ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+       if (ret > 0)
+               XFS_STATS_ADD(xs_read_bytes, ret);
+
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_splice_read(
+       struct file             *infilp,
+       loff_t                  *ppos,
+       struct pipe_inode_info  *pipe,
+       size_t                  count,
+       unsigned int            flags)
+{
+       struct xfs_inode        *ip = XFS_I(infilp->f_mapping->host);
+       int                     ioflags = 0;
+       ssize_t                 ret;
+
+       XFS_STATS_INC(xs_read_calls);
+
+       if (infilp->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
+
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -EIO;
+
+       xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+
+       trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
+
+       ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+       if (ret > 0)
+               XFS_STATS_ADD(xs_read_bytes, ret);
+
+       xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+       return ret;
+}
+
+STATIC void
+xfs_aio_write_isize_update(
+       struct inode    *inode,
+       loff_t          *ppos,
+       ssize_t         bytes_written)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       xfs_fsize_t             isize = i_size_read(inode);
+
+       if (bytes_written > 0)
+               XFS_STATS_ADD(xs_write_bytes, bytes_written);
+
+       if (unlikely(bytes_written < 0 && bytes_written != -EFAULT &&
+                                       *ppos > isize))
+               *ppos = isize;
+
+       if (*ppos > ip->i_size) {
+               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+               if (*ppos > ip->i_size)
+                       ip->i_size = *ppos;
+               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+       }
+}
+
+/*
+ * If this was a direct or synchronous I/O that failed (such as ENOSPC) then
+ * part of the I/O may have been written to disk before the error occurred.  In
+ * this case the on-disk file size may have been adjusted beyond the in-memory
+ * file size and now needs to be truncated back.
+ */
+STATIC void
+xfs_aio_write_newsize_update(
+       struct xfs_inode        *ip)
+{
+       if (ip->i_new_size) {
+               xfs_rw_ilock(ip, XFS_ILOCK_EXCL);
+               ip->i_new_size = 0;
+               if (ip->i_d.di_size > ip->i_size)
+                       ip->i_d.di_size = ip->i_size;
+               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+       }
+}
+
+/*
+ * xfs_file_splice_write() does not use xfs_rw_ilock() because
+ * generic_file_splice_write() takes the i_mutex itself. This, in theory,
+ * couuld cause lock inversions between the aio_write path and the splice path
+ * if someone is doing concurrent splice(2) based writes and write(2) based
+ * writes to the same inode. The only real way to fix this is to re-implement
+ * the generic code here with correct locking orders.
+ */
+STATIC ssize_t
+xfs_file_splice_write(
+       struct pipe_inode_info  *pipe,
+       struct file             *outfilp,
+       loff_t                  *ppos,
+       size_t                  count,
+       unsigned int            flags)
+{
+       struct inode            *inode = outfilp->f_mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       xfs_fsize_t             new_size;
+       int                     ioflags = 0;
+       ssize_t                 ret;
+
+       XFS_STATS_INC(xs_write_calls);
+
+       if (outfilp->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
+
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -EIO;
+
+       xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+       new_size = *ppos + count;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       if (new_size > ip->i_size)
+               ip->i_new_size = new_size;
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
+
+       ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
+
+       xfs_aio_write_isize_update(inode, ppos, ret);
+       xfs_aio_write_newsize_update(ip);
+       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return ret;
+}
+
+/*
+ * This routine is called to handle zeroing any space in the last
+ * block of the file that is beyond the EOF.  We do this since the
+ * size is being increased without writing anything to that block
+ * and we don't want anyone to read the garbage on the disk.
+ */
+STATIC int                             /* error (positive) */
+xfs_zero_last_block(
+       xfs_inode_t     *ip,
+       xfs_fsize_t     offset,
+       xfs_fsize_t     isize)
+{
+       xfs_fileoff_t   last_fsb;
+       xfs_mount_t     *mp = ip->i_mount;
+       int             nimaps;
+       int             zero_offset;
+       int             zero_len;
+       int             error = 0;
+       xfs_bmbt_irec_t imap;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+       zero_offset = XFS_B_FSB_OFFSET(mp, isize);
+       if (zero_offset == 0) {
+               /*
+                * There are no extra bytes in the last block on disk to
+                * zero, so return.
+                */
+               return 0;
+       }
+
+       last_fsb = XFS_B_TO_FSBT(mp, isize);
+       nimaps = 1;
+       error = xfs_bmapi(NULL, ip, last_fsb, 1, 0, NULL, 0, &imap,
+                         &nimaps, NULL);
+       if (error) {
+               return error;
+       }
+       ASSERT(nimaps > 0);
+       /*
+        * If the block underlying isize is just a hole, then there
+        * is nothing to zero.
+        */
+       if (imap.br_startblock == HOLESTARTBLOCK) {
+               return 0;
+       }
+       /*
+        * Zero the part of the last block beyond the EOF, and write it
+        * out sync.  We need to drop the ilock while we do this so we
+        * don't deadlock when the buffer cache calls back to us.
+        */
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       zero_len = mp->m_sb.sb_blocksize - zero_offset;
+       if (isize + zero_len > offset)
+               zero_len = offset - isize;
+       error = xfs_iozero(ip, isize, zero_len);
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       ASSERT(error >= 0);
+       return error;
+}
+
+/*
+ * Zero any on disk space between the current EOF and the new,
+ * larger EOF.  This handles the normal case of zeroing the remainder
+ * of the last block in the file and the unusual case of zeroing blocks
+ * out beyond the size of the file.  This second case only happens
+ * with fixed size extents and when the system crashes before the inode
+ * size was updated but after blocks were allocated.  If fill is set,
+ * then any holes in the range are filled and zeroed.  If not, the holes
+ * are left alone as holes.
+ */
+
+int                                    /* error (positive) */
+xfs_zero_eof(
+       xfs_inode_t     *ip,
+       xfs_off_t       offset,         /* starting I/O offset */
+       xfs_fsize_t     isize)          /* current inode size */
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       xfs_fileoff_t   start_zero_fsb;
+       xfs_fileoff_t   end_zero_fsb;
+       xfs_fileoff_t   zero_count_fsb;
+       xfs_fileoff_t   last_fsb;
+       xfs_fileoff_t   zero_off;
+       xfs_fsize_t     zero_len;
+       int             nimaps;
+       int             error = 0;
+       xfs_bmbt_irec_t imap;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+       ASSERT(offset > isize);
+
+       /*
+        * First handle zeroing the block on which isize resides.
+        * We only zero a part of that block so it is handled specially.
+        */
+       error = xfs_zero_last_block(ip, offset, isize);
+       if (error) {
+               ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+               return error;
+       }
+
+       /*
+        * Calculate the range between the new size and the old
+        * where blocks needing to be zeroed may exist.  To get the
+        * block where the last byte in the file currently resides,
+        * we need to subtract one from the size and truncate back
+        * to a block boundary.  We subtract 1 in case the size is
+        * exactly on a block boundary.
+        */
+       last_fsb = isize ? XFS_B_TO_FSBT(mp, isize - 1) : (xfs_fileoff_t)-1;
+       start_zero_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
+       end_zero_fsb = XFS_B_TO_FSBT(mp, offset - 1);
+       ASSERT((xfs_sfiloff_t)last_fsb < (xfs_sfiloff_t)start_zero_fsb);
+       if (last_fsb == end_zero_fsb) {
+               /*
+                * The size was only incremented on its last block.
+                * We took care of that above, so just return.
+                */
+               return 0;
+       }
+
+       ASSERT(start_zero_fsb <= end_zero_fsb);
+       while (start_zero_fsb <= end_zero_fsb) {
+               nimaps = 1;
+               zero_count_fsb = end_zero_fsb - start_zero_fsb + 1;
+               error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb,
+                                 0, NULL, 0, &imap, &nimaps, NULL);
+               if (error) {
+                       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL));
+                       return error;
+               }
+               ASSERT(nimaps > 0);
+
+               if (imap.br_state == XFS_EXT_UNWRITTEN ||
+                   imap.br_startblock == HOLESTARTBLOCK) {
+                       /*
+                        * This loop handles initializing pages that were
+                        * partially initialized by the code below this
+                        * loop. It basically zeroes the part of the page
+                        * that sits on a hole and sets the page as P_HOLE
+                        * and calls remapf if it is a mapped file.
+                        */
+                       start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+                       ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+                       continue;
+               }
+
+               /*
+                * There are blocks we need to zero.
+                * Drop the inode lock while we're doing the I/O.
+                * We'll still have the iolock to protect us.
+                */
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+               zero_off = XFS_FSB_TO_B(mp, start_zero_fsb);
+               zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount);
+
+               if ((zero_off + zero_len) > offset)
+                       zero_len = offset - zero_off;
+
+               error = xfs_iozero(ip, zero_off, zero_len);
+               if (error) {
+                       goto out_lock;
+               }
+
+               start_zero_fsb = imap.br_startoff + imap.br_blockcount;
+               ASSERT(start_zero_fsb <= (end_zero_fsb + 1));
+
+               xfs_ilock(ip, XFS_ILOCK_EXCL);
+       }
+
+       return 0;
+
+out_lock:
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       ASSERT(error >= 0);
+       return error;
+}
+
+/*
+ * Common pre-write limit and setup checks.
+ *
+ * Returns with iolock held according to @iolock.
+ */
+STATIC ssize_t
+xfs_file_aio_write_checks(
+       struct file             *file,
+       loff_t                  *pos,
+       size_t                  *count,
+       int                     *iolock)
+{
+       struct inode            *inode = file->f_mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       xfs_fsize_t             new_size;
+       int                     error = 0;
+
+       error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode));
+       if (error) {
+               xfs_rw_iunlock(ip, XFS_ILOCK_EXCL | *iolock);
+               *iolock = 0;
+               return error;
+       }
+
+       new_size = *pos + *count;
+       if (new_size > ip->i_size)
+               ip->i_new_size = new_size;
+
+       if (likely(!(file->f_mode & FMODE_NOCMTIME)))
+               file_update_time(file);
+
+       /*
+        * If the offset is beyond the size of the file, we need to zero any
+        * blocks that fall between the existing EOF and the start of this
+        * write.
+        */
+       if (*pos > ip->i_size)
+               error = -xfs_zero_eof(ip, *pos, ip->i_size);
+
+       xfs_rw_iunlock(ip, XFS_ILOCK_EXCL);
+       if (error)
+               return error;
+
+       /*
+        * If we're writing the file then make sure to clear the setuid and
+        * setgid bits if the process is not being run by root.  This keeps
+        * people from modifying setuid and setgid binaries.
+        */
+       return file_remove_suid(file);
+
+}
+
+/*
+ * xfs_file_dio_aio_write - handle direct IO writes
+ *
+ * Lock the inode appropriately to prepare for and issue a direct IO write.
+ * By separating it from the buffered write path we remove all the tricky to
+ * follow locking changes and looping.
+ *
+ * If there are cached pages or we're extending the file, we need IOLOCK_EXCL
+ * until we're sure the bytes at the new EOF have been zeroed and/or the cached
+ * pages are flushed out.
+ *
+ * In most cases the direct IO writes will be done holding IOLOCK_SHARED
+ * allowing them to be done in parallel with reads and other direct IO writes.
+ * However, if the IO is not aligned to filesystem blocks, the direct IO layer
+ * needs to do sub-block zeroing and that requires serialisation against other
+ * direct IOs to the same block. In this case we need to serialise the
+ * submission of the unaligned IOs so that we don't get racing block zeroing in
+ * the dio layer.  To avoid the problem with aio, we also need to wait for
+ * outstanding IOs to complete so that unwritten extent conversion is completed
+ * before we try to map the overlapping block. This is currently implemented by
+ * hitting it with a big hammer (i.e. xfs_ioend_wait()).
+ *
+ * Returns with locks held indicated by @iolock and errors indicated by
+ * negative return values.
+ */
+STATIC ssize_t
+xfs_file_dio_aio_write(
+       struct kiocb            *iocb,
+       const struct iovec      *iovp,
+       unsigned long           nr_segs,
+       loff_t                  pos,
+       size_t                  ocount,
+       int                     *iolock)
+{
+       struct file             *file = iocb->ki_filp;
+       struct address_space    *mapping = file->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       ssize_t                 ret = 0;
+       size_t                  count = ocount;
+       int                     unaligned_io = 0;
+       struct xfs_buftarg      *target = XFS_IS_REALTIME_INODE(ip) ?
+                                       mp->m_rtdev_targp : mp->m_ddev_targp;
+
+       *iolock = 0;
+       if ((pos & target->bt_smask) || (count & target->bt_smask))
+               return -XFS_ERROR(EINVAL);
+
+       if ((pos & mp->m_blockmask) || ((pos + count) & mp->m_blockmask))
+               unaligned_io = 1;
+
+       if (unaligned_io || mapping->nrpages || pos > ip->i_size)
+               *iolock = XFS_IOLOCK_EXCL;
+       else
+               *iolock = XFS_IOLOCK_SHARED;
+       xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+
+       ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+       if (ret)
+               return ret;
+
+       if (mapping->nrpages) {
+               WARN_ON(*iolock != XFS_IOLOCK_EXCL);
+               ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
+                                                       FI_REMAPF_LOCKED);
+               if (ret)
+                       return ret;
+       }
+
+       /*
+        * If we are doing unaligned IO, wait for all other IO to drain,
+        * otherwise demote the lock if we had to flush cached pages
+        */
+       if (unaligned_io)
+               xfs_ioend_wait(ip);
+       else if (*iolock == XFS_IOLOCK_EXCL) {
+               xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
+               *iolock = XFS_IOLOCK_SHARED;
+       }
+
+       trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0);
+       ret = generic_file_direct_write(iocb, iovp,
+                       &nr_segs, pos, &iocb->ki_pos, count, ocount);
+
+       /* No fallback to buffered IO on errors for XFS. */
+       ASSERT(ret < 0 || ret == count);
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_buffered_aio_write(
+       struct kiocb            *iocb,
+       const struct iovec      *iovp,
+       unsigned long           nr_segs,
+       loff_t                  pos,
+       size_t                  ocount,
+       int                     *iolock)
+{
+       struct file             *file = iocb->ki_filp;
+       struct address_space    *mapping = file->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       ssize_t                 ret;
+       int                     enospc = 0;
+       size_t                  count = ocount;
+
+       *iolock = XFS_IOLOCK_EXCL;
+       xfs_rw_ilock(ip, XFS_ILOCK_EXCL | *iolock);
+
+       ret = xfs_file_aio_write_checks(file, &pos, &count, iolock);
+       if (ret)
+               return ret;
+
+       /* We can write back this queue in page reclaim */
+       current->backing_dev_info = mapping->backing_dev_info;
+
+write_retry:
+       trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
+       ret = generic_file_buffered_write(iocb, iovp, nr_segs,
+                       pos, &iocb->ki_pos, count, ret);
+       /*
+        * if we just got an ENOSPC, flush the inode now we aren't holding any
+        * page locks and retry *once*
+        */
+       if (ret == -ENOSPC && !enospc) {
+               ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
+               if (ret)
+                       return ret;
+               enospc = 1;
+               goto write_retry;
+       }
+       current->backing_dev_info = NULL;
+       return ret;
+}
+
+STATIC ssize_t
+xfs_file_aio_write(
+       struct kiocb            *iocb,
+       const struct iovec      *iovp,
+       unsigned long           nr_segs,
+       loff_t                  pos)
+{
+       struct file             *file = iocb->ki_filp;
+       struct address_space    *mapping = file->f_mapping;
+       struct inode            *inode = mapping->host;
+       struct xfs_inode        *ip = XFS_I(inode);
+       ssize_t                 ret;
+       int                     iolock;
+       size_t                  ocount = 0;
+
+       XFS_STATS_INC(xs_write_calls);
+
+       BUG_ON(iocb->ki_pos != pos);
+
+       ret = generic_segment_checks(iovp, &nr_segs, &ocount, VERIFY_READ);
+       if (ret)
+               return ret;
+
+       if (ocount == 0)
+               return 0;
+
+       xfs_wait_for_freeze(ip->i_mount, SB_FREEZE_WRITE);
+
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return -EIO;
+
+       if (unlikely(file->f_flags & O_DIRECT))
+               ret = xfs_file_dio_aio_write(iocb, iovp, nr_segs, pos,
+                                               ocount, &iolock);
+       else
+               ret = xfs_file_buffered_aio_write(iocb, iovp, nr_segs, pos,
+                                               ocount, &iolock);
+
+       xfs_aio_write_isize_update(inode, &iocb->ki_pos, ret);
+
+       if (ret <= 0)
+               goto out_unlock;
+
+       /* Handle various SYNC-type writes */
+       if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
+               loff_t end = pos + ret - 1;
+               int error;
+
+               xfs_rw_iunlock(ip, iolock);
+               error = xfs_file_fsync(file, pos, end,
+                                     (file->f_flags & __O_SYNC) ? 0 : 1);
+               xfs_rw_ilock(ip, iolock);
+               if (error)
+                       ret = error;
+       }
+
+out_unlock:
+       xfs_aio_write_newsize_update(ip);
+       xfs_rw_iunlock(ip, iolock);
+       return ret;
+}
+
+STATIC long
+xfs_file_fallocate(
+       struct file     *file,
+       int             mode,
+       loff_t          offset,
+       loff_t          len)
+{
+       struct inode    *inode = file->f_path.dentry->d_inode;
+       long            error;
+       loff_t          new_size = 0;
+       xfs_flock64_t   bf;
+       xfs_inode_t     *ip = XFS_I(inode);
+       int             cmd = XFS_IOC_RESVSP;
+       int             attr_flags = XFS_ATTR_NOLOCK;
+
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+               return -EOPNOTSUPP;
+
+       bf.l_whence = 0;
+       bf.l_start = offset;
+       bf.l_len = len;
+
+       xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+       if (mode & FALLOC_FL_PUNCH_HOLE)
+               cmd = XFS_IOC_UNRESVSP;
+
+       /* check the new inode size is valid before allocating */
+       if (!(mode & FALLOC_FL_KEEP_SIZE) &&
+           offset + len > i_size_read(inode)) {
+               new_size = offset + len;
+               error = inode_newsize_ok(inode, new_size);
+               if (error)
+                       goto out_unlock;
+       }
+
+       if (file->f_flags & O_DSYNC)
+               attr_flags |= XFS_ATTR_SYNC;
+
+       error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags);
+       if (error)
+               goto out_unlock;
+
+       /* Change file size if needed */
+       if (new_size) {
+               struct iattr iattr;
+
+               iattr.ia_valid = ATTR_SIZE;
+               iattr.ia_size = new_size;
+               error = -xfs_setattr_size(ip, &iattr, XFS_ATTR_NOLOCK);
+       }
+
+out_unlock:
+       xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return error;
+}
+
+
+STATIC int
+xfs_file_open(
+       struct inode    *inode,
+       struct file     *file)
+{
+       if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+               return -EFBIG;
+       if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+               return -EIO;
+       return 0;
+}
+
+STATIC int
+xfs_dir_open(
+       struct inode    *inode,
+       struct file     *file)
+{
+       struct xfs_inode *ip = XFS_I(inode);
+       int             mode;
+       int             error;
+
+       error = xfs_file_open(inode, file);
+       if (error)
+               return error;
+
+       /*
+        * If there are any blocks, read-ahead block 0 as we're almost
+        * certain to have the next operation be a read there.
+        */
+       mode = xfs_ilock_map_shared(ip);
+       if (ip->i_d.di_nextents > 0)
+               xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+       xfs_iunlock(ip, mode);
+       return 0;
+}
+
+STATIC int
+xfs_file_release(
+       struct inode    *inode,
+       struct file     *filp)
+{
+       return -xfs_release(XFS_I(inode));
+}
+
+STATIC int
+xfs_file_readdir(
+       struct file     *filp,
+       void            *dirent,
+       filldir_t       filldir)
+{
+       struct inode    *inode = filp->f_path.dentry->d_inode;
+       xfs_inode_t     *ip = XFS_I(inode);
+       int             error;
+       size_t          bufsize;
+
+       /*
+        * The Linux API doesn't pass down the total size of the buffer
+        * we read into down to the filesystem.  With the filldir concept
+        * it's not needed for correct information, but the XFS dir2 leaf
+        * code wants an estimate of the buffer size to calculate it's
+        * readahead window and size the buffers used for mapping to
+        * physical blocks.
+        *
+        * Try to give it an estimate that's good enough, maybe at some
+        * point we can change the ->readdir prototype to include the
+        * buffer size.  For now we use the current glibc buffer size.
+        */
+       bufsize = (size_t)min_t(loff_t, 32768, ip->i_d.di_size);
+
+       error = xfs_readdir(ip, dirent, bufsize,
+                               (xfs_off_t *)&filp->f_pos, filldir);
+       if (error)
+               return -error;
+       return 0;
+}
+
+STATIC int
+xfs_file_mmap(
+       struct file     *filp,
+       struct vm_area_struct *vma)
+{
+       vma->vm_ops = &xfs_file_vm_ops;
+       vma->vm_flags |= VM_CAN_NONLINEAR;
+
+       file_accessed(filp);
+       return 0;
+}
+
+/*
+ * mmap()d file has taken write protection fault and is being made
+ * writable. We can set the page state up correctly for a writable
+ * page, which means we can do correct delalloc accounting (ENOSPC
+ * checking!) and unwritten extent mapping.
+ */
+STATIC int
+xfs_vm_page_mkwrite(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+{
+       return block_page_mkwrite(vma, vmf, xfs_get_blocks);
+}
+
+const struct file_operations xfs_file_operations = {
+       .llseek         = generic_file_llseek,
+       .read           = do_sync_read,
+       .write          = do_sync_write,
+       .aio_read       = xfs_file_aio_read,
+       .aio_write      = xfs_file_aio_write,
+       .splice_read    = xfs_file_splice_read,
+       .splice_write   = xfs_file_splice_write,
+       .unlocked_ioctl = xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = xfs_file_compat_ioctl,
+#endif
+       .mmap           = xfs_file_mmap,
+       .open           = xfs_file_open,
+       .release        = xfs_file_release,
+       .fsync          = xfs_file_fsync,
+       .fallocate      = xfs_file_fallocate,
+};
+
+const struct file_operations xfs_dir_file_operations = {
+       .open           = xfs_dir_open,
+       .read           = generic_read_dir,
+       .readdir        = xfs_file_readdir,
+       .llseek         = generic_file_llseek,
+       .unlocked_ioctl = xfs_file_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = xfs_file_compat_ioctl,
+#endif
+       .fsync          = xfs_file_fsync,
+};
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+       .fault          = filemap_fault,
+       .page_mkwrite   = xfs_vm_page_mkwrite,
+};
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
new file mode 100644 (file)
index 0000000..ed88ed1
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_vnodeops.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trace.h"
+
+/*
+ * note: all filemap functions return negative error codes. These
+ * need to be inverted before returning to the xfs core functions.
+ */
+void
+xfs_tosspages(
+       xfs_inode_t     *ip,
+       xfs_off_t       first,
+       xfs_off_t       last,
+       int             fiopt)
+{
+       /* can't toss partial tail pages, so mask them out */
+       last &= ~(PAGE_SIZE - 1);
+       truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
+}
+
+int
+xfs_flushinval_pages(
+       xfs_inode_t     *ip,
+       xfs_off_t       first,
+       xfs_off_t       last,
+       int             fiopt)
+{
+       struct address_space *mapping = VFS_I(ip)->i_mapping;
+       int             ret = 0;
+
+       trace_xfs_pagecache_inval(ip, first, last);
+
+       xfs_iflags_clear(ip, XFS_ITRUNCATED);
+       ret = filemap_write_and_wait_range(mapping, first,
+                               last == -1 ? LLONG_MAX : last);
+       if (!ret)
+               truncate_inode_pages_range(mapping, first, last);
+       return -ret;
+}
+
+int
+xfs_flush_pages(
+       xfs_inode_t     *ip,
+       xfs_off_t       first,
+       xfs_off_t       last,
+       uint64_t        flags,
+       int             fiopt)
+{
+       struct address_space *mapping = VFS_I(ip)->i_mapping;
+       int             ret = 0;
+       int             ret2;
+
+       xfs_iflags_clear(ip, XFS_ITRUNCATED);
+       ret = -filemap_fdatawrite_range(mapping, first,
+                               last == -1 ? LLONG_MAX : last);
+       if (flags & XBF_ASYNC)
+               return ret;
+       ret2 = xfs_wait_on_pages(ip, first, last);
+       if (!ret)
+               ret = ret2;
+       return ret;
+}
+
+int
+xfs_wait_on_pages(
+       xfs_inode_t     *ip,
+       xfs_off_t       first,
+       xfs_off_t       last)
+{
+       struct address_space *mapping = VFS_I(ip)->i_mapping;
+
+       if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
+               return -filemap_fdatawait_range(mapping, first,
+                                       last == -1 ? ip->i_size - 1 : last);
+       }
+       return 0;
+}
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
new file mode 100644 (file)
index 0000000..76e81cf
--- /dev/null
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_sysctl.h"
+
+/*
+ * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
+ * other XFS code uses these values.  Times are measured in centisecs (i.e.
+ * 100ths of a second).
+ */
+xfs_param_t xfs_params = {
+                         /*    MIN             DFLT            MAX     */
+       .sgid_inherit   = {     0,              0,              1       },
+       .symlink_mode   = {     0,              0,              1       },
+       .panic_mask     = {     0,              0,              255     },
+       .error_level    = {     0,              3,              11      },
+       .syncd_timer    = {     1*100,          30*100,         7200*100},
+       .stats_clear    = {     0,              0,              1       },
+       .inherit_sync   = {     0,              1,              1       },
+       .inherit_nodump = {     0,              1,              1       },
+       .inherit_noatim = {     0,              1,              1       },
+       .xfs_buf_timer  = {     100/2,          1*100,          30*100  },
+       .xfs_buf_age    = {     1*100,          15*100,         7200*100},
+       .inherit_nosym  = {     0,              0,              1       },
+       .rotorstep      = {     1,              1,              255     },
+       .inherit_nodfrg = {     0,              1,              1       },
+       .fstrm_timer    = {     1,              30*100,         3600*100},
+};
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
new file mode 100644 (file)
index 0000000..f7ce7de
--- /dev/null
@@ -0,0 +1,1556 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_ioctl.h"
+#include "xfs_rtalloc.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_bmap.h"
+#include "xfs_buf_item.h"
+#include "xfs_utils.h"
+#include "xfs_dfrag.h"
+#include "xfs_fsops.h"
+#include "xfs_vnodeops.h"
+#include "xfs_discard.h"
+#include "xfs_quota.h"
+#include "xfs_inode_item.h"
+#include "xfs_export.h"
+#include "xfs_trace.h"
+
+#include <linux/capability.h>
+#include <linux/dcache.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/exportfs.h>
+
+/*
+ * xfs_find_handle maps from userspace xfs_fsop_handlereq structure to
+ * a file or fs handle.
+ *
+ * XFS_IOC_PATH_TO_FSHANDLE
+ *    returns fs handle for a mount point or path within that mount point
+ * XFS_IOC_FD_TO_HANDLE
+ *    returns full handle for a FD opened in user space
+ * XFS_IOC_PATH_TO_HANDLE
+ *    returns full handle for a path
+ */
+int
+xfs_find_handle(
+       unsigned int            cmd,
+       xfs_fsop_handlereq_t    *hreq)
+{
+       int                     hsize;
+       xfs_handle_t            handle;
+       struct inode            *inode;
+       struct file             *file = NULL;
+       struct path             path;
+       int                     error;
+       struct xfs_inode        *ip;
+
+       if (cmd == XFS_IOC_FD_TO_HANDLE) {
+               file = fget(hreq->fd);
+               if (!file)
+                       return -EBADF;
+               inode = file->f_path.dentry->d_inode;
+       } else {
+               error = user_lpath((const char __user *)hreq->path, &path);
+               if (error)
+                       return error;
+               inode = path.dentry->d_inode;
+       }
+       ip = XFS_I(inode);
+
+       /*
+        * We can only generate handles for inodes residing on a XFS filesystem,
+        * and only for regular files, directories or symbolic links.
+        */
+       error = -EINVAL;
+       if (inode->i_sb->s_magic != XFS_SB_MAGIC)
+               goto out_put;
+
+       error = -EBADF;
+       if (!S_ISREG(inode->i_mode) &&
+           !S_ISDIR(inode->i_mode) &&
+           !S_ISLNK(inode->i_mode))
+               goto out_put;
+
+
+       memcpy(&handle.ha_fsid, ip->i_mount->m_fixedfsid, sizeof(xfs_fsid_t));
+
+       if (cmd == XFS_IOC_PATH_TO_FSHANDLE) {
+               /*
+                * This handle only contains an fsid, zero the rest.
+                */
+               memset(&handle.ha_fid, 0, sizeof(handle.ha_fid));
+               hsize = sizeof(xfs_fsid_t);
+       } else {
+               int             lock_mode;
+
+               lock_mode = xfs_ilock_map_shared(ip);
+               handle.ha_fid.fid_len = sizeof(xfs_fid_t) -
+                                       sizeof(handle.ha_fid.fid_len);
+               handle.ha_fid.fid_pad = 0;
+               handle.ha_fid.fid_gen = ip->i_d.di_gen;
+               handle.ha_fid.fid_ino = ip->i_ino;
+               xfs_iunlock_map_shared(ip, lock_mode);
+
+               hsize = XFS_HSIZE(handle);
+       }
+
+       error = -EFAULT;
+       if (copy_to_user(hreq->ohandle, &handle, hsize) ||
+           copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32)))
+               goto out_put;
+
+       error = 0;
+
+ out_put:
+       if (cmd == XFS_IOC_FD_TO_HANDLE)
+               fput(file);
+       else
+               path_put(&path);
+       return error;
+}
+
+/*
+ * No need to do permission checks on the various pathname components
+ * as the handle operations are privileged.
+ */
+STATIC int
+xfs_handle_acceptable(
+       void                    *context,
+       struct dentry           *dentry)
+{
+       return 1;
+}
+
+/*
+ * Convert userspace handle data into a dentry.
+ */
+struct dentry *
+xfs_handle_to_dentry(
+       struct file             *parfilp,
+       void __user             *uhandle,
+       u32                     hlen)
+{
+       xfs_handle_t            handle;
+       struct xfs_fid64        fid;
+
+       /*
+        * Only allow handle opens under a directory.
+        */
+       if (!S_ISDIR(parfilp->f_path.dentry->d_inode->i_mode))
+               return ERR_PTR(-ENOTDIR);
+
+       if (hlen != sizeof(xfs_handle_t))
+               return ERR_PTR(-EINVAL);
+       if (copy_from_user(&handle, uhandle, hlen))
+               return ERR_PTR(-EFAULT);
+       if (handle.ha_fid.fid_len !=
+           sizeof(handle.ha_fid) - sizeof(handle.ha_fid.fid_len))
+               return ERR_PTR(-EINVAL);
+
+       memset(&fid, 0, sizeof(struct fid));
+       fid.ino = handle.ha_fid.fid_ino;
+       fid.gen = handle.ha_fid.fid_gen;
+
+       return exportfs_decode_fh(parfilp->f_path.mnt, (struct fid *)&fid, 3,
+                       FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG,
+                       xfs_handle_acceptable, NULL);
+}
+
+STATIC struct dentry *
+xfs_handlereq_to_dentry(
+       struct file             *parfilp,
+       xfs_fsop_handlereq_t    *hreq)
+{
+       return xfs_handle_to_dentry(parfilp, hreq->ihandle, hreq->ihandlen);
+}
+
+int
+xfs_open_by_handle(
+       struct file             *parfilp,
+       xfs_fsop_handlereq_t    *hreq)
+{
+       const struct cred       *cred = current_cred();
+       int                     error;
+       int                     fd;
+       int                     permflag;
+       struct file             *filp;
+       struct inode            *inode;
+       struct dentry           *dentry;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+
+       dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+       inode = dentry->d_inode;
+
+       /* Restrict xfs_open_by_handle to directories & regular files. */
+       if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
+               error = -XFS_ERROR(EPERM);
+               goto out_dput;
+       }
+
+#if BITS_PER_LONG != 32
+       hreq->oflags |= O_LARGEFILE;
+#endif
+
+       /* Put open permission in namei format. */
+       permflag = hreq->oflags;
+       if ((permflag+1) & O_ACCMODE)
+               permflag++;
+       if (permflag & O_TRUNC)
+               permflag |= 2;
+
+       if ((!(permflag & O_APPEND) || (permflag & O_TRUNC)) &&
+           (permflag & FMODE_WRITE) && IS_APPEND(inode)) {
+               error = -XFS_ERROR(EPERM);
+               goto out_dput;
+       }
+
+       if ((permflag & FMODE_WRITE) && IS_IMMUTABLE(inode)) {
+               error = -XFS_ERROR(EACCES);
+               goto out_dput;
+       }
+
+       /* Can't write directories. */
+       if (S_ISDIR(inode->i_mode) && (permflag & FMODE_WRITE)) {
+               error = -XFS_ERROR(EISDIR);
+               goto out_dput;
+       }
+
+       fd = get_unused_fd();
+       if (fd < 0) {
+               error = fd;
+               goto out_dput;
+       }
+
+       filp = dentry_open(dentry, mntget(parfilp->f_path.mnt),
+                          hreq->oflags, cred);
+       if (IS_ERR(filp)) {
+               put_unused_fd(fd);
+               return PTR_ERR(filp);
+       }
+
+       if (S_ISREG(inode->i_mode)) {
+               filp->f_flags |= O_NOATIME;
+               filp->f_mode |= FMODE_NOCMTIME;
+       }
+
+       fd_install(fd, filp);
+       return fd;
+
+ out_dput:
+       dput(dentry);
+       return error;
+}
+
+/*
+ * This is a copy from fs/namei.c:vfs_readlink(), except for removing it's
+ * unused first argument.
+ */
+STATIC int
+do_readlink(
+       char __user             *buffer,
+       int                     buflen,
+       const char              *link)
+{
+        int len;
+
+       len = PTR_ERR(link);
+       if (IS_ERR(link))
+               goto out;
+
+       len = strlen(link);
+       if (len > (unsigned) buflen)
+               len = buflen;
+       if (copy_to_user(buffer, link, len))
+               len = -EFAULT;
+ out:
+       return len;
+}
+
+
+int
+xfs_readlink_by_handle(
+       struct file             *parfilp,
+       xfs_fsop_handlereq_t    *hreq)
+{
+       struct dentry           *dentry;
+       __u32                   olen;
+       void                    *link;
+       int                     error;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+
+       dentry = xfs_handlereq_to_dentry(parfilp, hreq);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+
+       /* Restrict this handle operation to symlinks only. */
+       if (!S_ISLNK(dentry->d_inode->i_mode)) {
+               error = -XFS_ERROR(EINVAL);
+               goto out_dput;
+       }
+
+       if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
+               error = -XFS_ERROR(EFAULT);
+               goto out_dput;
+       }
+
+       link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+       if (!link) {
+               error = -XFS_ERROR(ENOMEM);
+               goto out_dput;
+       }
+
+       error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+       if (error)
+               goto out_kfree;
+       error = do_readlink(hreq->ohandle, olen, link);
+       if (error)
+               goto out_kfree;
+
+ out_kfree:
+       kfree(link);
+ out_dput:
+       dput(dentry);
+       return error;
+}
+
+STATIC int
+xfs_fssetdm_by_handle(
+       struct file             *parfilp,
+       void                    __user *arg)
+{
+       int                     error;
+       struct fsdmidata        fsd;
+       xfs_fsop_setdm_handlereq_t dmhreq;
+       struct dentry           *dentry;
+
+       if (!capable(CAP_MKNOD))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&dmhreq, arg, sizeof(xfs_fsop_setdm_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+
+       dentry = xfs_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+
+       if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
+               error = -XFS_ERROR(EPERM);
+               goto out;
+       }
+
+       if (copy_from_user(&fsd, dmhreq.data, sizeof(fsd))) {
+               error = -XFS_ERROR(EFAULT);
+               goto out;
+       }
+
+       error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+                                fsd.fsd_dmstate);
+
+ out:
+       dput(dentry);
+       return error;
+}
+
+STATIC int
+xfs_attrlist_by_handle(
+       struct file             *parfilp,
+       void                    __user *arg)
+{
+       int                     error = -ENOMEM;
+       attrlist_cursor_kern_t  *cursor;
+       xfs_fsop_attrlist_handlereq_t al_hreq;
+       struct dentry           *dentry;
+       char                    *kbuf;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&al_hreq, arg, sizeof(xfs_fsop_attrlist_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+       if (al_hreq.buflen > XATTR_LIST_MAX)
+               return -XFS_ERROR(EINVAL);
+
+       /*
+        * Reject flags, only allow namespaces.
+        */
+       if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+               return -XFS_ERROR(EINVAL);
+
+       dentry = xfs_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+
+       kbuf = kzalloc(al_hreq.buflen, GFP_KERNEL);
+       if (!kbuf)
+               goto out_dput;
+
+       cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+       error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+                                       al_hreq.flags, cursor);
+       if (error)
+               goto out_kfree;
+
+       if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
+               error = -EFAULT;
+
+ out_kfree:
+       kfree(kbuf);
+ out_dput:
+       dput(dentry);
+       return error;
+}
+
+int
+xfs_attrmulti_attr_get(
+       struct inode            *inode,
+       unsigned char           *name,
+       unsigned char           __user *ubuf,
+       __uint32_t              *len,
+       __uint32_t              flags)
+{
+       unsigned char           *kbuf;
+       int                     error = EFAULT;
+
+       if (*len > XATTR_SIZE_MAX)
+               return EINVAL;
+       kbuf = kmalloc(*len, GFP_KERNEL);
+       if (!kbuf)
+               return ENOMEM;
+
+       error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
+       if (error)
+               goto out_kfree;
+
+       if (copy_to_user(ubuf, kbuf, *len))
+               error = EFAULT;
+
+ out_kfree:
+       kfree(kbuf);
+       return error;
+}
+
+int
+xfs_attrmulti_attr_set(
+       struct inode            *inode,
+       unsigned char           *name,
+       const unsigned char     __user *ubuf,
+       __uint32_t              len,
+       __uint32_t              flags)
+{
+       unsigned char           *kbuf;
+       int                     error = EFAULT;
+
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+               return EPERM;
+       if (len > XATTR_SIZE_MAX)
+               return EINVAL;
+
+       kbuf = memdup_user(ubuf, len);
+       if (IS_ERR(kbuf))
+               return PTR_ERR(kbuf);
+
+       error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags);
+
+       return error;
+}
+
+int
+xfs_attrmulti_attr_remove(
+       struct inode            *inode,
+       unsigned char           *name,
+       __uint32_t              flags)
+{
+       if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+               return EPERM;
+       return xfs_attr_remove(XFS_I(inode), name, flags);
+}
+
+STATIC int
+xfs_attrmulti_by_handle(
+       struct file             *parfilp,
+       void                    __user *arg)
+{
+       int                     error;
+       xfs_attr_multiop_t      *ops;
+       xfs_fsop_attrmulti_handlereq_t am_hreq;
+       struct dentry           *dentry;
+       unsigned int            i, size;
+       unsigned char           *attr_name;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&am_hreq, arg, sizeof(xfs_fsop_attrmulti_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+
+       /* overflow check */
+       if (am_hreq.opcount >= INT_MAX / sizeof(xfs_attr_multiop_t))
+               return -E2BIG;
+
+       dentry = xfs_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+
+       error = E2BIG;
+       size = am_hreq.opcount * sizeof(xfs_attr_multiop_t);
+       if (!size || size > 16 * PAGE_SIZE)
+               goto out_dput;
+
+       ops = memdup_user(am_hreq.ops, size);
+       if (IS_ERR(ops)) {
+               error = PTR_ERR(ops);
+               goto out_dput;
+       }
+
+       attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
+       if (!attr_name)
+               goto out_kfree_ops;
+
+       error = 0;
+       for (i = 0; i < am_hreq.opcount; i++) {
+               ops[i].am_error = strncpy_from_user((char *)attr_name,
+                               ops[i].am_attrname, MAXNAMELEN);
+               if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
+                       error = -ERANGE;
+               if (ops[i].am_error < 0)
+                       break;
+
+               switch (ops[i].am_opcode) {
+               case ATTR_OP_GET:
+                       ops[i].am_error = xfs_attrmulti_attr_get(
+                                       dentry->d_inode, attr_name,
+                                       ops[i].am_attrvalue, &ops[i].am_length,
+                                       ops[i].am_flags);
+                       break;
+               case ATTR_OP_SET:
+                       ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+                       if (ops[i].am_error)
+                               break;
+                       ops[i].am_error = xfs_attrmulti_attr_set(
+                                       dentry->d_inode, attr_name,
+                                       ops[i].am_attrvalue, ops[i].am_length,
+                                       ops[i].am_flags);
+                       mnt_drop_write(parfilp->f_path.mnt);
+                       break;
+               case ATTR_OP_REMOVE:
+                       ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+                       if (ops[i].am_error)
+                               break;
+                       ops[i].am_error = xfs_attrmulti_attr_remove(
+                                       dentry->d_inode, attr_name,
+                                       ops[i].am_flags);
+                       mnt_drop_write(parfilp->f_path.mnt);
+                       break;
+               default:
+                       ops[i].am_error = EINVAL;
+               }
+       }
+
+       if (copy_to_user(am_hreq.ops, ops, size))
+               error = XFS_ERROR(EFAULT);
+
+       kfree(attr_name);
+ out_kfree_ops:
+       kfree(ops);
+ out_dput:
+       dput(dentry);
+       return -error;
+}
+
+int
+xfs_ioc_space(
+       struct xfs_inode        *ip,
+       struct inode            *inode,
+       struct file             *filp,
+       int                     ioflags,
+       unsigned int            cmd,
+       xfs_flock64_t           *bf)
+{
+       int                     attr_flags = 0;
+       int                     error;
+
+       /*
+        * Only allow the sys admin to reserve space unless
+        * unwritten extents are enabled.
+        */
+       if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
+           !capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+
+       if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
+               return -XFS_ERROR(EPERM);
+
+       if (!(filp->f_mode & FMODE_WRITE))
+               return -XFS_ERROR(EBADF);
+
+       if (!S_ISREG(inode->i_mode))
+               return -XFS_ERROR(EINVAL);
+
+       if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+               attr_flags |= XFS_ATTR_NONBLOCK;
+
+       if (filp->f_flags & O_DSYNC)
+               attr_flags |= XFS_ATTR_SYNC;
+
+       if (ioflags & IO_INVIS)
+               attr_flags |= XFS_ATTR_DMI;
+
+       error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
+       return -error;
+}
+
+STATIC int
+xfs_ioc_bulkstat(
+       xfs_mount_t             *mp,
+       unsigned int            cmd,
+       void                    __user *arg)
+{
+       xfs_fsop_bulkreq_t      bulkreq;
+       int                     count;  /* # of records returned */
+       xfs_ino_t               inlast; /* last inode number */
+       int                     done;
+       int                     error;
+
+       /* done = 1 if there are more stats to get and if bulkstat */
+       /* should be called again (unused here, but used in dmapi) */
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -XFS_ERROR(EIO);
+
+       if (copy_from_user(&bulkreq, arg, sizeof(xfs_fsop_bulkreq_t)))
+               return -XFS_ERROR(EFAULT);
+
+       if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+               return -XFS_ERROR(EFAULT);
+
+       if ((count = bulkreq.icount) <= 0)
+               return -XFS_ERROR(EINVAL);
+
+       if (bulkreq.ubuffer == NULL)
+               return -XFS_ERROR(EINVAL);
+
+       if (cmd == XFS_IOC_FSINUMBERS)
+               error = xfs_inumbers(mp, &inlast, &count,
+                                       bulkreq.ubuffer, xfs_inumbers_fmt);
+       else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE)
+               error = xfs_bulkstat_single(mp, &inlast,
+                                               bulkreq.ubuffer, &done);
+       else    /* XFS_IOC_FSBULKSTAT */
+               error = xfs_bulkstat(mp, &inlast, &count, xfs_bulkstat_one,
+                                    sizeof(xfs_bstat_t), bulkreq.ubuffer,
+                                    &done);
+
+       if (error)
+               return -error;
+
+       if (bulkreq.ocount != NULL) {
+               if (copy_to_user(bulkreq.lastip, &inlast,
+                                               sizeof(xfs_ino_t)))
+                       return -XFS_ERROR(EFAULT);
+
+               if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
+                       return -XFS_ERROR(EFAULT);
+       }
+
+       return 0;
+}
+
+STATIC int
+xfs_ioc_fsgeometry_v1(
+       xfs_mount_t             *mp,
+       void                    __user *arg)
+{
+       xfs_fsop_geom_t         fsgeo;
+       int                     error;
+
+       error = xfs_fs_geometry(mp, &fsgeo, 3);
+       if (error)
+               return -error;
+
+       /*
+        * Caller should have passed an argument of type
+        * xfs_fsop_geom_v1_t.  This is a proper subset of the
+        * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
+        */
+       if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+STATIC int
+xfs_ioc_fsgeometry(
+       xfs_mount_t             *mp,
+       void                    __user *arg)
+{
+       xfs_fsop_geom_t         fsgeo;
+       int                     error;
+
+       error = xfs_fs_geometry(mp, &fsgeo, 4);
+       if (error)
+               return -error;
+
+       if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+/*
+ * Linux extended inode flags interface.
+ */
+
+STATIC unsigned int
+xfs_merge_ioc_xflags(
+       unsigned int    flags,
+       unsigned int    start)
+{
+       unsigned int    xflags = start;
+
+       if (flags & FS_IMMUTABLE_FL)
+               xflags |= XFS_XFLAG_IMMUTABLE;
+       else
+               xflags &= ~XFS_XFLAG_IMMUTABLE;
+       if (flags & FS_APPEND_FL)
+               xflags |= XFS_XFLAG_APPEND;
+       else
+               xflags &= ~XFS_XFLAG_APPEND;
+       if (flags & FS_SYNC_FL)
+               xflags |= XFS_XFLAG_SYNC;
+       else
+               xflags &= ~XFS_XFLAG_SYNC;
+       if (flags & FS_NOATIME_FL)
+               xflags |= XFS_XFLAG_NOATIME;
+       else
+               xflags &= ~XFS_XFLAG_NOATIME;
+       if (flags & FS_NODUMP_FL)
+               xflags |= XFS_XFLAG_NODUMP;
+       else
+               xflags &= ~XFS_XFLAG_NODUMP;
+
+       return xflags;
+}
+
+STATIC unsigned int
+xfs_di2lxflags(
+       __uint16_t      di_flags)
+{
+       unsigned int    flags = 0;
+
+       if (di_flags & XFS_DIFLAG_IMMUTABLE)
+               flags |= FS_IMMUTABLE_FL;
+       if (di_flags & XFS_DIFLAG_APPEND)
+               flags |= FS_APPEND_FL;
+       if (di_flags & XFS_DIFLAG_SYNC)
+               flags |= FS_SYNC_FL;
+       if (di_flags & XFS_DIFLAG_NOATIME)
+               flags |= FS_NOATIME_FL;
+       if (di_flags & XFS_DIFLAG_NODUMP)
+               flags |= FS_NODUMP_FL;
+       return flags;
+}
+
+STATIC int
+xfs_ioc_fsgetxattr(
+       xfs_inode_t             *ip,
+       int                     attr,
+       void                    __user *arg)
+{
+       struct fsxattr          fa;
+
+       memset(&fa, 0, sizeof(struct fsxattr));
+
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       fa.fsx_xflags = xfs_ip2xflags(ip);
+       fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+       fa.fsx_projid = xfs_get_projid(ip);
+
+       if (attr) {
+               if (ip->i_afp) {
+                       if (ip->i_afp->if_flags & XFS_IFEXTENTS)
+                               fa.fsx_nextents = ip->i_afp->if_bytes /
+                                                       sizeof(xfs_bmbt_rec_t);
+                       else
+                               fa.fsx_nextents = ip->i_d.di_anextents;
+               } else
+                       fa.fsx_nextents = 0;
+       } else {
+               if (ip->i_df.if_flags & XFS_IFEXTENTS)
+                       fa.fsx_nextents = ip->i_df.if_bytes /
+                                               sizeof(xfs_bmbt_rec_t);
+               else
+                       fa.fsx_nextents = ip->i_d.di_nextents;
+       }
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+       if (copy_to_user(arg, &fa, sizeof(fa)))
+               return -EFAULT;
+       return 0;
+}
+
+STATIC void
+xfs_set_diflags(
+       struct xfs_inode        *ip,
+       unsigned int            xflags)
+{
+       unsigned int            di_flags;
+
+       /* can't set PREALLOC this way, just preserve it */
+       di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
+       if (xflags & XFS_XFLAG_IMMUTABLE)
+               di_flags |= XFS_DIFLAG_IMMUTABLE;
+       if (xflags & XFS_XFLAG_APPEND)
+               di_flags |= XFS_DIFLAG_APPEND;
+       if (xflags & XFS_XFLAG_SYNC)
+               di_flags |= XFS_DIFLAG_SYNC;
+       if (xflags & XFS_XFLAG_NOATIME)
+               di_flags |= XFS_DIFLAG_NOATIME;
+       if (xflags & XFS_XFLAG_NODUMP)
+               di_flags |= XFS_DIFLAG_NODUMP;
+       if (xflags & XFS_XFLAG_PROJINHERIT)
+               di_flags |= XFS_DIFLAG_PROJINHERIT;
+       if (xflags & XFS_XFLAG_NODEFRAG)
+               di_flags |= XFS_DIFLAG_NODEFRAG;
+       if (xflags & XFS_XFLAG_FILESTREAM)
+               di_flags |= XFS_DIFLAG_FILESTREAM;
+       if (S_ISDIR(ip->i_d.di_mode)) {
+               if (xflags & XFS_XFLAG_RTINHERIT)
+                       di_flags |= XFS_DIFLAG_RTINHERIT;
+               if (xflags & XFS_XFLAG_NOSYMLINKS)
+                       di_flags |= XFS_DIFLAG_NOSYMLINKS;
+               if (xflags & XFS_XFLAG_EXTSZINHERIT)
+                       di_flags |= XFS_DIFLAG_EXTSZINHERIT;
+       } else if (S_ISREG(ip->i_d.di_mode)) {
+               if (xflags & XFS_XFLAG_REALTIME)
+                       di_flags |= XFS_DIFLAG_REALTIME;
+               if (xflags & XFS_XFLAG_EXTSIZE)
+                       di_flags |= XFS_DIFLAG_EXTSIZE;
+       }
+
+       ip->i_d.di_flags = di_flags;
+}
+
+STATIC void
+xfs_diflags_to_linux(
+       struct xfs_inode        *ip)
+{
+       struct inode            *inode = VFS_I(ip);
+       unsigned int            xflags = xfs_ip2xflags(ip);
+
+       if (xflags & XFS_XFLAG_IMMUTABLE)
+               inode->i_flags |= S_IMMUTABLE;
+       else
+               inode->i_flags &= ~S_IMMUTABLE;
+       if (xflags & XFS_XFLAG_APPEND)
+               inode->i_flags |= S_APPEND;
+       else
+               inode->i_flags &= ~S_APPEND;
+       if (xflags & XFS_XFLAG_SYNC)
+               inode->i_flags |= S_SYNC;
+       else
+               inode->i_flags &= ~S_SYNC;
+       if (xflags & XFS_XFLAG_NOATIME)
+               inode->i_flags |= S_NOATIME;
+       else
+               inode->i_flags &= ~S_NOATIME;
+}
+
+#define FSX_PROJID     1
+#define FSX_EXTSIZE    2
+#define FSX_XFLAGS     4
+#define FSX_NONBLOCK   8
+
+STATIC int
+xfs_ioctl_setattr(
+       xfs_inode_t             *ip,
+       struct fsxattr          *fa,
+       int                     mask)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp;
+       unsigned int            lock_flags = 0;
+       struct xfs_dquot        *udqp = NULL;
+       struct xfs_dquot        *gdqp = NULL;
+       struct xfs_dquot        *olddquot = NULL;
+       int                     code;
+
+       trace_xfs_ioctl_setattr(ip);
+
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               return XFS_ERROR(EROFS);
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       /*
+        * Disallow 32bit project ids when projid32bit feature is not enabled.
+        */
+       if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
+                       !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+               return XFS_ERROR(EINVAL);
+
+       /*
+        * If disk quotas is on, we make sure that the dquots do exist on disk,
+        * before we start any other transactions. Trying to do this later
+        * is messy. We don't care to take a readlock to look at the ids
+        * in inode here, because we can't hold it across the trans_reserve.
+        * If the IDs do change before we take the ilock, we're covered
+        * because the i_*dquot fields will get updated anyway.
+        */
+       if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
+               code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
+                                        ip->i_d.di_gid, fa->fsx_projid,
+                                        XFS_QMOPT_PQUOTA, &udqp, &gdqp);
+               if (code)
+                       return code;
+       }
+
+       /*
+        * For the other attributes, we acquire the inode lock and
+        * first do an error checking pass.
+        */
+       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+       code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       if (code)
+               goto error_return;
+
+       lock_flags = XFS_ILOCK_EXCL;
+       xfs_ilock(ip, lock_flags);
+
+       /*
+        * CAP_FOWNER overrides the following restrictions:
+        *
+        * The user ID of the calling process must be equal
+        * to the file owner ID, except in cases where the
+        * CAP_FSETID capability is applicable.
+        */
+       if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) {
+               code = XFS_ERROR(EPERM);
+               goto error_return;
+       }
+
+       /*
+        * Do a quota reservation only if projid is actually going to change.
+        */
+       if (mask & FSX_PROJID) {
+               if (XFS_IS_QUOTA_RUNNING(mp) &&
+                   XFS_IS_PQUOTA_ON(mp) &&
+                   xfs_get_projid(ip) != fa->fsx_projid) {
+                       ASSERT(tp);
+                       code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
+                                               capable(CAP_FOWNER) ?
+                                               XFS_QMOPT_FORCE_RES : 0);
+                       if (code)       /* out of quota */
+                               goto error_return;
+               }
+       }
+
+       if (mask & FSX_EXTSIZE) {
+               /*
+                * Can't change extent size if any extents are allocated.
+                */
+               if (ip->i_d.di_nextents &&
+                   ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
+                    fa->fsx_extsize)) {
+                       code = XFS_ERROR(EINVAL);       /* EFBIG? */
+                       goto error_return;
+               }
+
+               /*
+                * Extent size must be a multiple of the appropriate block
+                * size, if set at all. It must also be smaller than the
+                * maximum extent size supported by the filesystem.
+                *
+                * Also, for non-realtime files, limit the extent size hint to
+                * half the size of the AGs in the filesystem so alignment
+                * doesn't result in extents larger than an AG.
+                */
+               if (fa->fsx_extsize != 0) {
+                       xfs_extlen_t    size;
+                       xfs_fsblock_t   extsize_fsb;
+
+                       extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+                       if (extsize_fsb > MAXEXTLEN) {
+                               code = XFS_ERROR(EINVAL);
+                               goto error_return;
+                       }
+
+                       if (XFS_IS_REALTIME_INODE(ip) ||
+                           ((mask & FSX_XFLAGS) &&
+                           (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
+                               size = mp->m_sb.sb_rextsize <<
+                                      mp->m_sb.sb_blocklog;
+                       } else {
+                               size = mp->m_sb.sb_blocksize;
+                               if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
+                                       code = XFS_ERROR(EINVAL);
+                                       goto error_return;
+                               }
+                       }
+
+                       if (fa->fsx_extsize % size) {
+                               code = XFS_ERROR(EINVAL);
+                               goto error_return;
+                       }
+               }
+       }
+
+
+       if (mask & FSX_XFLAGS) {
+               /*
+                * Can't change realtime flag if any extents are allocated.
+                */
+               if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+                   (XFS_IS_REALTIME_INODE(ip)) !=
+                   (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+                       code = XFS_ERROR(EINVAL);       /* EFBIG? */
+                       goto error_return;
+               }
+
+               /*
+                * If realtime flag is set then must have realtime data.
+                */
+               if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+                       if ((mp->m_sb.sb_rblocks == 0) ||
+                           (mp->m_sb.sb_rextsize == 0) ||
+                           (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
+                               code = XFS_ERROR(EINVAL);
+                               goto error_return;
+                       }
+               }
+
+               /*
+                * Can't modify an immutable/append-only file unless
+                * we have appropriate permission.
+                */
+               if ((ip->i_d.di_flags &
+                               (XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
+                    (fa->fsx_xflags &
+                               (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+                   !capable(CAP_LINUX_IMMUTABLE)) {
+                       code = XFS_ERROR(EPERM);
+                       goto error_return;
+               }
+       }
+
+       xfs_trans_ijoin(tp, ip);
+
+       /*
+        * Change file ownership.  Must be the owner or privileged.
+        */
+       if (mask & FSX_PROJID) {
+               /*
+                * CAP_FSETID overrides the following restrictions:
+                *
+                * The set-user-ID and set-group-ID bits of a file will be
+                * cleared upon successful return from chown()
+                */
+               if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+                   !capable(CAP_FSETID))
+                       ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+
+               /*
+                * Change the ownerships and register quota modifications
+                * in the transaction.
+                */
+               if (xfs_get_projid(ip) != fa->fsx_projid) {
+                       if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
+                               olddquot = xfs_qm_vop_chown(tp, ip,
+                                                       &ip->i_gdquot, gdqp);
+                       }
+                       xfs_set_projid(ip, fa->fsx_projid);
+
+                       /*
+                        * We may have to rev the inode as well as
+                        * the superblock version number since projids didn't
+                        * exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
+                        */
+                       if (ip->i_d.di_version == 1)
+                               xfs_bump_ino_vers2(tp, ip);
+               }
+
+       }
+
+       if (mask & FSX_EXTSIZE)
+               ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+       if (mask & FSX_XFLAGS) {
+               xfs_set_diflags(ip, fa->fsx_xflags);
+               xfs_diflags_to_linux(ip);
+       }
+
+       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       XFS_STATS_INC(xs_ig_attrchg);
+
+       /*
+        * If this is a synchronous mount, make sure that the
+        * transaction goes to disk before returning to the user.
+        * This is slightly sub-optimal in that truncates require
+        * two sync transactions instead of one for wsync filesystems.
+        * One for the truncate and one for the timestamps since we
+        * don't want to change the timestamps unless we're sure the
+        * truncate worked.  Truncates are less than 1% of the laddis
+        * mix so this probably isn't worth the trouble to optimize.
+        */
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
+               xfs_trans_set_sync(tp);
+       code = xfs_trans_commit(tp, 0);
+       xfs_iunlock(ip, lock_flags);
+
+       /*
+        * Release any dquot(s) the inode had kept before chown.
+        */
+       xfs_qm_dqrele(olddquot);
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+
+       return code;
+
+ error_return:
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+       xfs_trans_cancel(tp, 0);
+       if (lock_flags)
+               xfs_iunlock(ip, lock_flags);
+       return code;
+}
+
+STATIC int
+xfs_ioc_fssetxattr(
+       xfs_inode_t             *ip,
+       struct file             *filp,
+       void                    __user *arg)
+{
+       struct fsxattr          fa;
+       unsigned int            mask;
+
+       if (copy_from_user(&fa, arg, sizeof(fa)))
+               return -EFAULT;
+
+       mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
+       if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+               mask |= FSX_NONBLOCK;
+
+       return -xfs_ioctl_setattr(ip, &fa, mask);
+}
+
+STATIC int
+xfs_ioc_getxflags(
+       xfs_inode_t             *ip,
+       void                    __user *arg)
+{
+       unsigned int            flags;
+
+       flags = xfs_di2lxflags(ip->i_d.di_flags);
+       if (copy_to_user(arg, &flags, sizeof(flags)))
+               return -EFAULT;
+       return 0;
+}
+
+STATIC int
+xfs_ioc_setxflags(
+       xfs_inode_t             *ip,
+       struct file             *filp,
+       void                    __user *arg)
+{
+       struct fsxattr          fa;
+       unsigned int            flags;
+       unsigned int            mask;
+
+       if (copy_from_user(&flags, arg, sizeof(flags)))
+               return -EFAULT;
+
+       if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+                     FS_NOATIME_FL | FS_NODUMP_FL | \
+                     FS_SYNC_FL))
+               return -EOPNOTSUPP;
+
+       mask = FSX_XFLAGS;
+       if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
+               mask |= FSX_NONBLOCK;
+       fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
+
+       return -xfs_ioctl_setattr(ip, &fa, mask);
+}
+
+STATIC int
+xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
+{
+       struct getbmap __user   *base = *ap;
+
+       /* copy only getbmap portion (not getbmapx) */
+       if (copy_to_user(base, bmv, sizeof(struct getbmap)))
+               return XFS_ERROR(EFAULT);
+
+       *ap += sizeof(struct getbmap);
+       return 0;
+}
+
+STATIC int
+xfs_ioc_getbmap(
+       struct xfs_inode        *ip,
+       int                     ioflags,
+       unsigned int            cmd,
+       void                    __user *arg)
+{
+       struct getbmapx         bmx;
+       int                     error;
+
+       if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
+               return -XFS_ERROR(EFAULT);
+
+       if (bmx.bmv_count < 2)
+               return -XFS_ERROR(EINVAL);
+
+       bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
+       if (ioflags & IO_INVIS)
+               bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
+
+       error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+                           (struct getbmap *)arg+1);
+       if (error)
+               return -error;
+
+       /* copy back header - only size of getbmap */
+       if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+STATIC int
+xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
+{
+       struct getbmapx __user  *base = *ap;
+
+       if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
+               return XFS_ERROR(EFAULT);
+
+       *ap += sizeof(struct getbmapx);
+       return 0;
+}
+
+STATIC int
+xfs_ioc_getbmapx(
+       struct xfs_inode        *ip,
+       void                    __user *arg)
+{
+       struct getbmapx         bmx;
+       int                     error;
+
+       if (copy_from_user(&bmx, arg, sizeof(bmx)))
+               return -XFS_ERROR(EFAULT);
+
+       if (bmx.bmv_count < 2)
+               return -XFS_ERROR(EINVAL);
+
+       if (bmx.bmv_iflags & (~BMV_IF_VALID))
+               return -XFS_ERROR(EINVAL);
+
+       error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
+                           (struct getbmapx *)arg+1);
+       if (error)
+               return -error;
+
+       /* copy back header */
+       if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
+               return -XFS_ERROR(EFAULT);
+
+       return 0;
+}
+
+/*
+ * Note: some of the ioctl's return positive numbers as a
+ * byte count indicating success, such as readlink_by_handle.
+ * So we don't "sign flip" like most other routines.  This means
+ * true errors need to be returned as a negative value.
+ */
+long
+xfs_file_ioctl(
+       struct file             *filp,
+       unsigned int            cmd,
+       unsigned long           p)
+{
+       struct inode            *inode = filp->f_path.dentry->d_inode;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       void                    __user *arg = (void __user *)p;
+       int                     ioflags = 0;
+       int                     error;
+
+       if (filp->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
+
+       trace_xfs_file_ioctl(ip);
+
+       switch (cmd) {
+       case FITRIM:
+               return xfs_ioc_trim(mp, arg);
+       case XFS_IOC_ALLOCSP:
+       case XFS_IOC_FREESP:
+       case XFS_IOC_RESVSP:
+       case XFS_IOC_UNRESVSP:
+       case XFS_IOC_ALLOCSP64:
+       case XFS_IOC_FREESP64:
+       case XFS_IOC_RESVSP64:
+       case XFS_IOC_UNRESVSP64:
+       case XFS_IOC_ZERO_RANGE: {
+               xfs_flock64_t           bf;
+
+               if (copy_from_user(&bf, arg, sizeof(bf)))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+       }
+       case XFS_IOC_DIOINFO: {
+               struct dioattr  da;
+               xfs_buftarg_t   *target =
+                       XFS_IS_REALTIME_INODE(ip) ?
+                       mp->m_rtdev_targp : mp->m_ddev_targp;
+
+               da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
+               da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
+
+               if (copy_to_user(arg, &da, sizeof(da)))
+                       return -XFS_ERROR(EFAULT);
+               return 0;
+       }
+
+       case XFS_IOC_FSBULKSTAT_SINGLE:
+       case XFS_IOC_FSBULKSTAT:
+       case XFS_IOC_FSINUMBERS:
+               return xfs_ioc_bulkstat(mp, cmd, arg);
+
+       case XFS_IOC_FSGEOMETRY_V1:
+               return xfs_ioc_fsgeometry_v1(mp, arg);
+
+       case XFS_IOC_FSGEOMETRY:
+               return xfs_ioc_fsgeometry(mp, arg);
+
+       case XFS_IOC_GETVERSION:
+               return put_user(inode->i_generation, (int __user *)arg);
+
+       case XFS_IOC_FSGETXATTR:
+               return xfs_ioc_fsgetxattr(ip, 0, arg);
+       case XFS_IOC_FSGETXATTRA:
+               return xfs_ioc_fsgetxattr(ip, 1, arg);
+       case XFS_IOC_FSSETXATTR:
+               return xfs_ioc_fssetxattr(ip, filp, arg);
+       case XFS_IOC_GETXFLAGS:
+               return xfs_ioc_getxflags(ip, arg);
+       case XFS_IOC_SETXFLAGS:
+               return xfs_ioc_setxflags(ip, filp, arg);
+
+       case XFS_IOC_FSSETDM: {
+               struct fsdmidata        dmi;
+
+               if (copy_from_user(&dmi, arg, sizeof(dmi)))
+                       return -XFS_ERROR(EFAULT);
+
+               error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
+                               dmi.fsd_dmstate);
+               return -error;
+       }
+
+       case XFS_IOC_GETBMAP:
+       case XFS_IOC_GETBMAPA:
+               return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
+
+       case XFS_IOC_GETBMAPX:
+               return xfs_ioc_getbmapx(ip, arg);
+
+       case XFS_IOC_FD_TO_HANDLE:
+       case XFS_IOC_PATH_TO_HANDLE:
+       case XFS_IOC_PATH_TO_FSHANDLE: {
+               xfs_fsop_handlereq_t    hreq;
+
+               if (copy_from_user(&hreq, arg, sizeof(hreq)))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_find_handle(cmd, &hreq);
+       }
+       case XFS_IOC_OPEN_BY_HANDLE: {
+               xfs_fsop_handlereq_t    hreq;
+
+               if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_open_by_handle(filp, &hreq);
+       }
+       case XFS_IOC_FSSETDM_BY_HANDLE:
+               return xfs_fssetdm_by_handle(filp, arg);
+
+       case XFS_IOC_READLINK_BY_HANDLE: {
+               xfs_fsop_handlereq_t    hreq;
+
+               if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_readlink_by_handle(filp, &hreq);
+       }
+       case XFS_IOC_ATTRLIST_BY_HANDLE:
+               return xfs_attrlist_by_handle(filp, arg);
+
+       case XFS_IOC_ATTRMULTI_BY_HANDLE:
+               return xfs_attrmulti_by_handle(filp, arg);
+
+       case XFS_IOC_SWAPEXT: {
+               struct xfs_swapext      sxp;
+
+               if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
+                       return -XFS_ERROR(EFAULT);
+               error = xfs_swapext(&sxp);
+               return -error;
+       }
+
+       case XFS_IOC_FSCOUNTS: {
+               xfs_fsop_counts_t out;
+
+               error = xfs_fs_counts(mp, &out);
+               if (error)
+                       return -error;
+
+               if (copy_to_user(arg, &out, sizeof(out)))
+                       return -XFS_ERROR(EFAULT);
+               return 0;
+       }
+
+       case XFS_IOC_SET_RESBLKS: {
+               xfs_fsop_resblks_t inout;
+               __uint64_t         in;
+
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+
+               if (mp->m_flags & XFS_MOUNT_RDONLY)
+                       return -XFS_ERROR(EROFS);
+
+               if (copy_from_user(&inout, arg, sizeof(inout)))
+                       return -XFS_ERROR(EFAULT);
+
+               /* input parameter is passed in resblks field of structure */
+               in = inout.resblks;
+               error = xfs_reserve_blocks(mp, &in, &inout);
+               if (error)
+                       return -error;
+
+               if (copy_to_user(arg, &inout, sizeof(inout)))
+                       return -XFS_ERROR(EFAULT);
+               return 0;
+       }
+
+       case XFS_IOC_GET_RESBLKS: {
+               xfs_fsop_resblks_t out;
+
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+
+               error = xfs_reserve_blocks(mp, NULL, &out);
+               if (error)
+                       return -error;
+
+               if (copy_to_user(arg, &out, sizeof(out)))
+                       return -XFS_ERROR(EFAULT);
+
+               return 0;
+       }
+
+       case XFS_IOC_FSGROWFSDATA: {
+               xfs_growfs_data_t in;
+
+               if (copy_from_user(&in, arg, sizeof(in)))
+                       return -XFS_ERROR(EFAULT);
+
+               error = xfs_growfs_data(mp, &in);
+               return -error;
+       }
+
+       case XFS_IOC_FSGROWFSLOG: {
+               xfs_growfs_log_t in;
+
+               if (copy_from_user(&in, arg, sizeof(in)))
+                       return -XFS_ERROR(EFAULT);
+
+               error = xfs_growfs_log(mp, &in);
+               return -error;
+       }
+
+       case XFS_IOC_FSGROWFSRT: {
+               xfs_growfs_rt_t in;
+
+               if (copy_from_user(&in, arg, sizeof(in)))
+                       return -XFS_ERROR(EFAULT);
+
+               error = xfs_growfs_rt(mp, &in);
+               return -error;
+       }
+
+       case XFS_IOC_GOINGDOWN: {
+               __uint32_t in;
+
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+
+               if (get_user(in, (__uint32_t __user *)arg))
+                       return -XFS_ERROR(EFAULT);
+
+               error = xfs_fs_goingdown(mp, in);
+               return -error;
+       }
+
+       case XFS_IOC_ERROR_INJECTION: {
+               xfs_error_injection_t in;
+
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+
+               if (copy_from_user(&in, arg, sizeof(in)))
+                       return -XFS_ERROR(EFAULT);
+
+               error = xfs_errortag_add(in.errtag, mp);
+               return -error;
+       }
+
+       case XFS_IOC_ERROR_CLEARALL:
+               if (!capable(CAP_SYS_ADMIN))
+                       return -EPERM;
+
+               error = xfs_errortag_clearall(mp, 1);
+               return -error;
+
+       default:
+               return -ENOTTY;
+       }
+}
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
new file mode 100644 (file)
index 0000000..d56173b
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IOCTL_H__
+#define __XFS_IOCTL_H__
+
+extern int
+xfs_ioc_space(
+       struct xfs_inode        *ip,
+       struct inode            *inode,
+       struct file             *filp,
+       int                     ioflags,
+       unsigned int            cmd,
+       xfs_flock64_t           *bf);
+
+extern int
+xfs_find_handle(
+       unsigned int            cmd,
+       xfs_fsop_handlereq_t    *hreq);
+
+extern int
+xfs_open_by_handle(
+       struct file             *parfilp,
+       xfs_fsop_handlereq_t    *hreq);
+
+extern int
+xfs_readlink_by_handle(
+       struct file             *parfilp,
+       xfs_fsop_handlereq_t    *hreq);
+
+extern int
+xfs_attrmulti_attr_get(
+       struct inode            *inode,
+       unsigned char           *name,
+       unsigned char           __user *ubuf,
+       __uint32_t              *len,
+       __uint32_t              flags);
+
+extern int
+xfs_attrmulti_attr_set(
+       struct inode            *inode,
+       unsigned char           *name,
+       const unsigned char     __user *ubuf,
+       __uint32_t              len,
+       __uint32_t              flags);
+
+extern int
+xfs_attrmulti_attr_remove(
+       struct inode            *inode,
+       unsigned char           *name,
+       __uint32_t              flags);
+
+extern struct dentry *
+xfs_handle_to_dentry(
+       struct file             *parfilp,
+       void __user             *uhandle,
+       u32                     hlen);
+
+extern long
+xfs_file_ioctl(
+       struct file             *filp,
+       unsigned int            cmd,
+       unsigned long           p);
+
+extern long
+xfs_file_compat_ioctl(
+       struct file             *file,
+       unsigned int            cmd,
+       unsigned long           arg);
+
+#endif
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
new file mode 100644 (file)
index 0000000..54e623b
--- /dev/null
@@ -0,0 +1,672 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include <linux/compat.h>
+#include <linux/ioctl.h>
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_vnode.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_error.h"
+#include "xfs_dfrag.h"
+#include "xfs_vnodeops.h"
+#include "xfs_fsops.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_attr.h"
+#include "xfs_ioctl.h"
+#include "xfs_ioctl32.h"
+#include "xfs_trace.h"
+
+#define  _NATIVE_IOC(cmd, type) \
+         _IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
+
+#ifdef BROKEN_X86_ALIGNMENT
+STATIC int
+xfs_compat_flock64_copyin(
+       xfs_flock64_t           *bf,
+       compat_xfs_flock64_t    __user *arg32)
+{
+       if (get_user(bf->l_type,        &arg32->l_type) ||
+           get_user(bf->l_whence,      &arg32->l_whence) ||
+           get_user(bf->l_start,       &arg32->l_start) ||
+           get_user(bf->l_len,         &arg32->l_len) ||
+           get_user(bf->l_sysid,       &arg32->l_sysid) ||
+           get_user(bf->l_pid,         &arg32->l_pid) ||
+           copy_from_user(bf->l_pad,   &arg32->l_pad,  4*sizeof(u32)))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+STATIC int
+xfs_compat_ioc_fsgeometry_v1(
+       struct xfs_mount          *mp,
+       compat_xfs_fsop_geom_v1_t __user *arg32)
+{
+       xfs_fsop_geom_t           fsgeo;
+       int                       error;
+
+       error = xfs_fs_geometry(mp, &fsgeo, 3);
+       if (error)
+               return -error;
+       /* The 32-bit variant simply has some padding at the end */
+       if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+STATIC int
+xfs_compat_growfs_data_copyin(
+       struct xfs_growfs_data   *in,
+       compat_xfs_growfs_data_t __user *arg32)
+{
+       if (get_user(in->newblocks, &arg32->newblocks) ||
+           get_user(in->imaxpct,   &arg32->imaxpct))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+STATIC int
+xfs_compat_growfs_rt_copyin(
+       struct xfs_growfs_rt     *in,
+       compat_xfs_growfs_rt_t  __user *arg32)
+{
+       if (get_user(in->newblocks, &arg32->newblocks) ||
+           get_user(in->extsize,   &arg32->extsize))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+STATIC int
+xfs_inumbers_fmt_compat(
+       void                    __user *ubuffer,
+       const xfs_inogrp_t      *buffer,
+       long                    count,
+       long                    *written)
+{
+       compat_xfs_inogrp_t     __user *p32 = ubuffer;
+       long                    i;
+
+       for (i = 0; i < count; i++) {
+               if (put_user(buffer[i].xi_startino,   &p32[i].xi_startino) ||
+                   put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
+                   put_user(buffer[i].xi_allocmask,  &p32[i].xi_allocmask))
+                       return -XFS_ERROR(EFAULT);
+       }
+       *written = count * sizeof(*p32);
+       return 0;
+}
+
+#else
+#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
+#endif /* BROKEN_X86_ALIGNMENT */
+
+STATIC int
+xfs_ioctl32_bstime_copyin(
+       xfs_bstime_t            *bstime,
+       compat_xfs_bstime_t     __user *bstime32)
+{
+       compat_time_t           sec32;  /* tv_sec differs on 64 vs. 32 */
+
+       if (get_user(sec32,             &bstime32->tv_sec)      ||
+           get_user(bstime->tv_nsec,   &bstime32->tv_nsec))
+               return -XFS_ERROR(EFAULT);
+       bstime->tv_sec = sec32;
+       return 0;
+}
+
+/* xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere */
+STATIC int
+xfs_ioctl32_bstat_copyin(
+       xfs_bstat_t             *bstat,
+       compat_xfs_bstat_t      __user *bstat32)
+{
+       if (get_user(bstat->bs_ino,     &bstat32->bs_ino)       ||
+           get_user(bstat->bs_mode,    &bstat32->bs_mode)      ||
+           get_user(bstat->bs_nlink,   &bstat32->bs_nlink)     ||
+           get_user(bstat->bs_uid,     &bstat32->bs_uid)       ||
+           get_user(bstat->bs_gid,     &bstat32->bs_gid)       ||
+           get_user(bstat->bs_rdev,    &bstat32->bs_rdev)      ||
+           get_user(bstat->bs_blksize, &bstat32->bs_blksize)   ||
+           get_user(bstat->bs_size,    &bstat32->bs_size)      ||
+           xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
+           xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
+           xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
+           get_user(bstat->bs_blocks,  &bstat32->bs_size)      ||
+           get_user(bstat->bs_xflags,  &bstat32->bs_size)      ||
+           get_user(bstat->bs_extsize, &bstat32->bs_extsize)   ||
+           get_user(bstat->bs_extents, &bstat32->bs_extents)   ||
+           get_user(bstat->bs_gen,     &bstat32->bs_gen)       ||
+           get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) ||
+           get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) ||
+           get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) ||
+           get_user(bstat->bs_dmstate, &bstat32->bs_dmstate)   ||
+           get_user(bstat->bs_aextents, &bstat32->bs_aextents))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+/* XFS_IOC_FSBULKSTAT and friends */
+
+STATIC int
+xfs_bstime_store_compat(
+       compat_xfs_bstime_t     __user *p32,
+       const xfs_bstime_t      *p)
+{
+       __s32                   sec32;
+
+       sec32 = p->tv_sec;
+       if (put_user(sec32, &p32->tv_sec) ||
+           put_user(p->tv_nsec, &p32->tv_nsec))
+               return -XFS_ERROR(EFAULT);
+       return 0;
+}
+
+/* Return 0 on success or positive error (to xfs_bulkstat()) */
+STATIC int
+xfs_bulkstat_one_fmt_compat(
+       void                    __user *ubuffer,
+       int                     ubsize,
+       int                     *ubused,
+       const xfs_bstat_t       *buffer)
+{
+       compat_xfs_bstat_t      __user *p32 = ubuffer;
+
+       if (ubsize < sizeof(*p32))
+               return XFS_ERROR(ENOMEM);
+
+       if (put_user(buffer->bs_ino,      &p32->bs_ino)         ||
+           put_user(buffer->bs_mode,     &p32->bs_mode)        ||
+           put_user(buffer->bs_nlink,    &p32->bs_nlink)       ||
+           put_user(buffer->bs_uid,      &p32->bs_uid)         ||
+           put_user(buffer->bs_gid,      &p32->bs_gid)         ||
+           put_user(buffer->bs_rdev,     &p32->bs_rdev)        ||
+           put_user(buffer->bs_blksize,  &p32->bs_blksize)     ||
+           put_user(buffer->bs_size,     &p32->bs_size)        ||
+           xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
+           xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
+           xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
+           put_user(buffer->bs_blocks,   &p32->bs_blocks)      ||
+           put_user(buffer->bs_xflags,   &p32->bs_xflags)      ||
+           put_user(buffer->bs_extsize,  &p32->bs_extsize)     ||
+           put_user(buffer->bs_extents,  &p32->bs_extents)     ||
+           put_user(buffer->bs_gen,      &p32->bs_gen)         ||
+           put_user(buffer->bs_projid,   &p32->bs_projid)      ||
+           put_user(buffer->bs_projid_hi,      &p32->bs_projid_hi)     ||
+           put_user(buffer->bs_dmevmask, &p32->bs_dmevmask)    ||
+           put_user(buffer->bs_dmstate,  &p32->bs_dmstate)     ||
+           put_user(buffer->bs_aextents, &p32->bs_aextents))
+               return XFS_ERROR(EFAULT);
+       if (ubused)
+               *ubused = sizeof(*p32);
+       return 0;
+}
+
+STATIC int
+xfs_bulkstat_one_compat(
+       xfs_mount_t     *mp,            /* mount point for filesystem */
+       xfs_ino_t       ino,            /* inode number to get data for */
+       void            __user *buffer, /* buffer to place output in */
+       int             ubsize,         /* size of buffer */
+       int             *ubused,        /* bytes used by me */
+       int             *stat)          /* BULKSTAT_RV_... */
+{
+       return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
+                                   xfs_bulkstat_one_fmt_compat,
+                                   ubused, stat);
+}
+
+/* copied from xfs_ioctl.c */
+STATIC int
+xfs_compat_ioc_bulkstat(
+       xfs_mount_t               *mp,
+       unsigned int              cmd,
+       compat_xfs_fsop_bulkreq_t __user *p32)
+{
+       u32                     addr;
+       xfs_fsop_bulkreq_t      bulkreq;
+       int                     count;  /* # of records returned */
+       xfs_ino_t               inlast; /* last inode number */
+       int                     done;
+       int                     error;
+
+       /* done = 1 if there are more stats to get and if bulkstat */
+       /* should be called again (unused here, but used in dmapi) */
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return -XFS_ERROR(EIO);
+
+       if (get_user(addr, &p32->lastip))
+               return -XFS_ERROR(EFAULT);
+       bulkreq.lastip = compat_ptr(addr);
+       if (get_user(bulkreq.icount, &p32->icount) ||
+           get_user(addr, &p32->ubuffer))
+               return -XFS_ERROR(EFAULT);
+       bulkreq.ubuffer = compat_ptr(addr);
+       if (get_user(addr, &p32->ocount))
+               return -XFS_ERROR(EFAULT);
+       bulkreq.ocount = compat_ptr(addr);
+
+       if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
+               return -XFS_ERROR(EFAULT);
+
+       if ((count = bulkreq.icount) <= 0)
+               return -XFS_ERROR(EINVAL);
+
+       if (bulkreq.ubuffer == NULL)
+               return -XFS_ERROR(EINVAL);
+
+       if (cmd == XFS_IOC_FSINUMBERS_32) {
+               error = xfs_inumbers(mp, &inlast, &count,
+                               bulkreq.ubuffer, xfs_inumbers_fmt_compat);
+       } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
+               int res;
+
+               error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
+                               sizeof(compat_xfs_bstat_t), 0, &res);
+       } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
+               error = xfs_bulkstat(mp, &inlast, &count,
+                       xfs_bulkstat_one_compat, sizeof(compat_xfs_bstat_t),
+                       bulkreq.ubuffer, &done);
+       } else
+               error = XFS_ERROR(EINVAL);
+       if (error)
+               return -error;
+
+       if (bulkreq.ocount != NULL) {
+               if (copy_to_user(bulkreq.lastip, &inlast,
+                                               sizeof(xfs_ino_t)))
+                       return -XFS_ERROR(EFAULT);
+
+               if (copy_to_user(bulkreq.ocount, &count, sizeof(count)))
+                       return -XFS_ERROR(EFAULT);
+       }
+
+       return 0;
+}
+
+STATIC int
+xfs_compat_handlereq_copyin(
+       xfs_fsop_handlereq_t            *hreq,
+       compat_xfs_fsop_handlereq_t     __user *arg32)
+{
+       compat_xfs_fsop_handlereq_t     hreq32;
+
+       if (copy_from_user(&hreq32, arg32, sizeof(compat_xfs_fsop_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+
+       hreq->fd = hreq32.fd;
+       hreq->path = compat_ptr(hreq32.path);
+       hreq->oflags = hreq32.oflags;
+       hreq->ihandle = compat_ptr(hreq32.ihandle);
+       hreq->ihandlen = hreq32.ihandlen;
+       hreq->ohandle = compat_ptr(hreq32.ohandle);
+       hreq->ohandlen = compat_ptr(hreq32.ohandlen);
+
+       return 0;
+}
+
+STATIC struct dentry *
+xfs_compat_handlereq_to_dentry(
+       struct file             *parfilp,
+       compat_xfs_fsop_handlereq_t *hreq)
+{
+       return xfs_handle_to_dentry(parfilp,
+                       compat_ptr(hreq->ihandle), hreq->ihandlen);
+}
+
+STATIC int
+xfs_compat_attrlist_by_handle(
+       struct file             *parfilp,
+       void                    __user *arg)
+{
+       int                     error;
+       attrlist_cursor_kern_t  *cursor;
+       compat_xfs_fsop_attrlist_handlereq_t al_hreq;
+       struct dentry           *dentry;
+       char                    *kbuf;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&al_hreq, arg,
+                          sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+       if (al_hreq.buflen > XATTR_LIST_MAX)
+               return -XFS_ERROR(EINVAL);
+
+       /*
+        * Reject flags, only allow namespaces.
+        */
+       if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+               return -XFS_ERROR(EINVAL);
+
+       dentry = xfs_compat_handlereq_to_dentry(parfilp, &al_hreq.hreq);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+
+       error = -ENOMEM;
+       kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+       if (!kbuf)
+               goto out_dput;
+
+       cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+       error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
+                                       al_hreq.flags, cursor);
+       if (error)
+               goto out_kfree;
+
+       if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
+               error = -EFAULT;
+
+ out_kfree:
+       kfree(kbuf);
+ out_dput:
+       dput(dentry);
+       return error;
+}
+
+STATIC int
+xfs_compat_attrmulti_by_handle(
+       struct file                             *parfilp,
+       void                                    __user *arg)
+{
+       int                                     error;
+       compat_xfs_attr_multiop_t               *ops;
+       compat_xfs_fsop_attrmulti_handlereq_t   am_hreq;
+       struct dentry                           *dentry;
+       unsigned int                            i, size;
+       unsigned char                           *attr_name;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&am_hreq, arg,
+                          sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+
+       /* overflow check */
+       if (am_hreq.opcount >= INT_MAX / sizeof(compat_xfs_attr_multiop_t))
+               return -E2BIG;
+
+       dentry = xfs_compat_handlereq_to_dentry(parfilp, &am_hreq.hreq);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+
+       error = E2BIG;
+       size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
+       if (!size || size > 16 * PAGE_SIZE)
+               goto out_dput;
+
+       ops = memdup_user(compat_ptr(am_hreq.ops), size);
+       if (IS_ERR(ops)) {
+               error = PTR_ERR(ops);
+               goto out_dput;
+       }
+
+       attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
+       if (!attr_name)
+               goto out_kfree_ops;
+
+       error = 0;
+       for (i = 0; i < am_hreq.opcount; i++) {
+               ops[i].am_error = strncpy_from_user((char *)attr_name,
+                               compat_ptr(ops[i].am_attrname),
+                               MAXNAMELEN);
+               if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
+                       error = -ERANGE;
+               if (ops[i].am_error < 0)
+                       break;
+
+               switch (ops[i].am_opcode) {
+               case ATTR_OP_GET:
+                       ops[i].am_error = xfs_attrmulti_attr_get(
+                                       dentry->d_inode, attr_name,
+                                       compat_ptr(ops[i].am_attrvalue),
+                                       &ops[i].am_length, ops[i].am_flags);
+                       break;
+               case ATTR_OP_SET:
+                       ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+                       if (ops[i].am_error)
+                               break;
+                       ops[i].am_error = xfs_attrmulti_attr_set(
+                                       dentry->d_inode, attr_name,
+                                       compat_ptr(ops[i].am_attrvalue),
+                                       ops[i].am_length, ops[i].am_flags);
+                       mnt_drop_write(parfilp->f_path.mnt);
+                       break;
+               case ATTR_OP_REMOVE:
+                       ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
+                       if (ops[i].am_error)
+                               break;
+                       ops[i].am_error = xfs_attrmulti_attr_remove(
+                                       dentry->d_inode, attr_name,
+                                       ops[i].am_flags);
+                       mnt_drop_write(parfilp->f_path.mnt);
+                       break;
+               default:
+                       ops[i].am_error = EINVAL;
+               }
+       }
+
+       if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
+               error = XFS_ERROR(EFAULT);
+
+       kfree(attr_name);
+ out_kfree_ops:
+       kfree(ops);
+ out_dput:
+       dput(dentry);
+       return -error;
+}
+
+STATIC int
+xfs_compat_fssetdm_by_handle(
+       struct file             *parfilp,
+       void                    __user *arg)
+{
+       int                     error;
+       struct fsdmidata        fsd;
+       compat_xfs_fsop_setdm_handlereq_t dmhreq;
+       struct dentry           *dentry;
+
+       if (!capable(CAP_MKNOD))
+               return -XFS_ERROR(EPERM);
+       if (copy_from_user(&dmhreq, arg,
+                          sizeof(compat_xfs_fsop_setdm_handlereq_t)))
+               return -XFS_ERROR(EFAULT);
+
+       dentry = xfs_compat_handlereq_to_dentry(parfilp, &dmhreq.hreq);
+       if (IS_ERR(dentry))
+               return PTR_ERR(dentry);
+
+       if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) {
+               error = -XFS_ERROR(EPERM);
+               goto out;
+       }
+
+       if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd))) {
+               error = -XFS_ERROR(EFAULT);
+               goto out;
+       }
+
+       error = -xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask,
+                                fsd.fsd_dmstate);
+
+out:
+       dput(dentry);
+       return error;
+}
+
+long
+xfs_file_compat_ioctl(
+       struct file             *filp,
+       unsigned                cmd,
+       unsigned long           p)
+{
+       struct inode            *inode = filp->f_path.dentry->d_inode;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       void                    __user *arg = (void __user *)p;
+       int                     ioflags = 0;
+       int                     error;
+
+       if (filp->f_mode & FMODE_NOCMTIME)
+               ioflags |= IO_INVIS;
+
+       trace_xfs_file_compat_ioctl(ip);
+
+       switch (cmd) {
+       /* No size or alignment issues on any arch */
+       case XFS_IOC_DIOINFO:
+       case XFS_IOC_FSGEOMETRY:
+       case XFS_IOC_FSGETXATTR:
+       case XFS_IOC_FSSETXATTR:
+       case XFS_IOC_FSGETXATTRA:
+       case XFS_IOC_FSSETDM:
+       case XFS_IOC_GETBMAP:
+       case XFS_IOC_GETBMAPA:
+       case XFS_IOC_GETBMAPX:
+       case XFS_IOC_FSCOUNTS:
+       case XFS_IOC_SET_RESBLKS:
+       case XFS_IOC_GET_RESBLKS:
+       case XFS_IOC_FSGROWFSLOG:
+       case XFS_IOC_GOINGDOWN:
+       case XFS_IOC_ERROR_INJECTION:
+       case XFS_IOC_ERROR_CLEARALL:
+               return xfs_file_ioctl(filp, cmd, p);
+#ifndef BROKEN_X86_ALIGNMENT
+       /* These are handled fine if no alignment issues */
+       case XFS_IOC_ALLOCSP:
+       case XFS_IOC_FREESP:
+       case XFS_IOC_RESVSP:
+       case XFS_IOC_UNRESVSP:
+       case XFS_IOC_ALLOCSP64:
+       case XFS_IOC_FREESP64:
+       case XFS_IOC_RESVSP64:
+       case XFS_IOC_UNRESVSP64:
+       case XFS_IOC_FSGEOMETRY_V1:
+       case XFS_IOC_FSGROWFSDATA:
+       case XFS_IOC_FSGROWFSRT:
+       case XFS_IOC_ZERO_RANGE:
+               return xfs_file_ioctl(filp, cmd, p);
+#else
+       case XFS_IOC_ALLOCSP_32:
+       case XFS_IOC_FREESP_32:
+       case XFS_IOC_ALLOCSP64_32:
+       case XFS_IOC_FREESP64_32:
+       case XFS_IOC_RESVSP_32:
+       case XFS_IOC_UNRESVSP_32:
+       case XFS_IOC_RESVSP64_32:
+       case XFS_IOC_UNRESVSP64_32:
+       case XFS_IOC_ZERO_RANGE_32: {
+               struct xfs_flock64      bf;
+
+               if (xfs_compat_flock64_copyin(&bf, arg))
+                       return -XFS_ERROR(EFAULT);
+               cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
+               return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+       }
+       case XFS_IOC_FSGEOMETRY_V1_32:
+               return xfs_compat_ioc_fsgeometry_v1(mp, arg);
+       case XFS_IOC_FSGROWFSDATA_32: {
+               struct xfs_growfs_data  in;
+
+               if (xfs_compat_growfs_data_copyin(&in, arg))
+                       return -XFS_ERROR(EFAULT);
+               error = xfs_growfs_data(mp, &in);
+               return -error;
+       }
+       case XFS_IOC_FSGROWFSRT_32: {
+               struct xfs_growfs_rt    in;
+
+               if (xfs_compat_growfs_rt_copyin(&in, arg))
+                       return -XFS_ERROR(EFAULT);
+               error = xfs_growfs_rt(mp, &in);
+               return -error;
+       }
+#endif
+       /* long changes size, but xfs only copiese out 32 bits */
+       case XFS_IOC_GETXFLAGS_32:
+       case XFS_IOC_SETXFLAGS_32:
+       case XFS_IOC_GETVERSION_32:
+               cmd = _NATIVE_IOC(cmd, long);
+               return xfs_file_ioctl(filp, cmd, p);
+       case XFS_IOC_SWAPEXT_32: {
+               struct xfs_swapext        sxp;
+               struct compat_xfs_swapext __user *sxu = arg;
+
+               /* Bulk copy in up to the sx_stat field, then copy bstat */
+               if (copy_from_user(&sxp, sxu,
+                                  offsetof(struct xfs_swapext, sx_stat)) ||
+                   xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
+                       return -XFS_ERROR(EFAULT);
+               error = xfs_swapext(&sxp);
+               return -error;
+       }
+       case XFS_IOC_FSBULKSTAT_32:
+       case XFS_IOC_FSBULKSTAT_SINGLE_32:
+       case XFS_IOC_FSINUMBERS_32:
+               return xfs_compat_ioc_bulkstat(mp, cmd, arg);
+       case XFS_IOC_FD_TO_HANDLE_32:
+       case XFS_IOC_PATH_TO_HANDLE_32:
+       case XFS_IOC_PATH_TO_FSHANDLE_32: {
+               struct xfs_fsop_handlereq       hreq;
+
+               if (xfs_compat_handlereq_copyin(&hreq, arg))
+                       return -XFS_ERROR(EFAULT);
+               cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
+               return xfs_find_handle(cmd, &hreq);
+       }
+       case XFS_IOC_OPEN_BY_HANDLE_32: {
+               struct xfs_fsop_handlereq       hreq;
+
+               if (xfs_compat_handlereq_copyin(&hreq, arg))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_open_by_handle(filp, &hreq);
+       }
+       case XFS_IOC_READLINK_BY_HANDLE_32: {
+               struct xfs_fsop_handlereq       hreq;
+
+               if (xfs_compat_handlereq_copyin(&hreq, arg))
+                       return -XFS_ERROR(EFAULT);
+               return xfs_readlink_by_handle(filp, &hreq);
+       }
+       case XFS_IOC_ATTRLIST_BY_HANDLE_32:
+               return xfs_compat_attrlist_by_handle(filp, arg);
+       case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
+               return xfs_compat_attrmulti_by_handle(filp, arg);
+       case XFS_IOC_FSSETDM_BY_HANDLE_32:
+               return xfs_compat_fssetdm_by_handle(filp, arg);
+       default:
+               return -XFS_ERROR(ENOIOCTLCMD);
+       }
+}
diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h
new file mode 100644 (file)
index 0000000..80f4060
--- /dev/null
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2004-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IOCTL32_H__
+#define __XFS_IOCTL32_H__
+
+#include <linux/compat.h>
+
+/*
+ * on 32-bit arches, ioctl argument structures may have different sizes
+ * and/or alignment.  We define compat structures which match the
+ * 32-bit sizes/alignments here, and their associated ioctl numbers.
+ *
+ * xfs_ioctl32.c contains routines to copy these structures in and out.
+ */
+
+/* stock kernel-level ioctls we support */
+#define XFS_IOC_GETXFLAGS_32   FS_IOC32_GETFLAGS
+#define XFS_IOC_SETXFLAGS_32   FS_IOC32_SETFLAGS
+#define XFS_IOC_GETVERSION_32  FS_IOC32_GETVERSION
+
+/*
+ * On intel, even if sizes match, alignment and/or padding may differ.
+ */
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#define BROKEN_X86_ALIGNMENT
+#define __compat_packed __attribute__((packed))
+#else
+#define __compat_packed
+#endif
+
+typedef struct compat_xfs_bstime {
+       compat_time_t   tv_sec;         /* seconds              */
+       __s32           tv_nsec;        /* and nanoseconds      */
+} compat_xfs_bstime_t;
+
+typedef struct compat_xfs_bstat {
+       __u64           bs_ino;         /* inode number                 */
+       __u16           bs_mode;        /* type and mode                */
+       __u16           bs_nlink;       /* number of links              */
+       __u32           bs_uid;         /* user id                      */
+       __u32           bs_gid;         /* group id                     */
+       __u32           bs_rdev;        /* device value                 */
+       __s32           bs_blksize;     /* block size                   */
+       __s64           bs_size;        /* file size                    */
+       compat_xfs_bstime_t bs_atime;   /* access time                  */
+       compat_xfs_bstime_t bs_mtime;   /* modify time                  */
+       compat_xfs_bstime_t bs_ctime;   /* inode change time            */
+       int64_t         bs_blocks;      /* number of blocks             */
+       __u32           bs_xflags;      /* extended flags               */
+       __s32           bs_extsize;     /* extent size                  */
+       __s32           bs_extents;     /* number of extents            */
+       __u32           bs_gen;         /* generation count             */
+       __u16           bs_projid_lo;   /* lower part of project id     */
+#define        bs_projid       bs_projid_lo    /* (previously just bs_projid)  */
+       __u16           bs_projid_hi;   /* high part of project id      */
+       unsigned char   bs_pad[12];     /* pad space, unused            */
+       __u32           bs_dmevmask;    /* DMIG event mask              */
+       __u16           bs_dmstate;     /* DMIG state info              */
+       __u16           bs_aextents;    /* attribute number of extents  */
+} __compat_packed compat_xfs_bstat_t;
+
+typedef struct compat_xfs_fsop_bulkreq {
+       compat_uptr_t   lastip;         /* last inode # pointer         */
+       __s32           icount;         /* count of entries in buffer   */
+       compat_uptr_t   ubuffer;        /* user buffer for inode desc.  */
+       compat_uptr_t   ocount;         /* output count pointer         */
+} compat_xfs_fsop_bulkreq_t;
+
+#define XFS_IOC_FSBULKSTAT_32 \
+       _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
+       _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS_32 \
+       _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
+
+typedef struct compat_xfs_fsop_handlereq {
+       __u32           fd;             /* fd for FD_TO_HANDLE          */
+       compat_uptr_t   path;           /* user pathname                */
+       __u32           oflags;         /* open flags                   */
+       compat_uptr_t   ihandle;        /* user supplied handle         */
+       __u32           ihandlen;       /* user supplied length         */
+       compat_uptr_t   ohandle;        /* user buffer for handle       */
+       compat_uptr_t   ohandlen;       /* user buffer length           */
+} compat_xfs_fsop_handlereq_t;
+
+#define XFS_IOC_PATH_TO_FSHANDLE_32 \
+       _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE_32 \
+       _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE_32 \
+       _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE_32 \
+       _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE_32 \
+       _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
+
+/* The bstat field in the swapext struct needs translation */
+typedef struct compat_xfs_swapext {
+       __int64_t               sx_version;     /* version */
+       __int64_t               sx_fdtarget;    /* fd of target file */
+       __int64_t               sx_fdtmp;       /* fd of tmp file */
+       xfs_off_t               sx_offset;      /* offset into file */
+       xfs_off_t               sx_length;      /* leng from offset */
+       char                    sx_pad[16];     /* pad space, unused */
+       compat_xfs_bstat_t      sx_stat;        /* stat of target b4 copy */
+} __compat_packed compat_xfs_swapext_t;
+
+#define XFS_IOC_SWAPEXT_32     _IOWR('X', 109, struct compat_xfs_swapext)
+
+typedef struct compat_xfs_fsop_attrlist_handlereq {
+       struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+       struct xfs_attrlist_cursor      pos; /* opaque cookie, list offset */
+       __u32                           flags;  /* which namespace to use */
+       __u32                           buflen; /* length of buffer supplied */
+       compat_uptr_t                   buffer; /* returned names */
+} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
+
+/* Note: actually this is read/write */
+#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
+       _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
+
+/* am_opcodes defined in xfs_fs.h */
+typedef struct compat_xfs_attr_multiop {
+       __u32           am_opcode;
+       __s32           am_error;
+       compat_uptr_t   am_attrname;
+       compat_uptr_t   am_attrvalue;
+       __u32           am_length;
+       __u32           am_flags;
+} compat_xfs_attr_multiop_t;
+
+typedef struct compat_xfs_fsop_attrmulti_handlereq {
+       struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+       __u32                           opcount;/* count of following multiop */
+       /* ptr to compat_xfs_attr_multiop */
+       compat_uptr_t                   ops; /* attr_multi data */
+} compat_xfs_fsop_attrmulti_handlereq_t;
+
+#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
+       _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
+
+typedef struct compat_xfs_fsop_setdm_handlereq {
+       struct compat_xfs_fsop_handlereq hreq;  /* handle information   */
+       /* ptr to struct fsdmidata */
+       compat_uptr_t                   data;   /* DMAPI data   */
+} compat_xfs_fsop_setdm_handlereq_t;
+
+#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
+       _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
+
+#ifdef BROKEN_X86_ALIGNMENT
+/* on ia32 l_start is on a 32-bit boundary */
+typedef struct compat_xfs_flock64 {
+       __s16           l_type;
+       __s16           l_whence;
+       __s64           l_start __attribute__((packed));
+                       /* len == 0 means until end of file */
+       __s64           l_len __attribute__((packed));
+       __s32           l_sysid;
+       __u32           l_pid;
+       __s32           l_pad[4];       /* reserve area */
+} compat_xfs_flock64_t;
+
+#define XFS_IOC_ALLOCSP_32     _IOW('X', 10, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP_32      _IOW('X', 11, struct compat_xfs_flock64)
+#define XFS_IOC_ALLOCSP64_32   _IOW('X', 36, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP64_32    _IOW('X', 37, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP_32      _IOW('X', 40, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP_32    _IOW('X', 41, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP64_32    _IOW('X', 42, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP64_32  _IOW('X', 43, struct compat_xfs_flock64)
+#define XFS_IOC_ZERO_RANGE_32  _IOW('X', 57, struct compat_xfs_flock64)
+
+typedef struct compat_xfs_fsop_geom_v1 {
+       __u32           blocksize;      /* filesystem (data) block size */
+       __u32           rtextsize;      /* realtime extent size         */
+       __u32           agblocks;       /* fsblocks in an AG            */
+       __u32           agcount;        /* number of allocation groups  */
+       __u32           logblocks;      /* fsblocks in the log          */
+       __u32           sectsize;       /* (data) sector size, bytes    */
+       __u32           inodesize;      /* inode size in bytes          */
+       __u32           imaxpct;        /* max allowed inode space(%)   */
+       __u64           datablocks;     /* fsblocks in data subvolume   */
+       __u64           rtblocks;       /* fsblocks in realtime subvol  */
+       __u64           rtextents;      /* rt extents in realtime subvol*/
+       __u64           logstart;       /* starting fsblock of the log  */
+       unsigned char   uuid[16];       /* unique id of the filesystem  */
+       __u32           sunit;          /* stripe unit, fsblocks        */
+       __u32           swidth;         /* stripe width, fsblocks       */
+       __s32           version;        /* structure version            */
+       __u32           flags;          /* superblock version flags     */
+       __u32           logsectsize;    /* log sector size, bytes       */
+       __u32           rtsectsize;     /* realtime sector size, bytes  */
+       __u32           dirblocksize;   /* directory block size, bytes  */
+} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
+
+#define XFS_IOC_FSGEOMETRY_V1_32  \
+       _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
+
+typedef struct compat_xfs_inogrp {
+       __u64           xi_startino;    /* starting inode number        */
+       __s32           xi_alloccount;  /* # bits set in allocmask      */
+       __u64           xi_allocmask;   /* mask of allocated inodes     */
+} __attribute__((packed)) compat_xfs_inogrp_t;
+
+/* These growfs input structures have padding on the end, so must translate */
+typedef struct compat_xfs_growfs_data {
+       __u64           newblocks;      /* new data subvol size, fsblocks */
+       __u32           imaxpct;        /* new inode space percentage limit */
+} __attribute__((packed)) compat_xfs_growfs_data_t;
+
+typedef struct compat_xfs_growfs_rt {
+       __u64           newblocks;      /* new realtime size, fsblocks */
+       __u32           extsize;        /* new realtime extent size, fsblocks */
+} __attribute__((packed)) compat_xfs_growfs_rt_t;
+
+#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
+#define XFS_IOC_FSGROWFSRT_32   _IOW('X', 112, struct compat_xfs_growfs_rt)
+
+#endif /* BROKEN_X86_ALIGNMENT */
+
+#endif /* __XFS_IOCTL32_H__ */
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
new file mode 100644 (file)
index 0000000..b9c172b
--- /dev/null
@@ -0,0 +1,1210 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_acl.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_rw.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_utils.h"
+#include "xfs_vnodeops.h"
+#include "xfs_inode_item.h"
+#include "xfs_trace.h"
+
+#include <linux/capability.h>
+#include <linux/xattr.h>
+#include <linux/namei.h>
+#include <linux/posix_acl.h>
+#include <linux/security.h>
+#include <linux/fiemap.h>
+#include <linux/slab.h>
+
+/*
+ * Bring the timestamps in the XFS inode uptodate.
+ *
+ * Used before writing the inode to disk.
+ */
+void
+xfs_synchronize_times(
+       xfs_inode_t     *ip)
+{
+       struct inode    *inode = VFS_I(ip);
+
+       ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
+       ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
+       ip->i_d.di_ctime.t_sec = (__int32_t)inode->i_ctime.tv_sec;
+       ip->i_d.di_ctime.t_nsec = (__int32_t)inode->i_ctime.tv_nsec;
+       ip->i_d.di_mtime.t_sec = (__int32_t)inode->i_mtime.tv_sec;
+       ip->i_d.di_mtime.t_nsec = (__int32_t)inode->i_mtime.tv_nsec;
+}
+
+/*
+ * If the linux inode is valid, mark it dirty.
+ * Used when committing a dirty inode into a transaction so that
+ * the inode will get written back by the linux code
+ */
+void
+xfs_mark_inode_dirty_sync(
+       xfs_inode_t     *ip)
+{
+       struct inode    *inode = VFS_I(ip);
+
+       if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
+               mark_inode_dirty_sync(inode);
+}
+
+void
+xfs_mark_inode_dirty(
+       xfs_inode_t     *ip)
+{
+       struct inode    *inode = VFS_I(ip);
+
+       if (!(inode->i_state & (I_WILL_FREE|I_FREEING)))
+               mark_inode_dirty(inode);
+}
+
+/*
+ * Hook in SELinux.  This is not quite correct yet, what we really need
+ * here (as we do for default ACLs) is a mechanism by which creation of
+ * these attrs can be journalled at inode creation time (along with the
+ * inode, of course, such that log replay can't cause these to be lost).
+ */
+STATIC int
+xfs_init_security(
+       struct inode    *inode,
+       struct inode    *dir,
+       const struct qstr *qstr)
+{
+       struct xfs_inode *ip = XFS_I(inode);
+       size_t          length;
+       void            *value;
+       unsigned char   *name;
+       int             error;
+
+       error = security_inode_init_security(inode, dir, qstr, (char **)&name,
+                                            &value, &length);
+       if (error) {
+               if (error == -EOPNOTSUPP)
+                       return 0;
+               return -error;
+       }
+
+       error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
+
+       kfree(name);
+       kfree(value);
+       return error;
+}
+
+static void
+xfs_dentry_to_name(
+       struct xfs_name *namep,
+       struct dentry   *dentry)
+{
+       namep->name = dentry->d_name.name;
+       namep->len = dentry->d_name.len;
+}
+
+STATIC void
+xfs_cleanup_inode(
+       struct inode    *dir,
+       struct inode    *inode,
+       struct dentry   *dentry)
+{
+       struct xfs_name teardown;
+
+       /* Oh, the horror.
+        * If we can't add the ACL or we fail in
+        * xfs_init_security we must back out.
+        * ENOSPC can hit here, among other things.
+        */
+       xfs_dentry_to_name(&teardown, dentry);
+
+       xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
+       iput(inode);
+}
+
+STATIC int
+xfs_vn_mknod(
+       struct inode    *dir,
+       struct dentry   *dentry,
+       int             mode,
+       dev_t           rdev)
+{
+       struct inode    *inode;
+       struct xfs_inode *ip = NULL;
+       struct posix_acl *default_acl = NULL;
+       struct xfs_name name;
+       int             error;
+
+       /*
+        * Irix uses Missed'em'V split, but doesn't want to see
+        * the upper 5 bits of (14bit) major.
+        */
+       if (S_ISCHR(mode) || S_ISBLK(mode)) {
+               if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
+                       return -EINVAL;
+               rdev = sysv_encode_dev(rdev);
+       } else {
+               rdev = 0;
+       }
+
+       if (IS_POSIXACL(dir)) {
+               default_acl = xfs_get_acl(dir, ACL_TYPE_DEFAULT);
+               if (IS_ERR(default_acl))
+                       return PTR_ERR(default_acl);
+
+               if (!default_acl)
+                       mode &= ~current_umask();
+       }
+
+       xfs_dentry_to_name(&name, dentry);
+       error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+       if (unlikely(error))
+               goto out_free_acl;
+
+       inode = VFS_I(ip);
+
+       error = xfs_init_security(inode, dir, &dentry->d_name);
+       if (unlikely(error))
+               goto out_cleanup_inode;
+
+       if (default_acl) {
+               error = -xfs_inherit_acl(inode, default_acl);
+               default_acl = NULL;
+               if (unlikely(error))
+                       goto out_cleanup_inode;
+       }
+
+
+       d_instantiate(dentry, inode);
+       return -error;
+
+ out_cleanup_inode:
+       xfs_cleanup_inode(dir, inode, dentry);
+ out_free_acl:
+       posix_acl_release(default_acl);
+       return -error;
+}
+
+STATIC int
+xfs_vn_create(
+       struct inode    *dir,
+       struct dentry   *dentry,
+       int             mode,
+       struct nameidata *nd)
+{
+       return xfs_vn_mknod(dir, dentry, mode, 0);
+}
+
+STATIC int
+xfs_vn_mkdir(
+       struct inode    *dir,
+       struct dentry   *dentry,
+       int             mode)
+{
+       return xfs_vn_mknod(dir, dentry, mode|S_IFDIR, 0);
+}
+
+STATIC struct dentry *
+xfs_vn_lookup(
+       struct inode    *dir,
+       struct dentry   *dentry,
+       struct nameidata *nd)
+{
+       struct xfs_inode *cip;
+       struct xfs_name name;
+       int             error;
+
+       if (dentry->d_name.len >= MAXNAMELEN)
+               return ERR_PTR(-ENAMETOOLONG);
+
+       xfs_dentry_to_name(&name, dentry);
+       error = xfs_lookup(XFS_I(dir), &name, &cip, NULL);
+       if (unlikely(error)) {
+               if (unlikely(error != ENOENT))
+                       return ERR_PTR(-error);
+               d_add(dentry, NULL);
+               return NULL;
+       }
+
+       return d_splice_alias(VFS_I(cip), dentry);
+}
+
+STATIC struct dentry *
+xfs_vn_ci_lookup(
+       struct inode    *dir,
+       struct dentry   *dentry,
+       struct nameidata *nd)
+{
+       struct xfs_inode *ip;
+       struct xfs_name xname;
+       struct xfs_name ci_name;
+       struct qstr     dname;
+       int             error;
+
+       if (dentry->d_name.len >= MAXNAMELEN)
+               return ERR_PTR(-ENAMETOOLONG);
+
+       xfs_dentry_to_name(&xname, dentry);
+       error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name);
+       if (unlikely(error)) {
+               if (unlikely(error != ENOENT))
+                       return ERR_PTR(-error);
+               /*
+                * call d_add(dentry, NULL) here when d_drop_negative_children
+                * is called in xfs_vn_mknod (ie. allow negative dentries
+                * with CI filesystems).
+                */
+               return NULL;
+       }
+
+       /* if exact match, just splice and exit */
+       if (!ci_name.name)
+               return d_splice_alias(VFS_I(ip), dentry);
+
+       /* else case-insensitive match... */
+       dname.name = ci_name.name;
+       dname.len = ci_name.len;
+       dentry = d_add_ci(dentry, VFS_I(ip), &dname);
+       kmem_free(ci_name.name);
+       return dentry;
+}
+
+STATIC int
+xfs_vn_link(
+       struct dentry   *old_dentry,
+       struct inode    *dir,
+       struct dentry   *dentry)
+{
+       struct inode    *inode = old_dentry->d_inode;
+       struct xfs_name name;
+       int             error;
+
+       xfs_dentry_to_name(&name, dentry);
+
+       error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
+       if (unlikely(error))
+               return -error;
+
+       ihold(inode);
+       d_instantiate(dentry, inode);
+       return 0;
+}
+
+STATIC int
+xfs_vn_unlink(
+       struct inode    *dir,
+       struct dentry   *dentry)
+{
+       struct xfs_name name;
+       int             error;
+
+       xfs_dentry_to_name(&name, dentry);
+
+       error = -xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode));
+       if (error)
+               return error;
+
+       /*
+        * With unlink, the VFS makes the dentry "negative": no inode,
+        * but still hashed. This is incompatible with case-insensitive
+        * mode, so invalidate (unhash) the dentry in CI-mode.
+        */
+       if (xfs_sb_version_hasasciici(&XFS_M(dir->i_sb)->m_sb))
+               d_invalidate(dentry);
+       return 0;
+}
+
+STATIC int
+xfs_vn_symlink(
+       struct inode    *dir,
+       struct dentry   *dentry,
+       const char      *symname)
+{
+       struct inode    *inode;
+       struct xfs_inode *cip = NULL;
+       struct xfs_name name;
+       int             error;
+       mode_t          mode;
+
+       mode = S_IFLNK |
+               (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO);
+       xfs_dentry_to_name(&name, dentry);
+
+       error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip);
+       if (unlikely(error))
+               goto out;
+
+       inode = VFS_I(cip);
+
+       error = xfs_init_security(inode, dir, &dentry->d_name);
+       if (unlikely(error))
+               goto out_cleanup_inode;
+
+       d_instantiate(dentry, inode);
+       return 0;
+
+ out_cleanup_inode:
+       xfs_cleanup_inode(dir, inode, dentry);
+ out:
+       return -error;
+}
+
+STATIC int
+xfs_vn_rename(
+       struct inode    *odir,
+       struct dentry   *odentry,
+       struct inode    *ndir,
+       struct dentry   *ndentry)
+{
+       struct inode    *new_inode = ndentry->d_inode;
+       struct xfs_name oname;
+       struct xfs_name nname;
+
+       xfs_dentry_to_name(&oname, odentry);
+       xfs_dentry_to_name(&nname, ndentry);
+
+       return -xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
+                          XFS_I(ndir), &nname, new_inode ?
+                                               XFS_I(new_inode) : NULL);
+}
+
+/*
+ * careful here - this function can get called recursively, so
+ * we need to be very careful about how much stack we use.
+ * uio is kmalloced for this reason...
+ */
+STATIC void *
+xfs_vn_follow_link(
+       struct dentry           *dentry,
+       struct nameidata        *nd)
+{
+       char                    *link;
+       int                     error = -ENOMEM;
+
+       link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
+       if (!link)
+               goto out_err;
+
+       error = -xfs_readlink(XFS_I(dentry->d_inode), link);
+       if (unlikely(error))
+               goto out_kfree;
+
+       nd_set_link(nd, link);
+       return NULL;
+
+ out_kfree:
+       kfree(link);
+ out_err:
+       nd_set_link(nd, ERR_PTR(error));
+       return NULL;
+}
+
+STATIC void
+xfs_vn_put_link(
+       struct dentry   *dentry,
+       struct nameidata *nd,
+       void            *p)
+{
+       char            *s = nd_get_link(nd);
+
+       if (!IS_ERR(s))
+               kfree(s);
+}
+
+STATIC int
+xfs_vn_getattr(
+       struct vfsmount         *mnt,
+       struct dentry           *dentry,
+       struct kstat            *stat)
+{
+       struct inode            *inode = dentry->d_inode;
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+
+       trace_xfs_getattr(ip);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       stat->size = XFS_ISIZE(ip);
+       stat->dev = inode->i_sb->s_dev;
+       stat->mode = ip->i_d.di_mode;
+       stat->nlink = ip->i_d.di_nlink;
+       stat->uid = ip->i_d.di_uid;
+       stat->gid = ip->i_d.di_gid;
+       stat->ino = ip->i_ino;
+       stat->atime = inode->i_atime;
+       stat->mtime = inode->i_mtime;
+       stat->ctime = inode->i_ctime;
+       stat->blocks =
+               XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
+
+
+       switch (inode->i_mode & S_IFMT) {
+       case S_IFBLK:
+       case S_IFCHR:
+               stat->blksize = BLKDEV_IOSIZE;
+               stat->rdev = MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+                                  sysv_minor(ip->i_df.if_u2.if_rdev));
+               break;
+       default:
+               if (XFS_IS_REALTIME_INODE(ip)) {
+                       /*
+                        * If the file blocks are being allocated from a
+                        * realtime volume, then return the inode's realtime
+                        * extent size or the realtime volume's extent size.
+                        */
+                       stat->blksize =
+                               xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog;
+               } else
+                       stat->blksize = xfs_preferred_iosize(mp);
+               stat->rdev = 0;
+               break;
+       }
+
+       return 0;
+}
+
+int
+xfs_setattr_nonsize(
+       struct xfs_inode        *ip,
+       struct iattr            *iattr,
+       int                     flags)
+{
+       xfs_mount_t             *mp = ip->i_mount;
+       struct inode            *inode = VFS_I(ip);
+       int                     mask = iattr->ia_valid;
+       xfs_trans_t             *tp;
+       int                     error;
+       uid_t                   uid = 0, iuid = 0;
+       gid_t                   gid = 0, igid = 0;
+       struct xfs_dquot        *udqp = NULL, *gdqp = NULL;
+       struct xfs_dquot        *olddquot1 = NULL, *olddquot2 = NULL;
+
+       trace_xfs_setattr(ip);
+
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               return XFS_ERROR(EROFS);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = -inode_change_ok(inode, iattr);
+       if (error)
+               return XFS_ERROR(error);
+
+       ASSERT((mask & ATTR_SIZE) == 0);
+
+       /*
+        * If disk quotas is on, we make sure that the dquots do exist on disk,
+        * before we start any other transactions. Trying to do this later
+        * is messy. We don't care to take a readlock to look at the ids
+        * in inode here, because we can't hold it across the trans_reserve.
+        * If the IDs do change before we take the ilock, we're covered
+        * because the i_*dquot fields will get updated anyway.
+        */
+       if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
+               uint    qflags = 0;
+
+               if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
+                       uid = iattr->ia_uid;
+                       qflags |= XFS_QMOPT_UQUOTA;
+               } else {
+                       uid = ip->i_d.di_uid;
+               }
+               if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
+                       gid = iattr->ia_gid;
+                       qflags |= XFS_QMOPT_GQUOTA;
+               }  else {
+                       gid = ip->i_d.di_gid;
+               }
+
+               /*
+                * We take a reference when we initialize udqp and gdqp,
+                * so it is important that we never blindly double trip on
+                * the same variable. See xfs_create() for an example.
+                */
+               ASSERT(udqp == NULL);
+               ASSERT(gdqp == NULL);
+               error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
+                                        qflags, &udqp, &gdqp);
+               if (error)
+                       return error;
+       }
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+       error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+       if (error)
+               goto out_dqrele;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+       /*
+        * Change file ownership.  Must be the owner or privileged.
+        */
+       if (mask & (ATTR_UID|ATTR_GID)) {
+               /*
+                * These IDs could have changed since we last looked at them.
+                * But, we're assured that if the ownership did change
+                * while we didn't have the inode locked, inode's dquot(s)
+                * would have changed also.
+                */
+               iuid = ip->i_d.di_uid;
+               igid = ip->i_d.di_gid;
+               gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
+               uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
+
+               /*
+                * Do a quota reservation only if uid/gid is actually
+                * going to change.
+                */
+               if (XFS_IS_QUOTA_RUNNING(mp) &&
+                   ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
+                    (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
+                       ASSERT(tp);
+                       error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
+                                               capable(CAP_FOWNER) ?
+                                               XFS_QMOPT_FORCE_RES : 0);
+                       if (error)      /* out of quota */
+                               goto out_trans_cancel;
+               }
+       }
+
+       xfs_trans_ijoin(tp, ip);
+
+       /*
+        * Change file ownership.  Must be the owner or privileged.
+        */
+       if (mask & (ATTR_UID|ATTR_GID)) {
+               /*
+                * CAP_FSETID overrides the following restrictions:
+                *
+                * The set-user-ID and set-group-ID bits of a file will be
+                * cleared upon successful return from chown()
+                */
+               if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+                   !capable(CAP_FSETID))
+                       ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+
+               /*
+                * Change the ownerships and register quota modifications
+                * in the transaction.
+                */
+               if (iuid != uid) {
+                       if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
+                               ASSERT(mask & ATTR_UID);
+                               ASSERT(udqp);
+                               olddquot1 = xfs_qm_vop_chown(tp, ip,
+                                                       &ip->i_udquot, udqp);
+                       }
+                       ip->i_d.di_uid = uid;
+                       inode->i_uid = uid;
+               }
+               if (igid != gid) {
+                       if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
+                               ASSERT(!XFS_IS_PQUOTA_ON(mp));
+                               ASSERT(mask & ATTR_GID);
+                               ASSERT(gdqp);
+                               olddquot2 = xfs_qm_vop_chown(tp, ip,
+                                                       &ip->i_gdquot, gdqp);
+                       }
+                       ip->i_d.di_gid = gid;
+                       inode->i_gid = gid;
+               }
+       }
+
+       /*
+        * Change file access modes.
+        */
+       if (mask & ATTR_MODE) {
+               umode_t mode = iattr->ia_mode;
+
+               if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+                       mode &= ~S_ISGID;
+
+               ip->i_d.di_mode &= S_IFMT;
+               ip->i_d.di_mode |= mode & ~S_IFMT;
+
+               inode->i_mode &= S_IFMT;
+               inode->i_mode |= mode & ~S_IFMT;
+       }
+
+       /*
+        * Change file access or modified times.
+        */
+       if (mask & ATTR_ATIME) {
+               inode->i_atime = iattr->ia_atime;
+               ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
+               ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
+               ip->i_update_core = 1;
+       }
+       if (mask & ATTR_CTIME) {
+               inode->i_ctime = iattr->ia_ctime;
+               ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+               ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
+               ip->i_update_core = 1;
+       }
+       if (mask & ATTR_MTIME) {
+               inode->i_mtime = iattr->ia_mtime;
+               ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+               ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+               ip->i_update_core = 1;
+       }
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       XFS_STATS_INC(xs_ig_attrchg);
+
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
+               xfs_trans_set_sync(tp);
+       error = xfs_trans_commit(tp, 0);
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       /*
+        * Release any dquot(s) the inode had kept before chown.
+        */
+       xfs_qm_dqrele(olddquot1);
+       xfs_qm_dqrele(olddquot2);
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+
+       if (error)
+               return XFS_ERROR(error);
+
+       /*
+        * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
+        *           update.  We could avoid this with linked transactions
+        *           and passing down the transaction pointer all the way
+        *           to attr_set.  No previous user of the generic
+        *           Posix ACL code seems to care about this issue either.
+        */
+       if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
+               error = -xfs_acl_chmod(inode);
+               if (error)
+                       return XFS_ERROR(error);
+       }
+
+       return 0;
+
+out_trans_cancel:
+       xfs_trans_cancel(tp, 0);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out_dqrele:
+       xfs_qm_dqrele(udqp);
+       xfs_qm_dqrele(gdqp);
+       return error;
+}
+
+/*
+ * Truncate file.  Must have write permission and not be a directory.
+ */
+int
+xfs_setattr_size(
+       struct xfs_inode        *ip,
+       struct iattr            *iattr,
+       int                     flags)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct inode            *inode = VFS_I(ip);
+       int                     mask = iattr->ia_valid;
+       struct xfs_trans        *tp;
+       int                     error;
+       uint                    lock_flags;
+       uint                    commit_flags = 0;
+
+       trace_xfs_setattr(ip);
+
+       if (mp->m_flags & XFS_MOUNT_RDONLY)
+               return XFS_ERROR(EROFS);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       error = -inode_change_ok(inode, iattr);
+       if (error)
+               return XFS_ERROR(error);
+
+       ASSERT(S_ISREG(ip->i_d.di_mode));
+       ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
+                       ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
+                       ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
+
+       lock_flags = XFS_ILOCK_EXCL;
+       if (!(flags & XFS_ATTR_NOLOCK))
+               lock_flags |= XFS_IOLOCK_EXCL;
+       xfs_ilock(ip, lock_flags);
+
+       /*
+        * Short circuit the truncate case for zero length files.
+        */
+       if (iattr->ia_size == 0 &&
+           ip->i_size == 0 && ip->i_d.di_nextents == 0) {
+               if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
+                       goto out_unlock;
+
+               /*
+                * Use the regular setattr path to update the timestamps.
+                */
+               xfs_iunlock(ip, lock_flags);
+               iattr->ia_valid &= ~ATTR_SIZE;
+               return xfs_setattr_nonsize(ip, iattr, 0);
+       }
+
+       /*
+        * Make sure that the dquots are attached to the inode.
+        */
+       error = xfs_qm_dqattach_locked(ip, 0);
+       if (error)
+               goto out_unlock;
+
+       /*
+        * Now we can make the changes.  Before we join the inode to the
+        * transaction, take care of the part of the truncation that must be
+        * done without the inode lock.  This needs to be done before joining
+        * the inode to the transaction, because the inode cannot be unlocked
+        * once it is a part of the transaction.
+        */
+       if (iattr->ia_size > ip->i_size) {
+               /*
+                * Do the first part of growing a file: zero any data in the
+                * last block that is beyond the old EOF.  We need to do this
+                * before the inode is joined to the transaction to modify
+                * i_size.
+                */
+               error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
+               if (error)
+                       goto out_unlock;
+       }
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       lock_flags &= ~XFS_ILOCK_EXCL;
+
+       /*
+        * We are going to log the inode size change in this transaction so
+        * any previous writes that are beyond the on disk EOF and the new
+        * EOF that have not been written out need to be written here.  If we
+        * do not write the data out, we expose ourselves to the null files
+        * problem.
+        *
+        * Only flush from the on disk size to the smaller of the in memory
+        * file size or the new size as that's the range we really care about
+        * here and prevents waiting for other data not within the range we
+        * care about here.
+        */
+       if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
+               error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size,
+                                       XBF_ASYNC, FI_NONE);
+               if (error)
+                       goto out_unlock;
+       }
+
+       /*
+        * Wait for all I/O to complete.
+        */
+       xfs_ioend_wait(ip);
+
+       error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
+                                    xfs_get_blocks);
+       if (error)
+               goto out_unlock;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
+       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+                                XFS_TRANS_PERM_LOG_RES,
+                                XFS_ITRUNCATE_LOG_COUNT);
+       if (error)
+               goto out_trans_cancel;
+
+       truncate_setsize(inode, iattr->ia_size);
+
+       commit_flags = XFS_TRANS_RELEASE_LOG_RES;
+       lock_flags |= XFS_ILOCK_EXCL;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+       xfs_trans_ijoin(tp, ip);
+
+       /*
+        * Only change the c/mtime if we are changing the size or we are
+        * explicitly asked to change it.  This handles the semantic difference
+        * between truncate() and ftruncate() as implemented in the VFS.
+        *
+        * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+        * special case where we need to update the times despite not having
+        * these flags set.  For all other operations the VFS set these flags
+        * explicitly if it wants a timestamp update.
+        */
+       if (iattr->ia_size != ip->i_size &&
+           (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
+               iattr->ia_ctime = iattr->ia_mtime =
+                       current_fs_time(inode->i_sb);
+               mask |= ATTR_CTIME | ATTR_MTIME;
+       }
+
+       if (iattr->ia_size > ip->i_size) {
+               ip->i_d.di_size = iattr->ia_size;
+               ip->i_size = iattr->ia_size;
+       } else if (iattr->ia_size <= ip->i_size ||
+                  (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
+               error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
+               if (error)
+                       goto out_trans_abort;
+
+               /*
+                * Truncated "down", so we're removing references to old data
+                * here - if we delay flushing for a long time, we expose
+                * ourselves unduly to the notorious NULL files problem.  So,
+                * we mark this inode and flush it when the file is closed,
+                * and do not wait the usual (long) time for writeout.
+                */
+               xfs_iflags_set(ip, XFS_ITRUNCATED);
+       }
+
+       if (mask & ATTR_CTIME) {
+               inode->i_ctime = iattr->ia_ctime;
+               ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
+               ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
+               ip->i_update_core = 1;
+       }
+       if (mask & ATTR_MTIME) {
+               inode->i_mtime = iattr->ia_mtime;
+               ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
+               ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
+               ip->i_update_core = 1;
+       }
+
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+       XFS_STATS_INC(xs_ig_attrchg);
+
+       if (mp->m_flags & XFS_MOUNT_WSYNC)
+               xfs_trans_set_sync(tp);
+
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+out_unlock:
+       if (lock_flags)
+               xfs_iunlock(ip, lock_flags);
+       return error;
+
+out_trans_abort:
+       commit_flags |= XFS_TRANS_ABORT;
+out_trans_cancel:
+       xfs_trans_cancel(tp, commit_flags);
+       goto out_unlock;
+}
+
+STATIC int
+xfs_vn_setattr(
+       struct dentry   *dentry,
+       struct iattr    *iattr)
+{
+       if (iattr->ia_valid & ATTR_SIZE)
+               return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0);
+       return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0);
+}
+
+#define XFS_FIEMAP_FLAGS       (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+/*
+ * Call fiemap helper to fill in user data.
+ * Returns positive errors to xfs_getbmap.
+ */
+STATIC int
+xfs_fiemap_format(
+       void                    **arg,
+       struct getbmapx         *bmv,
+       int                     *full)
+{
+       int                     error;
+       struct fiemap_extent_info *fieinfo = *arg;
+       u32                     fiemap_flags = 0;
+       u64                     logical, physical, length;
+
+       /* Do nothing for a hole */
+       if (bmv->bmv_block == -1LL)
+               return 0;
+
+       logical = BBTOB(bmv->bmv_offset);
+       physical = BBTOB(bmv->bmv_block);
+       length = BBTOB(bmv->bmv_length);
+
+       if (bmv->bmv_oflags & BMV_OF_PREALLOC)
+               fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
+       else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
+               fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+               physical = 0;   /* no block yet */
+       }
+       if (bmv->bmv_oflags & BMV_OF_LAST)
+               fiemap_flags |= FIEMAP_EXTENT_LAST;
+
+       error = fiemap_fill_next_extent(fieinfo, logical, physical,
+                                       length, fiemap_flags);
+       if (error > 0) {
+               error = 0;
+               *full = 1;      /* user array now full */
+       }
+
+       return -error;
+}
+
+STATIC int
+xfs_vn_fiemap(
+       struct inode            *inode,
+       struct fiemap_extent_info *fieinfo,
+       u64                     start,
+       u64                     length)
+{
+       xfs_inode_t             *ip = XFS_I(inode);
+       struct getbmapx         bm;
+       int                     error;
+
+       error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
+       if (error)
+               return error;
+
+       /* Set up bmap header for xfs internal routine */
+       bm.bmv_offset = BTOBB(start);
+       /* Special case for whole file */
+       if (length == FIEMAP_MAX_OFFSET)
+               bm.bmv_length = -1LL;
+       else
+               bm.bmv_length = BTOBB(length);
+
+       /* We add one because in getbmap world count includes the header */
+       bm.bmv_count = !fieinfo->fi_extents_max ? MAXEXTNUM :
+                                       fieinfo->fi_extents_max + 1;
+       bm.bmv_count = min_t(__s32, bm.bmv_count,
+                            (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
+       bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
+       if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
+               bm.bmv_iflags |= BMV_IF_ATTRFORK;
+       if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
+               bm.bmv_iflags |= BMV_IF_DELALLOC;
+
+       error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
+       if (error)
+               return -error;
+
+       return 0;
+}
+
+static const struct inode_operations xfs_inode_operations = {
+       .get_acl                = xfs_get_acl,
+       .getattr                = xfs_vn_getattr,
+       .setattr                = xfs_vn_setattr,
+       .setxattr               = generic_setxattr,
+       .getxattr               = generic_getxattr,
+       .removexattr            = generic_removexattr,
+       .listxattr              = xfs_vn_listxattr,
+       .fiemap                 = xfs_vn_fiemap,
+};
+
+static const struct inode_operations xfs_dir_inode_operations = {
+       .create                 = xfs_vn_create,
+       .lookup                 = xfs_vn_lookup,
+       .link                   = xfs_vn_link,
+       .unlink                 = xfs_vn_unlink,
+       .symlink                = xfs_vn_symlink,
+       .mkdir                  = xfs_vn_mkdir,
+       /*
+        * Yes, XFS uses the same method for rmdir and unlink.
+        *
+        * There are some subtile differences deeper in the code,
+        * but we use S_ISDIR to check for those.
+        */
+       .rmdir                  = xfs_vn_unlink,
+       .mknod                  = xfs_vn_mknod,
+       .rename                 = xfs_vn_rename,
+       .get_acl                = xfs_get_acl,
+       .getattr                = xfs_vn_getattr,
+       .setattr                = xfs_vn_setattr,
+       .setxattr               = generic_setxattr,
+       .getxattr               = generic_getxattr,
+       .removexattr            = generic_removexattr,
+       .listxattr              = xfs_vn_listxattr,
+};
+
+static const struct inode_operations xfs_dir_ci_inode_operations = {
+       .create                 = xfs_vn_create,
+       .lookup                 = xfs_vn_ci_lookup,
+       .link                   = xfs_vn_link,
+       .unlink                 = xfs_vn_unlink,
+       .symlink                = xfs_vn_symlink,
+       .mkdir                  = xfs_vn_mkdir,
+       /*
+        * Yes, XFS uses the same method for rmdir and unlink.
+        *
+        * There are some subtile differences deeper in the code,
+        * but we use S_ISDIR to check for those.
+        */
+       .rmdir                  = xfs_vn_unlink,
+       .mknod                  = xfs_vn_mknod,
+       .rename                 = xfs_vn_rename,
+       .get_acl                = xfs_get_acl,
+       .getattr                = xfs_vn_getattr,
+       .setattr                = xfs_vn_setattr,
+       .setxattr               = generic_setxattr,
+       .getxattr               = generic_getxattr,
+       .removexattr            = generic_removexattr,
+       .listxattr              = xfs_vn_listxattr,
+};
+
+static const struct inode_operations xfs_symlink_inode_operations = {
+       .readlink               = generic_readlink,
+       .follow_link            = xfs_vn_follow_link,
+       .put_link               = xfs_vn_put_link,
+       .get_acl                = xfs_get_acl,
+       .getattr                = xfs_vn_getattr,
+       .setattr                = xfs_vn_setattr,
+       .setxattr               = generic_setxattr,
+       .getxattr               = generic_getxattr,
+       .removexattr            = generic_removexattr,
+       .listxattr              = xfs_vn_listxattr,
+};
+
+STATIC void
+xfs_diflags_to_iflags(
+       struct inode            *inode,
+       struct xfs_inode        *ip)
+{
+       if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
+               inode->i_flags |= S_IMMUTABLE;
+       else
+               inode->i_flags &= ~S_IMMUTABLE;
+       if (ip->i_d.di_flags & XFS_DIFLAG_APPEND)
+               inode->i_flags |= S_APPEND;
+       else
+               inode->i_flags &= ~S_APPEND;
+       if (ip->i_d.di_flags & XFS_DIFLAG_SYNC)
+               inode->i_flags |= S_SYNC;
+       else
+               inode->i_flags &= ~S_SYNC;
+       if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME)
+               inode->i_flags |= S_NOATIME;
+       else
+               inode->i_flags &= ~S_NOATIME;
+}
+
+/*
+ * Initialize the Linux inode, set up the operation vectors and
+ * unlock the inode.
+ *
+ * When reading existing inodes from disk this is called directly
+ * from xfs_iget, when creating a new inode it is called from
+ * xfs_ialloc after setting up the inode.
+ *
+ * We are always called with an uninitialised linux inode here.
+ * We need to initialise the necessary fields and take a reference
+ * on it.
+ */
+void
+xfs_setup_inode(
+       struct xfs_inode        *ip)
+{
+       struct inode            *inode = &ip->i_vnode;
+
+       inode->i_ino = ip->i_ino;
+       inode->i_state = I_NEW;
+
+       inode_sb_list_add(inode);
+       /* make the inode look hashed for the writeback code */
+       hlist_add_fake(&inode->i_hash);
+
+       inode->i_mode   = ip->i_d.di_mode;
+       inode->i_nlink  = ip->i_d.di_nlink;
+       inode->i_uid    = ip->i_d.di_uid;
+       inode->i_gid    = ip->i_d.di_gid;
+
+       switch (inode->i_mode & S_IFMT) {
+       case S_IFBLK:
+       case S_IFCHR:
+               inode->i_rdev =
+                       MKDEV(sysv_major(ip->i_df.if_u2.if_rdev) & 0x1ff,
+                             sysv_minor(ip->i_df.if_u2.if_rdev));
+               break;
+       default:
+               inode->i_rdev = 0;
+               break;
+       }
+
+       inode->i_generation = ip->i_d.di_gen;
+       i_size_write(inode, ip->i_d.di_size);
+       inode->i_atime.tv_sec   = ip->i_d.di_atime.t_sec;
+       inode->i_atime.tv_nsec  = ip->i_d.di_atime.t_nsec;
+       inode->i_mtime.tv_sec   = ip->i_d.di_mtime.t_sec;
+       inode->i_mtime.tv_nsec  = ip->i_d.di_mtime.t_nsec;
+       inode->i_ctime.tv_sec   = ip->i_d.di_ctime.t_sec;
+       inode->i_ctime.tv_nsec  = ip->i_d.di_ctime.t_nsec;
+       xfs_diflags_to_iflags(inode, ip);
+
+       switch (inode->i_mode & S_IFMT) {
+       case S_IFREG:
+               inode->i_op = &xfs_inode_operations;
+               inode->i_fop = &xfs_file_operations;
+               inode->i_mapping->a_ops = &xfs_address_space_operations;
+               break;
+       case S_IFDIR:
+               if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
+                       inode->i_op = &xfs_dir_ci_inode_operations;
+               else
+                       inode->i_op = &xfs_dir_inode_operations;
+               inode->i_fop = &xfs_dir_file_operations;
+               break;
+       case S_IFLNK:
+               inode->i_op = &xfs_symlink_inode_operations;
+               if (!(ip->i_df.if_flags & XFS_IFINLINE))
+                       inode->i_mapping->a_ops = &xfs_address_space_operations;
+               break;
+       default:
+               inode->i_op = &xfs_inode_operations;
+               init_special_inode(inode, inode->i_mode, inode->i_rdev);
+               break;
+       }
+
+       /*
+        * If there is no attribute fork no ACL can exist on this inode,
+        * and it can't have any file capabilities attached to it either.
+        */
+       if (!XFS_IFORK_Q(ip)) {
+               inode_has_no_xattr(inode);
+               cache_no_acl(inode);
+       }
+
+       xfs_iflags_clear(ip, XFS_INEW);
+       barrier();
+
+       unlock_new_inode(inode);
+}
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
new file mode 100644 (file)
index 0000000..ef41c92
--- /dev/null
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_IOPS_H__
+#define __XFS_IOPS_H__
+
+struct xfs_inode;
+
+extern const struct file_operations xfs_file_operations;
+extern const struct file_operations xfs_dir_file_operations;
+
+extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
+
+extern void xfs_setup_inode(struct xfs_inode *);
+
+#endif /* __XFS_IOPS_H__ */
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
new file mode 100644 (file)
index 0000000..1e8a45e
--- /dev/null
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_LINUX__
+#define __XFS_LINUX__
+
+#include <linux/types.h>
+
+/*
+ * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
+ * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
+ */
+#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
+# define XFS_BIG_BLKNOS        1
+# define XFS_BIG_INUMS 1
+#else
+# define XFS_BIG_BLKNOS        0
+# define XFS_BIG_INUMS 0
+#endif
+
+#include "xfs_types.h"
+
+#include "kmem.h"
+#include "mrlock.h"
+#include "time.h"
+#include "uuid.h"
+
+#include <linux/semaphore.h>
+#include <linux/mm.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <linux/swap.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/bitops.h>
+#include <linux/major.h>
+#include <linux/pagemap.h>
+#include <linux/vfs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/proc_fs.h>
+#include <linux/sort.h>
+#include <linux/cpu.h>
+#include <linux/notifier.h>
+#include <linux/delay.h>
+#include <linux/log2.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/ctype.h>
+#include <linux/writeback.h>
+#include <linux/capability.h>
+#include <linux/list_sort.h>
+
+#include <asm/page.h>
+#include <asm/div64.h>
+#include <asm/param.h>
+#include <asm/uaccess.h>
+#include <asm/byteorder.h>
+#include <asm/unaligned.h>
+
+#include "xfs_vnode.h"
+#include "xfs_stats.h"
+#include "xfs_sysctl.h"
+#include "xfs_iops.h"
+#include "xfs_aops.h"
+#include "xfs_super.h"
+#include "xfs_buf.h"
+#include "xfs_message.h"
+
+#ifdef __BIG_ENDIAN
+#define XFS_NATIVE_HOST 1
+#else
+#undef XFS_NATIVE_HOST
+#endif
+
+/*
+ * Feature macros (disable/enable)
+ */
+#ifdef CONFIG_SMP
+#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
+#else
+#undef  HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
+#endif
+
+#define irix_sgid_inherit      xfs_params.sgid_inherit.val
+#define irix_symlink_mode      xfs_params.symlink_mode.val
+#define xfs_panic_mask         xfs_params.panic_mask.val
+#define xfs_error_level                xfs_params.error_level.val
+#define xfs_syncd_centisecs    xfs_params.syncd_timer.val
+#define xfs_stats_clear                xfs_params.stats_clear.val
+#define xfs_inherit_sync       xfs_params.inherit_sync.val
+#define xfs_inherit_nodump     xfs_params.inherit_nodump.val
+#define xfs_inherit_noatime    xfs_params.inherit_noatim.val
+#define xfs_buf_timer_centisecs        xfs_params.xfs_buf_timer.val
+#define xfs_buf_age_centisecs  xfs_params.xfs_buf_age.val
+#define xfs_inherit_nosymlinks xfs_params.inherit_nosym.val
+#define xfs_rotorstep          xfs_params.rotorstep.val
+#define xfs_inherit_nodefrag   xfs_params.inherit_nodfrg.val
+#define xfs_fstrm_centisecs    xfs_params.fstrm_timer.val
+
+#define current_cpu()          (raw_smp_processor_id())
+#define current_pid()          (current->pid)
+#define current_test_flags(f)  (current->flags & (f))
+#define current_set_flags_nested(sp, f)                \
+               (*(sp) = current->flags, current->flags |= (f))
+#define current_clear_flags_nested(sp, f)      \
+               (*(sp) = current->flags, current->flags &= ~(f))
+#define current_restore_flags_nested(sp, f)    \
+               (current->flags = ((current->flags & ~(f)) | (*(sp) & (f))))
+
+#define spinlock_destroy(lock)
+
+#define NBBY           8               /* number of bits per byte */
+
+/*
+ * Size of block device i/o is parameterized here.
+ * Currently the system supports page-sized i/o.
+ */
+#define        BLKDEV_IOSHIFT          PAGE_CACHE_SHIFT
+#define        BLKDEV_IOSIZE           (1<<BLKDEV_IOSHIFT)
+/* number of BB's per block device block */
+#define        BLKDEV_BB               BTOBB(BLKDEV_IOSIZE)
+
+#define ENOATTR                ENODATA         /* Attribute not found */
+#define EWRONGFS       EINVAL          /* Mount with wrong filesystem type */
+#define EFSCORRUPTED   EUCLEAN         /* Filesystem is corrupted */
+
+#define SYNCHRONIZE()  barrier()
+#define __return_address __builtin_return_address(0)
+
+#define XFS_PROJID_DEFAULT     0
+#define MAXPATHLEN     1024
+
+#define MIN(a,b)       (min(a,b))
+#define MAX(a,b)       (max(a,b))
+#define howmany(x, y)  (((x)+((y)-1))/(y))
+
+/*
+ * Various platform dependent calls that don't fit anywhere else
+ */
+#define xfs_sort(a,n,s,fn)     sort(a,n,s,fn,NULL)
+#define xfs_stack_trace()      dump_stack()
+
+
+/* Move the kernel do_div definition off to one side */
+
+#if defined __i386__
+/* For ia32 we need to pull some tricks to get past various versions
+ * of the compiler which do not like us using do_div in the middle
+ * of large functions.
+ */
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+       __u32   mod;
+
+       switch (n) {
+               case 4:
+                       mod = *(__u32 *)a % b;
+                       *(__u32 *)a = *(__u32 *)a / b;
+                       return mod;
+               case 8:
+                       {
+                       unsigned long __upper, __low, __high, __mod;
+                       __u64   c = *(__u64 *)a;
+                       __upper = __high = c >> 32;
+                       __low = c;
+                       if (__high) {
+                               __upper = __high % (b);
+                               __high = __high / (b);
+                       }
+                       asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+                       asm("":"=A" (c):"a" (__low),"d" (__high));
+                       *(__u64 *)a = c;
+                       return __mod;
+                       }
+       }
+
+       /* NOTREACHED */
+       return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+       switch (n) {
+               case 4:
+                       return *(__u32 *)a % b;
+               case 8:
+                       {
+                       unsigned long __upper, __low, __high, __mod;
+                       __u64   c = *(__u64 *)a;
+                       __upper = __high = c >> 32;
+                       __low = c;
+                       if (__high) {
+                               __upper = __high % (b);
+                               __high = __high / (b);
+                       }
+                       asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
+                       asm("":"=A" (c):"a" (__low),"d" (__high));
+                       return __mod;
+                       }
+       }
+
+       /* NOTREACHED */
+       return 0;
+}
+#else
+static inline __u32 xfs_do_div(void *a, __u32 b, int n)
+{
+       __u32   mod;
+
+       switch (n) {
+               case 4:
+                       mod = *(__u32 *)a % b;
+                       *(__u32 *)a = *(__u32 *)a / b;
+                       return mod;
+               case 8:
+                       mod = do_div(*(__u64 *)a, b);
+                       return mod;
+       }
+
+       /* NOTREACHED */
+       return 0;
+}
+
+/* Side effect free 64 bit mod operation */
+static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
+{
+       switch (n) {
+               case 4:
+                       return *(__u32 *)a % b;
+               case 8:
+                       {
+                       __u64   c = *(__u64 *)a;
+                       return do_div(c, b);
+                       }
+       }
+
+       /* NOTREACHED */
+       return 0;
+}
+#endif
+
+#undef do_div
+#define do_div(a, b)   xfs_do_div(&(a), (b), sizeof(a))
+#define do_mod(a, b)   xfs_do_mod(&(a), (b), sizeof(a))
+
+static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
+{
+       x += y - 1;
+       do_div(x, y);
+       return(x * y);
+}
+
+static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y)
+{
+       x += y - 1;
+       do_div(x, y);
+       return x;
+}
+
+/* ARM old ABI has some weird alignment/padding */
+#if defined(__arm__) && !defined(__ARM_EABI__)
+#define __arch_pack __attribute__((packed))
+#else
+#define __arch_pack
+#endif
+
+#define ASSERT_ALWAYS(expr)    \
+       (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+#ifndef DEBUG
+#define ASSERT(expr)   ((void)0)
+
+#ifndef STATIC
+# define STATIC static noinline
+#endif
+
+#else /* DEBUG */
+
+#define ASSERT(expr)   \
+       (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+
+#ifndef STATIC
+# define STATIC noinline
+#endif
+
+#endif /* DEBUG */
+
+#endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
new file mode 100644 (file)
index 0000000..bd672de
--- /dev/null
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2011 Red Hat, Inc.  All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+
+/*
+ * XFS logging functions
+ */
+static void
+__xfs_printk(
+       const char              *level,
+       const struct xfs_mount  *mp,
+       struct va_format        *vaf)
+{
+       if (mp && mp->m_fsname) {
+               printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf);
+               return;
+       }
+       printk("%sXFS: %pV\n", level, vaf);
+}
+
+#define define_xfs_printk_level(func, kern_level)              \
+void func(const struct xfs_mount *mp, const char *fmt, ...)    \
+{                                                              \
+       struct va_format        vaf;                            \
+       va_list                 args;                           \
+                                                               \
+       va_start(args, fmt);                                    \
+                                                               \
+       vaf.fmt = fmt;                                          \
+       vaf.va = &args;                                         \
+                                                               \
+       __xfs_printk(kern_level, mp, &vaf);                     \
+       va_end(args);                                           \
+}                                                              \
+
+define_xfs_printk_level(xfs_emerg, KERN_EMERG);
+define_xfs_printk_level(xfs_alert, KERN_ALERT);
+define_xfs_printk_level(xfs_crit, KERN_CRIT);
+define_xfs_printk_level(xfs_err, KERN_ERR);
+define_xfs_printk_level(xfs_warn, KERN_WARNING);
+define_xfs_printk_level(xfs_notice, KERN_NOTICE);
+define_xfs_printk_level(xfs_info, KERN_INFO);
+#ifdef DEBUG
+define_xfs_printk_level(xfs_debug, KERN_DEBUG);
+#endif
+
+void
+xfs_alert_tag(
+       const struct xfs_mount  *mp,
+       int                     panic_tag,
+       const char              *fmt, ...)
+{
+       struct va_format        vaf;
+       va_list                 args;
+       int                     do_panic = 0;
+
+       if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) {
+               xfs_alert(mp, "Transforming an alert into a BUG.");
+               do_panic = 1;
+       }
+
+       va_start(args, fmt);
+
+       vaf.fmt = fmt;
+       vaf.va = &args;
+
+       __xfs_printk(KERN_ALERT, mp, &vaf);
+       va_end(args);
+
+       BUG_ON(do_panic);
+}
+
+void
+assfail(char *expr, char *file, int line)
+{
+       xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d",
+               expr, file, line);
+       BUG();
+}
+
+void
+xfs_hex_dump(void *p, int length)
+{
+       print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
+}
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
new file mode 100644 (file)
index 0000000..7fb7ea0
--- /dev/null
@@ -0,0 +1,39 @@
+#ifndef __XFS_MESSAGE_H
+#define __XFS_MESSAGE_H 1
+
+struct xfs_mount;
+
+extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern void xfs_alert_tag(const struct xfs_mount *mp, int tag,
+                        const char *fmt, ...)
+        __attribute__ ((format (printf, 3, 4)));
+extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+
+#ifdef DEBUG
+extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+        __attribute__ ((format (printf, 2, 3)));
+#else
+static inline void
+__attribute__ ((format (printf, 2, 3)))
+xfs_debug(const struct xfs_mount *mp, const char *fmt, ...)
+{
+}
+#endif
+
+extern void assfail(char *expr, char *f, int l);
+
+extern void xfs_hex_dump(void *p, int length);
+
+#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
new file mode 100644 (file)
index 0000000..9a0aa76
--- /dev/null
@@ -0,0 +1,2416 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_space.h"
+#include "xfs_utils.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+
+/*
+ * The global quota manager. There is only one of these for the entire
+ * system, _not_ one per file system. XQM keeps track of the overall
+ * quota functionality, including maintaining the freelist and hash
+ * tables of dquots.
+ */
+struct mutex   xfs_Gqm_lock;
+struct xfs_qm  *xfs_Gqm;
+uint           ndquot;
+
+kmem_zone_t    *qm_dqzone;
+kmem_zone_t    *qm_dqtrxzone;
+
+STATIC void    xfs_qm_list_init(xfs_dqlist_t *, char *, int);
+STATIC void    xfs_qm_list_destroy(xfs_dqlist_t *);
+
+STATIC int     xfs_qm_init_quotainos(xfs_mount_t *);
+STATIC int     xfs_qm_init_quotainfo(xfs_mount_t *);
+STATIC int     xfs_qm_shake(struct shrinker *, struct shrink_control *);
+
+static struct shrinker xfs_qm_shaker = {
+       .shrink = xfs_qm_shake,
+       .seeks = DEFAULT_SEEKS,
+};
+
+/*
+ * Initialize the XQM structure.
+ * Note that there is not one quota manager per file system.
+ */
+STATIC struct xfs_qm *
+xfs_Gqm_init(void)
+{
+       xfs_dqhash_t    *udqhash, *gdqhash;
+       xfs_qm_t        *xqm;
+       size_t          hsize;
+       uint            i;
+
+       /*
+        * Initialize the dquot hash tables.
+        */
+       udqhash = kmem_zalloc_greedy(&hsize,
+                                    XFS_QM_HASHSIZE_LOW * sizeof(xfs_dqhash_t),
+                                    XFS_QM_HASHSIZE_HIGH * sizeof(xfs_dqhash_t));
+       if (!udqhash)
+               goto out;
+
+       gdqhash = kmem_zalloc_large(hsize);
+       if (!gdqhash)
+               goto out_free_udqhash;
+
+       hsize /= sizeof(xfs_dqhash_t);
+       ndquot = hsize << 8;
+
+       xqm = kmem_zalloc(sizeof(xfs_qm_t), KM_SLEEP);
+       xqm->qm_dqhashmask = hsize - 1;
+       xqm->qm_usr_dqhtable = udqhash;
+       xqm->qm_grp_dqhtable = gdqhash;
+       ASSERT(xqm->qm_usr_dqhtable != NULL);
+       ASSERT(xqm->qm_grp_dqhtable != NULL);
+
+       for (i = 0; i < hsize; i++) {
+               xfs_qm_list_init(&(xqm->qm_usr_dqhtable[i]), "uxdqh", i);
+               xfs_qm_list_init(&(xqm->qm_grp_dqhtable[i]), "gxdqh", i);
+       }
+
+       /*
+        * Freelist of all dquots of all file systems
+        */
+       INIT_LIST_HEAD(&xqm->qm_dqfrlist);
+       xqm->qm_dqfrlist_cnt = 0;
+       mutex_init(&xqm->qm_dqfrlist_lock);
+
+       /*
+        * dquot zone. we register our own low-memory callback.
+        */
+       if (!qm_dqzone) {
+               xqm->qm_dqzone = kmem_zone_init(sizeof(xfs_dquot_t),
+                                               "xfs_dquots");
+               qm_dqzone = xqm->qm_dqzone;
+       } else
+               xqm->qm_dqzone = qm_dqzone;
+
+       register_shrinker(&xfs_qm_shaker);
+
+       /*
+        * The t_dqinfo portion of transactions.
+        */
+       if (!qm_dqtrxzone) {
+               xqm->qm_dqtrxzone = kmem_zone_init(sizeof(xfs_dquot_acct_t),
+                                                  "xfs_dqtrx");
+               qm_dqtrxzone = xqm->qm_dqtrxzone;
+       } else
+               xqm->qm_dqtrxzone = qm_dqtrxzone;
+
+       atomic_set(&xqm->qm_totaldquots, 0);
+       xqm->qm_dqfree_ratio = XFS_QM_DQFREE_RATIO;
+       xqm->qm_nrefs = 0;
+       return xqm;
+
+ out_free_udqhash:
+       kmem_free_large(udqhash);
+ out:
+       return NULL;
+}
+
+/*
+ * Destroy the global quota manager when its reference count goes to zero.
+ */
+STATIC void
+xfs_qm_destroy(
+       struct xfs_qm   *xqm)
+{
+       struct xfs_dquot *dqp, *n;
+       int             hsize, i;
+
+       ASSERT(xqm != NULL);
+       ASSERT(xqm->qm_nrefs == 0);
+       unregister_shrinker(&xfs_qm_shaker);
+       hsize = xqm->qm_dqhashmask + 1;
+       for (i = 0; i < hsize; i++) {
+               xfs_qm_list_destroy(&(xqm->qm_usr_dqhtable[i]));
+               xfs_qm_list_destroy(&(xqm->qm_grp_dqhtable[i]));
+       }
+       kmem_free_large(xqm->qm_usr_dqhtable);
+       kmem_free_large(xqm->qm_grp_dqhtable);
+       xqm->qm_usr_dqhtable = NULL;
+       xqm->qm_grp_dqhtable = NULL;
+       xqm->qm_dqhashmask = 0;
+
+       /* frlist cleanup */
+       mutex_lock(&xqm->qm_dqfrlist_lock);
+       list_for_each_entry_safe(dqp, n, &xqm->qm_dqfrlist, q_freelist) {
+               xfs_dqlock(dqp);
+               list_del_init(&dqp->q_freelist);
+               xfs_Gqm->qm_dqfrlist_cnt--;
+               xfs_dqunlock(dqp);
+               xfs_qm_dqdestroy(dqp);
+       }
+       mutex_unlock(&xqm->qm_dqfrlist_lock);
+       mutex_destroy(&xqm->qm_dqfrlist_lock);
+       kmem_free(xqm);
+}
+
+/*
+ * Called at mount time to let XQM know that another file system is
+ * starting quotas. This isn't crucial information as the individual mount
+ * structures are pretty independent, but it helps the XQM keep a
+ * global view of what's going on.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_hold_quotafs_ref(
+       struct xfs_mount *mp)
+{
+       /*
+        * Need to lock the xfs_Gqm structure for things like this. For example,
+        * the structure could disappear between the entry to this routine and
+        * a HOLD operation if not locked.
+        */
+       mutex_lock(&xfs_Gqm_lock);
+
+       if (!xfs_Gqm) {
+               xfs_Gqm = xfs_Gqm_init();
+               if (!xfs_Gqm) {
+                       mutex_unlock(&xfs_Gqm_lock);
+                       return ENOMEM;
+               }
+       }
+
+       /*
+        * We can keep a list of all filesystems with quotas mounted for
+        * debugging and statistical purposes, but ...
+        * Just take a reference and get out.
+        */
+       xfs_Gqm->qm_nrefs++;
+       mutex_unlock(&xfs_Gqm_lock);
+
+       return 0;
+}
+
+
+/*
+ * Release the reference that a filesystem took at mount time,
+ * so that we know when we need to destroy the entire quota manager.
+ */
+/* ARGSUSED */
+STATIC void
+xfs_qm_rele_quotafs_ref(
+       struct xfs_mount *mp)
+{
+       xfs_dquot_t     *dqp, *n;
+
+       ASSERT(xfs_Gqm);
+       ASSERT(xfs_Gqm->qm_nrefs > 0);
+
+       /*
+        * Go thru the freelist and destroy all inactive dquots.
+        */
+       mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+
+       list_for_each_entry_safe(dqp, n, &xfs_Gqm->qm_dqfrlist, q_freelist) {
+               xfs_dqlock(dqp);
+               if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+                       ASSERT(dqp->q_mount == NULL);
+                       ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+                       ASSERT(list_empty(&dqp->q_hashlist));
+                       ASSERT(list_empty(&dqp->q_mplist));
+                       list_del_init(&dqp->q_freelist);
+                       xfs_Gqm->qm_dqfrlist_cnt--;
+                       xfs_dqunlock(dqp);
+                       xfs_qm_dqdestroy(dqp);
+               } else {
+                       xfs_dqunlock(dqp);
+               }
+       }
+       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+
+       /*
+        * Destroy the entire XQM. If somebody mounts with quotaon, this'll
+        * be restarted.
+        */
+       mutex_lock(&xfs_Gqm_lock);
+       if (--xfs_Gqm->qm_nrefs == 0) {
+               xfs_qm_destroy(xfs_Gqm);
+               xfs_Gqm = NULL;
+       }
+       mutex_unlock(&xfs_Gqm_lock);
+}
+
+/*
+ * Just destroy the quotainfo structure.
+ */
+void
+xfs_qm_unmount(
+       struct xfs_mount        *mp)
+{
+       if (mp->m_quotainfo) {
+               xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+               xfs_qm_destroy_quotainfo(mp);
+       }
+}
+
+
+/*
+ * This is called from xfs_mountfs to start quotas and initialize all
+ * necessary data structures like quotainfo.  This is also responsible for
+ * running a quotacheck as necessary.  We are guaranteed that the superblock
+ * is consistently read in at this point.
+ *
+ * If we fail here, the mount will continue with quota turned off. We don't
+ * need to inidicate success or failure at all.
+ */
+void
+xfs_qm_mount_quotas(
+       xfs_mount_t     *mp)
+{
+       int             error = 0;
+       uint            sbf;
+
+       /*
+        * If quotas on realtime volumes is not supported, we disable
+        * quotas immediately.
+        */
+       if (mp->m_sb.sb_rextents) {
+               xfs_notice(mp, "Cannot turn on quotas for realtime filesystem");
+               mp->m_qflags = 0;
+               goto write_changes;
+       }
+
+       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+       /*
+        * Allocate the quotainfo structure inside the mount struct, and
+        * create quotainode(s), and change/rev superblock if necessary.
+        */
+       error = xfs_qm_init_quotainfo(mp);
+       if (error) {
+               /*
+                * We must turn off quotas.
+                */
+               ASSERT(mp->m_quotainfo == NULL);
+               mp->m_qflags = 0;
+               goto write_changes;
+       }
+       /*
+        * If any of the quotas are not consistent, do a quotacheck.
+        */
+       if (XFS_QM_NEED_QUOTACHECK(mp)) {
+               error = xfs_qm_quotacheck(mp);
+               if (error) {
+                       /* Quotacheck failed and disabled quotas. */
+                       return;
+               }
+       }
+       /* 
+        * If one type of quotas is off, then it will lose its
+        * quotachecked status, since we won't be doing accounting for
+        * that type anymore.
+        */
+       if (!XFS_IS_UQUOTA_ON(mp))
+               mp->m_qflags &= ~XFS_UQUOTA_CHKD;
+       if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
+               mp->m_qflags &= ~XFS_OQUOTA_CHKD;
+
+ write_changes:
+       /*
+        * We actually don't have to acquire the m_sb_lock at all.
+        * This can only be called from mount, and that's single threaded. XXX
+        */
+       spin_lock(&mp->m_sb_lock);
+       sbf = mp->m_sb.sb_qflags;
+       mp->m_sb.sb_qflags = mp->m_qflags & XFS_MOUNT_QUOTA_ALL;
+       spin_unlock(&mp->m_sb_lock);
+
+       if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
+               if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) {
+                       /*
+                        * We could only have been turning quotas off.
+                        * We aren't in very good shape actually because
+                        * the incore structures are convinced that quotas are
+                        * off, but the on disk superblock doesn't know that !
+                        */
+                       ASSERT(!(XFS_IS_QUOTA_RUNNING(mp)));
+                       xfs_alert(mp, "%s: Superblock update failed!",
+                               __func__);
+               }
+       }
+
+       if (error) {
+               xfs_warn(mp, "Failed to initialize disk quotas.");
+               return;
+       }
+}
+
+/*
+ * Called from the vfsops layer.
+ */
+void
+xfs_qm_unmount_quotas(
+       xfs_mount_t     *mp)
+{
+       /*
+        * Release the dquots that root inode, et al might be holding,
+        * before we flush quotas and blow away the quotainfo structure.
+        */
+       ASSERT(mp->m_rootip);
+       xfs_qm_dqdetach(mp->m_rootip);
+       if (mp->m_rbmip)
+               xfs_qm_dqdetach(mp->m_rbmip);
+       if (mp->m_rsumip)
+               xfs_qm_dqdetach(mp->m_rsumip);
+
+       /*
+        * Release the quota inodes.
+        */
+       if (mp->m_quotainfo) {
+               if (mp->m_quotainfo->qi_uquotaip) {
+                       IRELE(mp->m_quotainfo->qi_uquotaip);
+                       mp->m_quotainfo->qi_uquotaip = NULL;
+               }
+               if (mp->m_quotainfo->qi_gquotaip) {
+                       IRELE(mp->m_quotainfo->qi_gquotaip);
+                       mp->m_quotainfo->qi_gquotaip = NULL;
+               }
+       }
+}
+
+/*
+ * Flush all dquots of the given file system to disk. The dquots are
+ * _not_ purged from memory here, just their data written to disk.
+ */
+STATIC int
+xfs_qm_dqflush_all(
+       struct xfs_mount        *mp,
+       int                     sync_mode)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       int                     recl;
+       struct xfs_dquot        *dqp;
+       int                     error;
+
+       if (!q)
+               return 0;
+again:
+       mutex_lock(&q->qi_dqlist_lock);
+       list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
+               xfs_dqlock(dqp);
+               if (! XFS_DQ_IS_DIRTY(dqp)) {
+                       xfs_dqunlock(dqp);
+                       continue;
+               }
+
+               /* XXX a sentinel would be better */
+               recl = q->qi_dqreclaims;
+               if (!xfs_dqflock_nowait(dqp)) {
+                       /*
+                        * If we can't grab the flush lock then check
+                        * to see if the dquot has been flushed delayed
+                        * write.  If so, grab its buffer and send it
+                        * out immediately.  We'll be able to acquire
+                        * the flush lock when the I/O completes.
+                        */
+                       xfs_qm_dqflock_pushbuf_wait(dqp);
+               }
+               /*
+                * Let go of the mplist lock. We don't want to hold it
+                * across a disk write.
+                */
+               mutex_unlock(&q->qi_dqlist_lock);
+               error = xfs_qm_dqflush(dqp, sync_mode);
+               xfs_dqunlock(dqp);
+               if (error)
+                       return error;
+
+               mutex_lock(&q->qi_dqlist_lock);
+               if (recl != q->qi_dqreclaims) {
+                       mutex_unlock(&q->qi_dqlist_lock);
+                       /* XXX restart limit */
+                       goto again;
+               }
+       }
+
+       mutex_unlock(&q->qi_dqlist_lock);
+       /* return ! busy */
+       return 0;
+}
+/*
+ * Release the group dquot pointers the user dquots may be
+ * carrying around as a hint. mplist is locked on entry and exit.
+ */
+STATIC void
+xfs_qm_detach_gdquots(
+       struct xfs_mount        *mp)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       struct xfs_dquot        *dqp, *gdqp;
+       int                     nrecl;
+
+ again:
+       ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+       list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
+               xfs_dqlock(dqp);
+               if ((gdqp = dqp->q_gdquot)) {
+                       xfs_dqlock(gdqp);
+                       dqp->q_gdquot = NULL;
+               }
+               xfs_dqunlock(dqp);
+
+               if (gdqp) {
+                       /*
+                        * Can't hold the mplist lock across a dqput.
+                        * XXXmust convert to marker based iterations here.
+                        */
+                       nrecl = q->qi_dqreclaims;
+                       mutex_unlock(&q->qi_dqlist_lock);
+                       xfs_qm_dqput(gdqp);
+
+                       mutex_lock(&q->qi_dqlist_lock);
+                       if (nrecl != q->qi_dqreclaims)
+                               goto again;
+               }
+       }
+}
+
+/*
+ * Go through all the incore dquots of this file system and take them
+ * off the mplist and hashlist, if the dquot type matches the dqtype
+ * parameter. This is used when turning off quota accounting for
+ * users and/or groups, as well as when the filesystem is unmounting.
+ */
+STATIC int
+xfs_qm_dqpurge_int(
+       struct xfs_mount        *mp,
+       uint                    flags)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       struct xfs_dquot        *dqp, *n;
+       uint                    dqtype;
+       int                     nrecl;
+       int                     nmisses;
+
+       if (!q)
+               return 0;
+
+       dqtype = (flags & XFS_QMOPT_UQUOTA) ? XFS_DQ_USER : 0;
+       dqtype |= (flags & XFS_QMOPT_PQUOTA) ? XFS_DQ_PROJ : 0;
+       dqtype |= (flags & XFS_QMOPT_GQUOTA) ? XFS_DQ_GROUP : 0;
+
+       mutex_lock(&q->qi_dqlist_lock);
+
+       /*
+        * In the first pass through all incore dquots of this filesystem,
+        * we release the group dquot pointers the user dquots may be
+        * carrying around as a hint. We need to do this irrespective of
+        * what's being turned off.
+        */
+       xfs_qm_detach_gdquots(mp);
+
+      again:
+       nmisses = 0;
+       ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+       /*
+        * Try to get rid of all of the unwanted dquots. The idea is to
+        * get them off mplist and hashlist, but leave them on freelist.
+        */
+       list_for_each_entry_safe(dqp, n, &q->qi_dqlist, q_mplist) {
+               /*
+                * It's OK to look at the type without taking dqlock here.
+                * We're holding the mplist lock here, and that's needed for
+                * a dqreclaim.
+                */
+               if ((dqp->dq_flags & dqtype) == 0)
+                       continue;
+
+               if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
+                       nrecl = q->qi_dqreclaims;
+                       mutex_unlock(&q->qi_dqlist_lock);
+                       mutex_lock(&dqp->q_hash->qh_lock);
+                       mutex_lock(&q->qi_dqlist_lock);
+
+                       /*
+                        * XXXTheoretically, we can get into a very long
+                        * ping pong game here.
+                        * No one can be adding dquots to the mplist at
+                        * this point, but somebody might be taking things off.
+                        */
+                       if (nrecl != q->qi_dqreclaims) {
+                               mutex_unlock(&dqp->q_hash->qh_lock);
+                               goto again;
+                       }
+               }
+
+               /*
+                * Take the dquot off the mplist and hashlist. It may remain on
+                * freelist in INACTIVE state.
+                */
+               nmisses += xfs_qm_dqpurge(dqp);
+       }
+       mutex_unlock(&q->qi_dqlist_lock);
+       return nmisses;
+}
+
+int
+xfs_qm_dqpurge_all(
+       xfs_mount_t     *mp,
+       uint            flags)
+{
+       int             ndquots;
+
+       /*
+        * Purge the dquot cache.
+        * None of the dquots should really be busy at this point.
+        */
+       if (mp->m_quotainfo) {
+               while ((ndquots = xfs_qm_dqpurge_int(mp, flags))) {
+                       delay(ndquots * 10);
+               }
+       }
+       return 0;
+}
+
+STATIC int
+xfs_qm_dqattach_one(
+       xfs_inode_t     *ip,
+       xfs_dqid_t      id,
+       uint            type,
+       uint            doalloc,
+       xfs_dquot_t     *udqhint, /* hint */
+       xfs_dquot_t     **IO_idqpp)
+{
+       xfs_dquot_t     *dqp;
+       int             error;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       error = 0;
+
+       /*
+        * See if we already have it in the inode itself. IO_idqpp is
+        * &i_udquot or &i_gdquot. This made the code look weird, but
+        * made the logic a lot simpler.
+        */
+       dqp = *IO_idqpp;
+       if (dqp) {
+               trace_xfs_dqattach_found(dqp);
+               return 0;
+       }
+
+       /*
+        * udqhint is the i_udquot field in inode, and is non-NULL only
+        * when the type arg is group/project. Its purpose is to save a
+        * lookup by dqid (xfs_qm_dqget) by caching a group dquot inside
+        * the user dquot.
+        */
+       if (udqhint) {
+               ASSERT(type == XFS_DQ_GROUP || type == XFS_DQ_PROJ);
+               xfs_dqlock(udqhint);
+
+               /*
+                * No need to take dqlock to look at the id.
+                *
+                * The ID can't change until it gets reclaimed, and it won't
+                * be reclaimed as long as we have a ref from inode and we
+                * hold the ilock.
+                */
+               dqp = udqhint->q_gdquot;
+               if (dqp && be32_to_cpu(dqp->q_core.d_id) == id) {
+                       xfs_dqlock(dqp);
+                       XFS_DQHOLD(dqp);
+                       ASSERT(*IO_idqpp == NULL);
+                       *IO_idqpp = dqp;
+
+                       xfs_dqunlock(dqp);
+                       xfs_dqunlock(udqhint);
+                       return 0;
+               }
+
+               /*
+                * We can't hold a dquot lock when we call the dqget code.
+                * We'll deadlock in no time, because of (not conforming to)
+                * lock ordering - the inodelock comes before any dquot lock,
+                * and we may drop and reacquire the ilock in xfs_qm_dqget().
+                */
+               xfs_dqunlock(udqhint);
+       }
+
+       /*
+        * Find the dquot from somewhere. This bumps the
+        * reference count of dquot and returns it locked.
+        * This can return ENOENT if dquot didn't exist on
+        * disk and we didn't ask it to allocate;
+        * ESRCH if quotas got turned off suddenly.
+        */
+       error = xfs_qm_dqget(ip->i_mount, ip, id, type, XFS_QMOPT_DOWARN, &dqp);
+       if (error)
+               return error;
+
+       trace_xfs_dqattach_get(dqp);
+
+       /*
+        * dqget may have dropped and re-acquired the ilock, but it guarantees
+        * that the dquot returned is the one that should go in the inode.
+        */
+       *IO_idqpp = dqp;
+       xfs_dqunlock(dqp);
+       return 0;
+}
+
+
+/*
+ * Given a udquot and gdquot, attach a ptr to the group dquot in the
+ * udquot as a hint for future lookups. The idea sounds simple, but the
+ * execution isn't, because the udquot might have a group dquot attached
+ * already and getting rid of that gets us into lock ordering constraints.
+ * The process is complicated more by the fact that the dquots may or may not
+ * be locked on entry.
+ */
+STATIC void
+xfs_qm_dqattach_grouphint(
+       xfs_dquot_t     *udq,
+       xfs_dquot_t     *gdq)
+{
+       xfs_dquot_t     *tmp;
+
+       xfs_dqlock(udq);
+
+       if ((tmp = udq->q_gdquot)) {
+               if (tmp == gdq) {
+                       xfs_dqunlock(udq);
+                       return;
+               }
+
+               udq->q_gdquot = NULL;
+               /*
+                * We can't keep any dqlocks when calling dqrele,
+                * because the freelist lock comes before dqlocks.
+                */
+               xfs_dqunlock(udq);
+               /*
+                * we took a hard reference once upon a time in dqget,
+                * so give it back when the udquot no longer points at it
+                * dqput() does the unlocking of the dquot.
+                */
+               xfs_qm_dqrele(tmp);
+
+               xfs_dqlock(udq);
+               xfs_dqlock(gdq);
+
+       } else {
+               ASSERT(XFS_DQ_IS_LOCKED(udq));
+               xfs_dqlock(gdq);
+       }
+
+       ASSERT(XFS_DQ_IS_LOCKED(udq));
+       ASSERT(XFS_DQ_IS_LOCKED(gdq));
+       /*
+        * Somebody could have attached a gdquot here,
+        * when we dropped the uqlock. If so, just do nothing.
+        */
+       if (udq->q_gdquot == NULL) {
+               XFS_DQHOLD(gdq);
+               udq->q_gdquot = gdq;
+       }
+
+       xfs_dqunlock(gdq);
+       xfs_dqunlock(udq);
+}
+
+
+/*
+ * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON
+ * into account.
+ * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed.
+ * Inode may get unlocked and relocked in here, and the caller must deal with
+ * the consequences.
+ */
+int
+xfs_qm_dqattach_locked(
+       xfs_inode_t     *ip,
+       uint            flags)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       uint            nquotas = 0;
+       int             error = 0;
+
+       if (!XFS_IS_QUOTA_RUNNING(mp) ||
+           !XFS_IS_QUOTA_ON(mp) ||
+           !XFS_NOT_DQATTACHED(mp, ip) ||
+           ip->i_ino == mp->m_sb.sb_uquotino ||
+           ip->i_ino == mp->m_sb.sb_gquotino)
+               return 0;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+       if (XFS_IS_UQUOTA_ON(mp)) {
+               error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
+                                               flags & XFS_QMOPT_DQALLOC,
+                                               NULL, &ip->i_udquot);
+               if (error)
+                       goto done;
+               nquotas++;
+       }
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       if (XFS_IS_OQUOTA_ON(mp)) {
+               error = XFS_IS_GQUOTA_ON(mp) ?
+                       xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
+                                               flags & XFS_QMOPT_DQALLOC,
+                                               ip->i_udquot, &ip->i_gdquot) :
+                       xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+                                               flags & XFS_QMOPT_DQALLOC,
+                                               ip->i_udquot, &ip->i_gdquot);
+               /*
+                * Don't worry about the udquot that we may have
+                * attached above. It'll get detached, if not already.
+                */
+               if (error)
+                       goto done;
+               nquotas++;
+       }
+
+       /*
+        * Attach this group quota to the user quota as a hint.
+        * This WON'T, in general, result in a thrash.
+        */
+       if (nquotas == 2) {
+               ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+               ASSERT(ip->i_udquot);
+               ASSERT(ip->i_gdquot);
+
+               /*
+                * We may or may not have the i_udquot locked at this point,
+                * but this check is OK since we don't depend on the i_gdquot to
+                * be accurate 100% all the time. It is just a hint, and this
+                * will succeed in general.
+                */
+               if (ip->i_udquot->q_gdquot == ip->i_gdquot)
+                       goto done;
+               /*
+                * Attach i_gdquot to the gdquot hint inside the i_udquot.
+                */
+               xfs_qm_dqattach_grouphint(ip->i_udquot, ip->i_gdquot);
+       }
+
+ done:
+#ifdef DEBUG
+       if (!error) {
+               if (XFS_IS_UQUOTA_ON(mp))
+                       ASSERT(ip->i_udquot);
+               if (XFS_IS_OQUOTA_ON(mp))
+                       ASSERT(ip->i_gdquot);
+       }
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+#endif
+       return error;
+}
+
+int
+xfs_qm_dqattach(
+       struct xfs_inode        *ip,
+       uint                    flags)
+{
+       int                     error;
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       error = xfs_qm_dqattach_locked(ip, flags);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       return error;
+}
+
+/*
+ * Release dquots (and their references) if any.
+ * The inode should be locked EXCL except when this's called by
+ * xfs_ireclaim.
+ */
+void
+xfs_qm_dqdetach(
+       xfs_inode_t     *ip)
+{
+       if (!(ip->i_udquot || ip->i_gdquot))
+               return;
+
+       trace_xfs_dquot_dqdetach(ip);
+
+       ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_uquotino);
+       ASSERT(ip->i_ino != ip->i_mount->m_sb.sb_gquotino);
+       if (ip->i_udquot) {
+               xfs_qm_dqrele(ip->i_udquot);
+               ip->i_udquot = NULL;
+       }
+       if (ip->i_gdquot) {
+               xfs_qm_dqrele(ip->i_gdquot);
+               ip->i_gdquot = NULL;
+       }
+}
+
+int
+xfs_qm_sync(
+       struct xfs_mount        *mp,
+       int                     flags)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       int                     recl, restarts;
+       struct xfs_dquot        *dqp;
+       int                     error;
+
+       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+               return 0;
+
+       restarts = 0;
+
+  again:
+       mutex_lock(&q->qi_dqlist_lock);
+       /*
+        * dqpurge_all() also takes the mplist lock and iterate thru all dquots
+        * in quotaoff. However, if the QUOTA_ACTIVE bits are not cleared
+        * when we have the mplist lock, we know that dquots will be consistent
+        * as long as we have it locked.
+        */
+       if (!XFS_IS_QUOTA_ON(mp)) {
+               mutex_unlock(&q->qi_dqlist_lock);
+               return 0;
+       }
+       ASSERT(mutex_is_locked(&q->qi_dqlist_lock));
+       list_for_each_entry(dqp, &q->qi_dqlist, q_mplist) {
+               /*
+                * If this is vfs_sync calling, then skip the dquots that
+                * don't 'seem' to be dirty. ie. don't acquire dqlock.
+                * This is very similar to what xfs_sync does with inodes.
+                */
+               if (flags & SYNC_TRYLOCK) {
+                       if (!XFS_DQ_IS_DIRTY(dqp))
+                               continue;
+                       if (!xfs_qm_dqlock_nowait(dqp))
+                               continue;
+               } else {
+                       xfs_dqlock(dqp);
+               }
+
+               /*
+                * Now, find out for sure if this dquot is dirty or not.
+                */
+               if (! XFS_DQ_IS_DIRTY(dqp)) {
+                       xfs_dqunlock(dqp);
+                       continue;
+               }
+
+               /* XXX a sentinel would be better */
+               recl = q->qi_dqreclaims;
+               if (!xfs_dqflock_nowait(dqp)) {
+                       if (flags & SYNC_TRYLOCK) {
+                               xfs_dqunlock(dqp);
+                               continue;
+                       }
+                       /*
+                        * If we can't grab the flush lock then if the caller
+                        * really wanted us to give this our best shot, so
+                        * see if we can give a push to the buffer before we wait
+                        * on the flush lock. At this point, we know that
+                        * even though the dquot is being flushed,
+                        * it has (new) dirty data.
+                        */
+                       xfs_qm_dqflock_pushbuf_wait(dqp);
+               }
+               /*
+                * Let go of the mplist lock. We don't want to hold it
+                * across a disk write
+                */
+               mutex_unlock(&q->qi_dqlist_lock);
+               error = xfs_qm_dqflush(dqp, flags);
+               xfs_dqunlock(dqp);
+               if (error && XFS_FORCED_SHUTDOWN(mp))
+                       return 0;       /* Need to prevent umount failure */
+               else if (error)
+                       return error;
+
+               mutex_lock(&q->qi_dqlist_lock);
+               if (recl != q->qi_dqreclaims) {
+                       if (++restarts >= XFS_QM_SYNC_MAX_RESTARTS)
+                               break;
+
+                       mutex_unlock(&q->qi_dqlist_lock);
+                       goto again;
+               }
+       }
+
+       mutex_unlock(&q->qi_dqlist_lock);
+       return 0;
+}
+
+/*
+ * The hash chains and the mplist use the same xfs_dqhash structure as
+ * their list head, but we can take the mplist qh_lock and one of the
+ * hash qh_locks at the same time without any problem as they aren't
+ * related.
+ */
+static struct lock_class_key xfs_quota_mplist_class;
+
+/*
+ * This initializes all the quota information that's kept in the
+ * mount structure
+ */
+STATIC int
+xfs_qm_init_quotainfo(
+       xfs_mount_t     *mp)
+{
+       xfs_quotainfo_t *qinf;
+       int             error;
+       xfs_dquot_t     *dqp;
+
+       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+       /*
+        * Tell XQM that we exist as soon as possible.
+        */
+       if ((error = xfs_qm_hold_quotafs_ref(mp))) {
+               return error;
+       }
+
+       qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
+
+       /*
+        * See if quotainodes are setup, and if not, allocate them,
+        * and change the superblock accordingly.
+        */
+       if ((error = xfs_qm_init_quotainos(mp))) {
+               kmem_free(qinf);
+               mp->m_quotainfo = NULL;
+               return error;
+       }
+
+       INIT_LIST_HEAD(&qinf->qi_dqlist);
+       mutex_init(&qinf->qi_dqlist_lock);
+       lockdep_set_class(&qinf->qi_dqlist_lock, &xfs_quota_mplist_class);
+
+       qinf->qi_dqreclaims = 0;
+
+       /* mutex used to serialize quotaoffs */
+       mutex_init(&qinf->qi_quotaofflock);
+
+       /* Precalc some constants */
+       qinf->qi_dqchunklen = XFS_FSB_TO_BB(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+       ASSERT(qinf->qi_dqchunklen);
+       qinf->qi_dqperchunk = BBTOB(qinf->qi_dqchunklen);
+       do_div(qinf->qi_dqperchunk, sizeof(xfs_dqblk_t));
+
+       mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD);
+
+       /*
+        * We try to get the limits from the superuser's limits fields.
+        * This is quite hacky, but it is standard quota practice.
+        * We look at the USR dquot with id == 0 first, but if user quotas
+        * are not enabled we goto the GRP dquot with id == 0.
+        * We don't really care to keep separate default limits for user
+        * and group quotas, at least not at this point.
+        */
+       error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)0,
+                            XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : 
+                            (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP :
+                               XFS_DQ_PROJ),
+                            XFS_QMOPT_DQSUSER|XFS_QMOPT_DOWARN,
+                            &dqp);
+       if (! error) {
+               xfs_disk_dquot_t        *ddqp = &dqp->q_core;
+
+               /*
+                * The warnings and timers set the grace period given to
+                * a user or group before he or she can not perform any
+                * more writing. If it is zero, a default is used.
+                */
+               qinf->qi_btimelimit = ddqp->d_btimer ?
+                       be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT;
+               qinf->qi_itimelimit = ddqp->d_itimer ?
+                       be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT;
+               qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ?
+                       be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT;
+               qinf->qi_bwarnlimit = ddqp->d_bwarns ?
+                       be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT;
+               qinf->qi_iwarnlimit = ddqp->d_iwarns ?
+                       be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT;
+               qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ?
+                       be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT;
+               qinf->qi_bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit);
+               qinf->qi_bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit);
+               qinf->qi_ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit);
+               qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
+               qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
+               qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
+               /*
+                * We sent the XFS_QMOPT_DQSUSER flag to dqget because
+                * we don't want this dquot cached. We haven't done a
+                * quotacheck yet, and quotacheck doesn't like incore dquots.
+                */
+               xfs_qm_dqdestroy(dqp);
+       } else {
+               qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
+               qinf->qi_itimelimit = XFS_QM_ITIMELIMIT;
+               qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT;
+               qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT;
+               qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT;
+               qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
+       }
+
+       return 0;
+}
+
+
+/*
+ * Gets called when unmounting a filesystem or when all quotas get
+ * turned off.
+ * This purges the quota inodes, destroys locks and frees itself.
+ */
+void
+xfs_qm_destroy_quotainfo(
+       xfs_mount_t     *mp)
+{
+       xfs_quotainfo_t *qi;
+
+       qi = mp->m_quotainfo;
+       ASSERT(qi != NULL);
+       ASSERT(xfs_Gqm != NULL);
+
+       /*
+        * Release the reference that XQM kept, so that we know
+        * when the XQM structure should be freed. We cannot assume
+        * that xfs_Gqm is non-null after this point.
+        */
+       xfs_qm_rele_quotafs_ref(mp);
+
+       ASSERT(list_empty(&qi->qi_dqlist));
+       mutex_destroy(&qi->qi_dqlist_lock);
+
+       if (qi->qi_uquotaip) {
+               IRELE(qi->qi_uquotaip);
+               qi->qi_uquotaip = NULL; /* paranoia */
+       }
+       if (qi->qi_gquotaip) {
+               IRELE(qi->qi_gquotaip);
+               qi->qi_gquotaip = NULL;
+       }
+       mutex_destroy(&qi->qi_quotaofflock);
+       kmem_free(qi);
+       mp->m_quotainfo = NULL;
+}
+
+
+
+/* ------------------- PRIVATE STATIC FUNCTIONS ----------------------- */
+
+/* ARGSUSED */
+STATIC void
+xfs_qm_list_init(
+       xfs_dqlist_t    *list,
+       char            *str,
+       int             n)
+{
+       mutex_init(&list->qh_lock);
+       INIT_LIST_HEAD(&list->qh_list);
+       list->qh_version = 0;
+       list->qh_nelems = 0;
+}
+
+STATIC void
+xfs_qm_list_destroy(
+       xfs_dqlist_t    *list)
+{
+       mutex_destroy(&(list->qh_lock));
+}
+
+/*
+ * Create an inode and return with a reference already taken, but unlocked
+ * This is how we create quota inodes
+ */
+STATIC int
+xfs_qm_qino_alloc(
+       xfs_mount_t     *mp,
+       xfs_inode_t     **ip,
+       __int64_t       sbfields,
+       uint            flags)
+{
+       xfs_trans_t     *tp;
+       int             error;
+       int             committed;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QINOCREATE);
+       if ((error = xfs_trans_reserve(tp,
+                                     XFS_QM_QINOCREATE_SPACE_RES(mp),
+                                     XFS_CREATE_LOG_RES(mp), 0,
+                                     XFS_TRANS_PERM_LOG_RES,
+                                     XFS_CREATE_LOG_COUNT))) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+
+       error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed);
+       if (error) {
+               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+                                XFS_TRANS_ABORT);
+               return error;
+       }
+
+       /*
+        * Make the changes in the superblock, and log those too.
+        * sbfields arg may contain fields other than *QUOTINO;
+        * VERSIONNUM for example.
+        */
+       spin_lock(&mp->m_sb_lock);
+       if (flags & XFS_QMOPT_SBVERSION) {
+               ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
+               ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+                                  XFS_SB_GQUOTINO | XFS_SB_QFLAGS)) ==
+                      (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+                       XFS_SB_GQUOTINO | XFS_SB_QFLAGS));
+
+               xfs_sb_version_addquota(&mp->m_sb);
+               mp->m_sb.sb_uquotino = NULLFSINO;
+               mp->m_sb.sb_gquotino = NULLFSINO;
+
+               /* qflags will get updated _after_ quotacheck */
+               mp->m_sb.sb_qflags = 0;
+       }
+       if (flags & XFS_QMOPT_UQUOTA)
+               mp->m_sb.sb_uquotino = (*ip)->i_ino;
+       else
+               mp->m_sb.sb_gquotino = (*ip)->i_ino;
+       spin_unlock(&mp->m_sb_lock);
+       xfs_mod_sb(tp, sbfields);
+
+       if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
+               xfs_alert(mp, "%s failed (error %d)!", __func__, error);
+               return error;
+       }
+       return 0;
+}
+
+
+STATIC void
+xfs_qm_reset_dqcounts(
+       xfs_mount_t     *mp,
+       xfs_buf_t       *bp,
+       xfs_dqid_t      id,
+       uint            type)
+{
+       xfs_disk_dquot_t        *ddq;
+       int                     j;
+
+       trace_xfs_reset_dqcounts(bp, _RET_IP_);
+
+       /*
+        * Reset all counters and timers. They'll be
+        * started afresh by xfs_qm_quotacheck.
+        */
+#ifdef DEBUG
+       j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
+       do_div(j, sizeof(xfs_dqblk_t));
+       ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
+#endif
+       ddq = bp->b_addr;
+       for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) {
+               /*
+                * Do a sanity check, and if needed, repair the dqblk. Don't
+                * output any warnings because it's perfectly possible to
+                * find uninitialised dquot blks. See comment in xfs_qm_dqcheck.
+                */
+               (void) xfs_qm_dqcheck(mp, ddq, id+j, type, XFS_QMOPT_DQREPAIR,
+                                     "xfs_quotacheck");
+               ddq->d_bcount = 0;
+               ddq->d_icount = 0;
+               ddq->d_rtbcount = 0;
+               ddq->d_btimer = 0;
+               ddq->d_itimer = 0;
+               ddq->d_rtbtimer = 0;
+               ddq->d_bwarns = 0;
+               ddq->d_iwarns = 0;
+               ddq->d_rtbwarns = 0;
+               ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
+       }
+}
+
+STATIC int
+xfs_qm_dqiter_bufs(
+       xfs_mount_t     *mp,
+       xfs_dqid_t      firstid,
+       xfs_fsblock_t   bno,
+       xfs_filblks_t   blkcnt,
+       uint            flags)
+{
+       xfs_buf_t       *bp;
+       int             error;
+       int             type;
+
+       ASSERT(blkcnt > 0);
+       type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
+               (flags & XFS_QMOPT_PQUOTA ? XFS_DQ_PROJ : XFS_DQ_GROUP);
+       error = 0;
+
+       /*
+        * Blkcnt arg can be a very big number, and might even be
+        * larger than the log itself. So, we have to break it up into
+        * manageable-sized transactions.
+        * Note that we don't start a permanent transaction here; we might
+        * not be able to get a log reservation for the whole thing up front,
+        * and we don't really care to either, because we just discard
+        * everything if we were to crash in the middle of this loop.
+        */
+       while (blkcnt--) {
+               error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+                             XFS_FSB_TO_DADDR(mp, bno),
+                             mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+               if (error)
+                       break;
+
+               xfs_qm_reset_dqcounts(mp, bp, firstid, type);
+               xfs_bdwrite(mp, bp);
+               /*
+                * goto the next block.
+                */
+               bno++;
+               firstid += mp->m_quotainfo->qi_dqperchunk;
+       }
+       return error;
+}
+
+/*
+ * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a
+ * caller supplied function for every chunk of dquots that we find.
+ */
+STATIC int
+xfs_qm_dqiterate(
+       xfs_mount_t     *mp,
+       xfs_inode_t     *qip,
+       uint            flags)
+{
+       xfs_bmbt_irec_t         *map;
+       int                     i, nmaps;       /* number of map entries */
+       int                     error;          /* return value */
+       xfs_fileoff_t           lblkno;
+       xfs_filblks_t           maxlblkcnt;
+       xfs_dqid_t              firstid;
+       xfs_fsblock_t           rablkno;
+       xfs_filblks_t           rablkcnt;
+
+       error = 0;
+       /*
+        * This looks racy, but we can't keep an inode lock across a
+        * trans_reserve. But, this gets called during quotacheck, and that
+        * happens only at mount time which is single threaded.
+        */
+       if (qip->i_d.di_nblocks == 0)
+               return 0;
+
+       map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);
+
+       lblkno = 0;
+       maxlblkcnt = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
+       do {
+               nmaps = XFS_DQITER_MAP_SIZE;
+               /*
+                * We aren't changing the inode itself. Just changing
+                * some of its data. No new blocks are added here, and
+                * the inode is never added to the transaction.
+                */
+               xfs_ilock(qip, XFS_ILOCK_SHARED);
+               error = xfs_bmapi(NULL, qip, lblkno,
+                                 maxlblkcnt - lblkno,
+                                 XFS_BMAPI_METADATA,
+                                 NULL,
+                                 0, map, &nmaps, NULL);
+               xfs_iunlock(qip, XFS_ILOCK_SHARED);
+               if (error)
+                       break;
+
+               ASSERT(nmaps <= XFS_DQITER_MAP_SIZE);
+               for (i = 0; i < nmaps; i++) {
+                       ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+                       ASSERT(map[i].br_blockcount);
+
+
+                       lblkno += map[i].br_blockcount;
+
+                       if (map[i].br_startblock == HOLESTARTBLOCK)
+                               continue;
+
+                       firstid = (xfs_dqid_t) map[i].br_startoff *
+                               mp->m_quotainfo->qi_dqperchunk;
+                       /*
+                        * Do a read-ahead on the next extent.
+                        */
+                       if ((i+1 < nmaps) &&
+                           (map[i+1].br_startblock != HOLESTARTBLOCK)) {
+                               rablkcnt =  map[i+1].br_blockcount;
+                               rablkno = map[i+1].br_startblock;
+                               while (rablkcnt--) {
+                                       xfs_buf_readahead(mp->m_ddev_targp,
+                                              XFS_FSB_TO_DADDR(mp, rablkno),
+                                              mp->m_quotainfo->qi_dqchunklen);
+                                       rablkno++;
+                               }
+                       }
+                       /*
+                        * Iterate thru all the blks in the extent and
+                        * reset the counters of all the dquots inside them.
+                        */
+                       if ((error = xfs_qm_dqiter_bufs(mp,
+                                                      firstid,
+                                                      map[i].br_startblock,
+                                                      map[i].br_blockcount,
+                                                      flags))) {
+                               break;
+                       }
+               }
+
+               if (error)
+                       break;
+       } while (nmaps > 0);
+
+       kmem_free(map);
+
+       return error;
+}
+
+/*
+ * Called by dqusage_adjust in doing a quotacheck.
+ *
+ * Given the inode, and a dquot id this updates both the incore dqout as well
+ * as the buffer copy. This is so that once the quotacheck is done, we can
+ * just log all the buffers, as opposed to logging numerous updates to
+ * individual dquots.
+ */
+STATIC int
+xfs_qm_quotacheck_dqadjust(
+       struct xfs_inode        *ip,
+       xfs_dqid_t              id,
+       uint                    type,
+       xfs_qcnt_t              nblks,
+       xfs_qcnt_t              rtblks)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_dquot        *dqp;
+       int                     error;
+
+       error = xfs_qm_dqget(mp, ip, id, type,
+                            XFS_QMOPT_DQALLOC | XFS_QMOPT_DOWARN, &dqp);
+       if (error) {
+               /*
+                * Shouldn't be able to turn off quotas here.
+                */
+               ASSERT(error != ESRCH);
+               ASSERT(error != ENOENT);
+               return error;
+       }
+
+       trace_xfs_dqadjust(dqp);
+
+       /*
+        * Adjust the inode count and the block count to reflect this inode's
+        * resource usage.
+        */
+       be64_add_cpu(&dqp->q_core.d_icount, 1);
+       dqp->q_res_icount++;
+       if (nblks) {
+               be64_add_cpu(&dqp->q_core.d_bcount, nblks);
+               dqp->q_res_bcount += nblks;
+       }
+       if (rtblks) {
+               be64_add_cpu(&dqp->q_core.d_rtbcount, rtblks);
+               dqp->q_res_rtbcount += rtblks;
+       }
+
+       /*
+        * Set default limits, adjust timers (since we changed usages)
+        *
+        * There are no timers for the default values set in the root dquot.
+        */
+       if (dqp->q_core.d_id) {
+               xfs_qm_adjust_dqlimits(mp, &dqp->q_core);
+               xfs_qm_adjust_dqtimers(mp, &dqp->q_core);
+       }
+
+       dqp->dq_flags |= XFS_DQ_DIRTY;
+       xfs_qm_dqput(dqp);
+       return 0;
+}
+
+STATIC int
+xfs_qm_get_rtblks(
+       xfs_inode_t     *ip,
+       xfs_qcnt_t      *O_rtblks)
+{
+       xfs_filblks_t   rtblks;                 /* total rt blks */
+       xfs_extnum_t    idx;                    /* extent record index */
+       xfs_ifork_t     *ifp;                   /* inode fork pointer */
+       xfs_extnum_t    nextents;               /* number of extent entries */
+       int             error;
+
+       ASSERT(XFS_IS_REALTIME_INODE(ip));
+       ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+       if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+               if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK)))
+                       return error;
+       }
+       rtblks = 0;
+       nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
+       for (idx = 0; idx < nextents; idx++)
+               rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx));
+       *O_rtblks = (xfs_qcnt_t)rtblks;
+       return 0;
+}
+
+/*
+ * callback routine supplied to bulkstat(). Given an inumber, find its
+ * dquots and update them to account for resources taken by that inode.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_dqusage_adjust(
+       xfs_mount_t     *mp,            /* mount point for filesystem */
+       xfs_ino_t       ino,            /* inode number to get data for */
+       void            __user *buffer, /* not used */
+       int             ubsize,         /* not used */
+       int             *ubused,        /* not used */
+       int             *res)           /* result code value */
+{
+       xfs_inode_t     *ip;
+       xfs_qcnt_t      nblks, rtblks = 0;
+       int             error;
+
+       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+       /*
+        * rootino must have its resources accounted for, not so with the quota
+        * inodes.
+        */
+       if (ino == mp->m_sb.sb_uquotino || ino == mp->m_sb.sb_gquotino) {
+               *res = BULKSTAT_RV_NOTHING;
+               return XFS_ERROR(EINVAL);
+       }
+
+       /*
+        * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget
+        * interface expects the inode to be exclusively locked because that's
+        * the case in all other instances. It's OK that we do this because
+        * quotacheck is done only at mount time.
+        */
+       error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
+       if (error) {
+               *res = BULKSTAT_RV_NOTHING;
+               return error;
+       }
+
+       ASSERT(ip->i_delayed_blks == 0);
+
+       if (XFS_IS_REALTIME_INODE(ip)) {
+               /*
+                * Walk thru the extent list and count the realtime blocks.
+                */
+               error = xfs_qm_get_rtblks(ip, &rtblks);
+               if (error)
+                       goto error0;
+       }
+
+       nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks;
+
+       /*
+        * Add the (disk blocks and inode) resources occupied by this
+        * inode to its dquots. We do this adjustment in the incore dquot,
+        * and also copy the changes to its buffer.
+        * We don't care about putting these changes in a transaction
+        * envelope because if we crash in the middle of a 'quotacheck'
+        * we have to start from the beginning anyway.
+        * Once we're done, we'll log all the dquot bufs.
+        *
+        * The *QUOTA_ON checks below may look pretty racy, but quotachecks
+        * and quotaoffs don't race. (Quotachecks happen at mount time only).
+        */
+       if (XFS_IS_UQUOTA_ON(mp)) {
+               error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid,
+                                                  XFS_DQ_USER, nblks, rtblks);
+               if (error)
+                       goto error0;
+       }
+
+       if (XFS_IS_GQUOTA_ON(mp)) {
+               error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid,
+                                                  XFS_DQ_GROUP, nblks, rtblks);
+               if (error)
+                       goto error0;
+       }
+
+       if (XFS_IS_PQUOTA_ON(mp)) {
+               error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip),
+                                                  XFS_DQ_PROJ, nblks, rtblks);
+               if (error)
+                       goto error0;
+       }
+
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       IRELE(ip);
+       *res = BULKSTAT_RV_DIDONE;
+       return 0;
+
+error0:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       IRELE(ip);
+       *res = BULKSTAT_RV_GIVEUP;
+       return error;
+}
+
+/*
+ * Walk thru all the filesystem inodes and construct a consistent view
+ * of the disk quota world. If the quotacheck fails, disable quotas.
+ */
+int
+xfs_qm_quotacheck(
+       xfs_mount_t     *mp)
+{
+       int             done, count, error;
+       xfs_ino_t       lastino;
+       size_t          structsz;
+       xfs_inode_t     *uip, *gip;
+       uint            flags;
+
+       count = INT_MAX;
+       structsz = 1;
+       lastino = 0;
+       flags = 0;
+
+       ASSERT(mp->m_quotainfo->qi_uquotaip || mp->m_quotainfo->qi_gquotaip);
+       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+       /*
+        * There should be no cached dquots. The (simplistic) quotacheck
+        * algorithm doesn't like that.
+        */
+       ASSERT(list_empty(&mp->m_quotainfo->qi_dqlist));
+
+       xfs_notice(mp, "Quotacheck needed: Please wait.");
+
+       /*
+        * First we go thru all the dquots on disk, USR and GRP/PRJ, and reset
+        * their counters to zero. We need a clean slate.
+        * We don't log our changes till later.
+        */
+       uip = mp->m_quotainfo->qi_uquotaip;
+       if (uip) {
+               error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA);
+               if (error)
+                       goto error_return;
+               flags |= XFS_UQUOTA_CHKD;
+       }
+
+       gip = mp->m_quotainfo->qi_gquotaip;
+       if (gip) {
+               error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
+                                       XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+               if (error)
+                       goto error_return;
+               flags |= XFS_OQUOTA_CHKD;
+       }
+
+       do {
+               /*
+                * Iterate thru all the inodes in the file system,
+                * adjusting the corresponding dquot counters in core.
+                */
+               error = xfs_bulkstat(mp, &lastino, &count,
+                                    xfs_qm_dqusage_adjust,
+                                    structsz, NULL, &done);
+               if (error)
+                       break;
+
+       } while (!done);
+
+       /*
+        * We've made all the changes that we need to make incore.
+        * Flush them down to disk buffers if everything was updated
+        * successfully.
+        */
+       if (!error)
+               error = xfs_qm_dqflush_all(mp, 0);
+
+       /*
+        * We can get this error if we couldn't do a dquot allocation inside
+        * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
+        * dirty dquots that might be cached, we just want to get rid of them
+        * and turn quotaoff. The dquots won't be attached to any of the inodes
+        * at this point (because we intentionally didn't in dqget_noattach).
+        */
+       if (error) {
+               xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL);
+               goto error_return;
+       }
+
+       /*
+        * We didn't log anything, because if we crashed, we'll have to
+        * start the quotacheck from scratch anyway. However, we must make
+        * sure that our dquot changes are secure before we put the
+        * quotacheck'd stamp on the superblock. So, here we do a synchronous
+        * flush.
+        */
+       XFS_bflush(mp->m_ddev_targp);
+
+       /*
+        * If one type of quotas is off, then it will lose its
+        * quotachecked status, since we won't be doing accounting for
+        * that type anymore.
+        */
+       mp->m_qflags &= ~(XFS_OQUOTA_CHKD | XFS_UQUOTA_CHKD);
+       mp->m_qflags |= flags;
+
+ error_return:
+       if (error) {
+               xfs_warn(mp,
+       "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
+                       error);
+               /*
+                * We must turn off quotas.
+                */
+               ASSERT(mp->m_quotainfo != NULL);
+               ASSERT(xfs_Gqm != NULL);
+               xfs_qm_destroy_quotainfo(mp);
+               if (xfs_mount_reset_sbqflags(mp)) {
+                       xfs_warn(mp,
+                               "Quotacheck: Failed to reset quota flags.");
+               }
+       } else
+               xfs_notice(mp, "Quotacheck: Done.");
+       return (error);
+}
+
+/*
+ * This is called after the superblock has been read in and we're ready to
+ * iget the quota inodes.
+ */
+STATIC int
+xfs_qm_init_quotainos(
+       xfs_mount_t     *mp)
+{
+       xfs_inode_t     *uip, *gip;
+       int             error;
+       __int64_t       sbflags;
+       uint            flags;
+
+       ASSERT(mp->m_quotainfo);
+       uip = gip = NULL;
+       sbflags = 0;
+       flags = 0;
+
+       /*
+        * Get the uquota and gquota inodes
+        */
+       if (xfs_sb_version_hasquota(&mp->m_sb)) {
+               if (XFS_IS_UQUOTA_ON(mp) &&
+                   mp->m_sb.sb_uquotino != NULLFSINO) {
+                       ASSERT(mp->m_sb.sb_uquotino > 0);
+                       if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+                                            0, 0, &uip)))
+                               return XFS_ERROR(error);
+               }
+               if (XFS_IS_OQUOTA_ON(mp) &&
+                   mp->m_sb.sb_gquotino != NULLFSINO) {
+                       ASSERT(mp->m_sb.sb_gquotino > 0);
+                       if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+                                            0, 0, &gip))) {
+                               if (uip)
+                                       IRELE(uip);
+                               return XFS_ERROR(error);
+                       }
+               }
+       } else {
+               flags |= XFS_QMOPT_SBVERSION;
+               sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
+                           XFS_SB_GQUOTINO | XFS_SB_QFLAGS);
+       }
+
+       /*
+        * Create the two inodes, if they don't exist already. The changes
+        * made above will get added to a transaction and logged in one of
+        * the qino_alloc calls below.  If the device is readonly,
+        * temporarily switch to read-write to do this.
+        */
+       if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
+               if ((error = xfs_qm_qino_alloc(mp, &uip,
+                                             sbflags | XFS_SB_UQUOTINO,
+                                             flags | XFS_QMOPT_UQUOTA)))
+                       return XFS_ERROR(error);
+
+               flags &= ~XFS_QMOPT_SBVERSION;
+       }
+       if (XFS_IS_OQUOTA_ON(mp) && gip == NULL) {
+               flags |= (XFS_IS_GQUOTA_ON(mp) ?
+                               XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA);
+               error = xfs_qm_qino_alloc(mp, &gip,
+                                         sbflags | XFS_SB_GQUOTINO, flags);
+               if (error) {
+                       if (uip)
+                               IRELE(uip);
+
+                       return XFS_ERROR(error);
+               }
+       }
+
+       mp->m_quotainfo->qi_uquotaip = uip;
+       mp->m_quotainfo->qi_gquotaip = gip;
+
+       return 0;
+}
+
+
+
+/*
+ * Just pop the least recently used dquot off the freelist and
+ * recycle it. The returned dquot is locked.
+ */
+STATIC xfs_dquot_t *
+xfs_qm_dqreclaim_one(void)
+{
+       xfs_dquot_t     *dqpout;
+       xfs_dquot_t     *dqp;
+       int             restarts;
+       int             startagain;
+
+       restarts = 0;
+       dqpout = NULL;
+
+       /* lockorder: hashchainlock, freelistlock, mplistlock, dqlock, dqflock */
+again:
+       startagain = 0;
+       mutex_lock(&xfs_Gqm->qm_dqfrlist_lock);
+
+       list_for_each_entry(dqp, &xfs_Gqm->qm_dqfrlist, q_freelist) {
+               struct xfs_mount *mp = dqp->q_mount;
+               xfs_dqlock(dqp);
+
+               /*
+                * We are racing with dqlookup here. Naturally we don't
+                * want to reclaim a dquot that lookup wants. We release the
+                * freelist lock and start over, so that lookup will grab
+                * both the dquot and the freelistlock.
+                */
+               if (dqp->dq_flags & XFS_DQ_WANT) {
+                       ASSERT(! (dqp->dq_flags & XFS_DQ_INACTIVE));
+
+                       trace_xfs_dqreclaim_want(dqp);
+                       XQM_STATS_INC(xqmstats.xs_qm_dqwants);
+                       restarts++;
+                       startagain = 1;
+                       goto dqunlock;
+               }
+
+               /*
+                * If the dquot is inactive, we are assured that it is
+                * not on the mplist or the hashlist, and that makes our
+                * life easier.
+                */
+               if (dqp->dq_flags & XFS_DQ_INACTIVE) {
+                       ASSERT(mp == NULL);
+                       ASSERT(! XFS_DQ_IS_DIRTY(dqp));
+                       ASSERT(list_empty(&dqp->q_hashlist));
+                       ASSERT(list_empty(&dqp->q_mplist));
+                       list_del_init(&dqp->q_freelist);
+                       xfs_Gqm->qm_dqfrlist_cnt--;
+                       dqpout = dqp;
+                       XQM_STATS_INC(xqmstats.xs_qm_dqinact_reclaims);
+                       goto dqunlock;
+               }
+
+               ASSERT(dqp->q_hash);
+               ASSERT(!list_empty(&dqp->q_mplist));
+
+               /*
+                * Try to grab the flush lock. If this dquot is in the process
+                * of getting flushed to disk, we don't want to reclaim it.
+                */
+               if (!xfs_dqflock_nowait(dqp))
+                       goto dqunlock;
+
+               /*
+                * We have the flush lock so we know that this is not in the
+                * process of being flushed. So, if this is dirty, flush it
+                * DELWRI so that we don't get a freelist infested with
+                * dirty dquots.
+                */
+               if (XFS_DQ_IS_DIRTY(dqp)) {
+                       int     error;
+
+                       trace_xfs_dqreclaim_dirty(dqp);
+
+                       /*
+                        * We flush it delayed write, so don't bother
+                        * releasing the freelist lock.
+                        */
+                       error = xfs_qm_dqflush(dqp, 0);
+                       if (error) {
+                               xfs_warn(mp, "%s: dquot %p flush failed",
+                                       __func__, dqp);
+                       }
+                       goto dqunlock;
+               }
+
+               /*
+                * We're trying to get the hashlock out of order. This races
+                * with dqlookup; so, we giveup and goto the next dquot if
+                * we couldn't get the hashlock. This way, we won't starve
+                * a dqlookup process that holds the hashlock that is
+                * waiting for the freelist lock.
+                */
+               if (!mutex_trylock(&dqp->q_hash->qh_lock)) {
+                       restarts++;
+                       goto dqfunlock;
+               }
+
+               /*
+                * This races with dquot allocation code as well as dqflush_all
+                * and reclaim code. So, if we failed to grab the mplist lock,
+                * giveup everything and start over.
+                */
+               if (!mutex_trylock(&mp->m_quotainfo->qi_dqlist_lock)) {
+                       restarts++;
+                       startagain = 1;
+                       goto qhunlock;
+               }
+
+               ASSERT(dqp->q_nrefs == 0);
+               list_del_init(&dqp->q_mplist);
+               mp->m_quotainfo->qi_dquots--;
+               mp->m_quotainfo->qi_dqreclaims++;
+               list_del_init(&dqp->q_hashlist);
+               dqp->q_hash->qh_version++;
+               list_del_init(&dqp->q_freelist);
+               xfs_Gqm->qm_dqfrlist_cnt--;
+               dqpout = dqp;
+               mutex_unlock(&mp->m_quotainfo->qi_dqlist_lock);
+qhunlock:
+               mutex_unlock(&dqp->q_hash->qh_lock);
+dqfunlock:
+               xfs_dqfunlock(dqp);
+dqunlock:
+               xfs_dqunlock(dqp);
+               if (dqpout)
+                       break;
+               if (restarts >= XFS_QM_RECLAIM_MAX_RESTARTS)
+                       break;
+               if (startagain) {
+                       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+                       goto again;
+               }
+       }
+       mutex_unlock(&xfs_Gqm->qm_dqfrlist_lock);
+       return dqpout;
+}
+
+/*
+ * Traverse the freelist of dquots and attempt to reclaim a maximum of
+ * 'howmany' dquots. This operation races with dqlookup(), and attempts to
+ * favor the lookup function ...
+ */
+STATIC int
+xfs_qm_shake_freelist(
+       int     howmany)
+{
+       int             nreclaimed = 0;
+       xfs_dquot_t     *dqp;
+
+       if (howmany <= 0)
+               return 0;
+
+       while (nreclaimed < howmany) {
+               dqp = xfs_qm_dqreclaim_one();
+               if (!dqp)
+                       return nreclaimed;
+               xfs_qm_dqdestroy(dqp);
+               nreclaimed++;
+       }
+       return nreclaimed;
+}
+
+/*
+ * The kmem_shake interface is invoked when memory is running low.
+ */
+/* ARGSUSED */
+STATIC int
+xfs_qm_shake(
+       struct shrinker *shrink,
+       struct shrink_control *sc)
+{
+       int     ndqused, nfree, n;
+       gfp_t gfp_mask = sc->gfp_mask;
+
+       if (!kmem_shake_allow(gfp_mask))
+               return 0;
+       if (!xfs_Gqm)
+               return 0;
+
+       nfree = xfs_Gqm->qm_dqfrlist_cnt; /* free dquots */
+       /* incore dquots in all f/s's */
+       ndqused = atomic_read(&xfs_Gqm->qm_totaldquots) - nfree;
+
+       ASSERT(ndqused >= 0);
+
+       if (nfree <= ndqused && nfree < ndquot)
+               return 0;
+
+       ndqused *= xfs_Gqm->qm_dqfree_ratio;    /* target # of free dquots */
+       n = nfree - ndqused - ndquot;           /* # over target */
+
+       return xfs_qm_shake_freelist(MAX(nfree, n));
+}
+
+
+/*------------------------------------------------------------------*/
+
+/*
+ * Return a new incore dquot. Depending on the number of
+ * dquots in the system, we either allocate a new one on the kernel heap,
+ * or reclaim a free one.
+ * Return value is B_TRUE if we allocated a new dquot, B_FALSE if we managed
+ * to reclaim an existing one from the freelist.
+ */
+boolean_t
+xfs_qm_dqalloc_incore(
+       xfs_dquot_t **O_dqpp)
+{
+       xfs_dquot_t     *dqp;
+
+       /*
+        * Check against high water mark to see if we want to pop
+        * a nincompoop dquot off the freelist.
+        */
+       if (atomic_read(&xfs_Gqm->qm_totaldquots) >= ndquot) {
+               /*
+                * Try to recycle a dquot from the freelist.
+                */
+               if ((dqp = xfs_qm_dqreclaim_one())) {
+                       XQM_STATS_INC(xqmstats.xs_qm_dqreclaims);
+                       /*
+                        * Just zero the core here. The rest will get
+                        * reinitialized by caller. XXX we shouldn't even
+                        * do this zero ...
+                        */
+                       memset(&dqp->q_core, 0, sizeof(dqp->q_core));
+                       *O_dqpp = dqp;
+                       return B_FALSE;
+               }
+               XQM_STATS_INC(xqmstats.xs_qm_dqreclaim_misses);
+       }
+
+       /*
+        * Allocate a brand new dquot on the kernel heap and return it
+        * to the caller to initialize.
+        */
+       ASSERT(xfs_Gqm->qm_dqzone != NULL);
+       *O_dqpp = kmem_zone_zalloc(xfs_Gqm->qm_dqzone, KM_SLEEP);
+       atomic_inc(&xfs_Gqm->qm_totaldquots);
+
+       return B_TRUE;
+}
+
+
+/*
+ * Start a transaction and write the incore superblock changes to
+ * disk. flags parameter indicates which fields have changed.
+ */
+int
+xfs_qm_write_sb_changes(
+       xfs_mount_t     *mp,
+       __int64_t       flags)
+{
+       xfs_trans_t     *tp;
+       int             error;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
+       if ((error = xfs_trans_reserve(tp, 0,
+                                     mp->m_sb.sb_sectsize + 128, 0,
+                                     0,
+                                     XFS_DEFAULT_LOG_COUNT))) {
+               xfs_trans_cancel(tp, 0);
+               return error;
+       }
+
+       xfs_mod_sb(tp, flags);
+       error = xfs_trans_commit(tp, 0);
+
+       return error;
+}
+
+
+/* --------------- utility functions for vnodeops ---------------- */
+
+
+/*
+ * Given an inode, a uid, gid and prid make sure that we have
+ * allocated relevant dquot(s) on disk, and that we won't exceed inode
+ * quotas by creating this file.
+ * This also attaches dquot(s) to the given inode after locking it,
+ * and returns the dquots corresponding to the uid and/or gid.
+ *
+ * in  : inode (unlocked)
+ * out : udquot, gdquot with references taken and unlocked
+ */
+int
+xfs_qm_vop_dqalloc(
+       struct xfs_inode        *ip,
+       uid_t                   uid,
+       gid_t                   gid,
+       prid_t                  prid,
+       uint                    flags,
+       struct xfs_dquot        **O_udqpp,
+       struct xfs_dquot        **O_gdqpp)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_dquot        *uq, *gq;
+       int                     error;
+       uint                    lockflags;
+
+       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+               return 0;
+
+       lockflags = XFS_ILOCK_EXCL;
+       xfs_ilock(ip, lockflags);
+
+       if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
+               gid = ip->i_d.di_gid;
+
+       /*
+        * Attach the dquot(s) to this inode, doing a dquot allocation
+        * if necessary. The dquot(s) will not be locked.
+        */
+       if (XFS_NOT_DQATTACHED(mp, ip)) {
+               error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC);
+               if (error) {
+                       xfs_iunlock(ip, lockflags);
+                       return error;
+               }
+       }
+
+       uq = gq = NULL;
+       if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
+               if (ip->i_d.di_uid != uid) {
+                       /*
+                        * What we need is the dquot that has this uid, and
+                        * if we send the inode to dqget, the uid of the inode
+                        * takes priority over what's sent in the uid argument.
+                        * We must unlock inode here before calling dqget if
+                        * we're not sending the inode, because otherwise
+                        * we'll deadlock by doing trans_reserve while
+                        * holding ilock.
+                        */
+                       xfs_iunlock(ip, lockflags);
+                       if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t) uid,
+                                                XFS_DQ_USER,
+                                                XFS_QMOPT_DQALLOC |
+                                                XFS_QMOPT_DOWARN,
+                                                &uq))) {
+                               ASSERT(error != ENOENT);
+                               return error;
+                       }
+                       /*
+                        * Get the ilock in the right order.
+                        */
+                       xfs_dqunlock(uq);
+                       lockflags = XFS_ILOCK_SHARED;
+                       xfs_ilock(ip, lockflags);
+               } else {
+                       /*
+                        * Take an extra reference, because we'll return
+                        * this to caller
+                        */
+                       ASSERT(ip->i_udquot);
+                       uq = ip->i_udquot;
+                       xfs_dqlock(uq);
+                       XFS_DQHOLD(uq);
+                       xfs_dqunlock(uq);
+               }
+       }
+       if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
+               if (ip->i_d.di_gid != gid) {
+                       xfs_iunlock(ip, lockflags);
+                       if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)gid,
+                                                XFS_DQ_GROUP,
+                                                XFS_QMOPT_DQALLOC |
+                                                XFS_QMOPT_DOWARN,
+                                                &gq))) {
+                               if (uq)
+                                       xfs_qm_dqrele(uq);
+                               ASSERT(error != ENOENT);
+                               return error;
+                       }
+                       xfs_dqunlock(gq);
+                       lockflags = XFS_ILOCK_SHARED;
+                       xfs_ilock(ip, lockflags);
+               } else {
+                       ASSERT(ip->i_gdquot);
+                       gq = ip->i_gdquot;
+                       xfs_dqlock(gq);
+                       XFS_DQHOLD(gq);
+                       xfs_dqunlock(gq);
+               }
+       } else if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
+               if (xfs_get_projid(ip) != prid) {
+                       xfs_iunlock(ip, lockflags);
+                       if ((error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid,
+                                                XFS_DQ_PROJ,
+                                                XFS_QMOPT_DQALLOC |
+                                                XFS_QMOPT_DOWARN,
+                                                &gq))) {
+                               if (uq)
+                                       xfs_qm_dqrele(uq);
+                               ASSERT(error != ENOENT);
+                               return (error);
+                       }
+                       xfs_dqunlock(gq);
+                       lockflags = XFS_ILOCK_SHARED;
+                       xfs_ilock(ip, lockflags);
+               } else {
+                       ASSERT(ip->i_gdquot);
+                       gq = ip->i_gdquot;
+                       xfs_dqlock(gq);
+                       XFS_DQHOLD(gq);
+                       xfs_dqunlock(gq);
+               }
+       }
+       if (uq)
+               trace_xfs_dquot_dqalloc(ip);
+
+       xfs_iunlock(ip, lockflags);
+       if (O_udqpp)
+               *O_udqpp = uq;
+       else if (uq)
+               xfs_qm_dqrele(uq);
+       if (O_gdqpp)
+               *O_gdqpp = gq;
+       else if (gq)
+               xfs_qm_dqrele(gq);
+       return 0;
+}
+
+/*
+ * Actually transfer ownership, and do dquot modifications.
+ * These were already reserved.
+ */
+xfs_dquot_t *
+xfs_qm_vop_chown(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip,
+       xfs_dquot_t     **IO_olddq,
+       xfs_dquot_t     *newdq)
+{
+       xfs_dquot_t     *prevdq;
+       uint            bfield = XFS_IS_REALTIME_INODE(ip) ?
+                                XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
+
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount));
+
+       /* old dquot */
+       prevdq = *IO_olddq;
+       ASSERT(prevdq);
+       ASSERT(prevdq != newdq);
+
+       xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_d.di_nblocks));
+       xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
+
+       /* the sparkling new dquot */
+       xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_d.di_nblocks);
+       xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+
+       /*
+        * Take an extra reference, because the inode
+        * is going to keep this dquot pointer even
+        * after the trans_commit.
+        */
+       xfs_dqlock(newdq);
+       XFS_DQHOLD(newdq);
+       xfs_dqunlock(newdq);
+       *IO_olddq = newdq;
+
+       return prevdq;
+}
+
+/*
+ * Quota reservations for setattr(AT_UID|AT_GID|AT_PROJID).
+ */
+int
+xfs_qm_vop_chown_reserve(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip,
+       xfs_dquot_t     *udqp,
+       xfs_dquot_t     *gdqp,
+       uint            flags)
+{
+       xfs_mount_t     *mp = ip->i_mount;
+       uint            delblks, blkflags, prjflags = 0;
+       xfs_dquot_t     *unresudq, *unresgdq, *delblksudq, *delblksgdq;
+       int             error;
+
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+       delblks = ip->i_delayed_blks;
+       delblksudq = delblksgdq = unresudq = unresgdq = NULL;
+       blkflags = XFS_IS_REALTIME_INODE(ip) ?
+                       XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
+
+       if (XFS_IS_UQUOTA_ON(mp) && udqp &&
+           ip->i_d.di_uid != (uid_t)be32_to_cpu(udqp->q_core.d_id)) {
+               delblksudq = udqp;
+               /*
+                * If there are delayed allocation blocks, then we have to
+                * unreserve those from the old dquot, and add them to the
+                * new dquot.
+                */
+               if (delblks) {
+                       ASSERT(ip->i_udquot);
+                       unresudq = ip->i_udquot;
+               }
+       }
+       if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) {
+               if (XFS_IS_PQUOTA_ON(ip->i_mount) &&
+                    xfs_get_projid(ip) != be32_to_cpu(gdqp->q_core.d_id))
+                       prjflags = XFS_QMOPT_ENOSPC;
+
+               if (prjflags ||
+                   (XFS_IS_GQUOTA_ON(ip->i_mount) &&
+                    ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) {
+                       delblksgdq = gdqp;
+                       if (delblks) {
+                               ASSERT(ip->i_gdquot);
+                               unresgdq = ip->i_gdquot;
+                       }
+               }
+       }
+
+       if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
+                               delblksudq, delblksgdq, ip->i_d.di_nblocks, 1,
+                               flags | blkflags | prjflags)))
+               return (error);
+
+       /*
+        * Do the delayed blks reservations/unreservations now. Since, these
+        * are done without the help of a transaction, if a reservation fails
+        * its previous reservations won't be automatically undone by trans
+        * code. So, we have to do it manually here.
+        */
+       if (delblks) {
+               /*
+                * Do the reservations first. Unreservation can't fail.
+                */
+               ASSERT(delblksudq || delblksgdq);
+               ASSERT(unresudq || unresgdq);
+               if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+                               delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0,
+                               flags | blkflags | prjflags)))
+                       return (error);
+               xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
+                               unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0,
+                               blkflags);
+       }
+
+       return (0);
+}
+
+int
+xfs_qm_vop_rename_dqattach(
+       struct xfs_inode        **i_tab)
+{
+       struct xfs_mount        *mp = i_tab[0]->i_mount;
+       int                     i;
+
+       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+               return 0;
+
+       for (i = 0; (i < 4 && i_tab[i]); i++) {
+               struct xfs_inode        *ip = i_tab[i];
+               int                     error;
+
+               /*
+                * Watch out for duplicate entries in the table.
+                */
+               if (i == 0 || ip != i_tab[i-1]) {
+                       if (XFS_NOT_DQATTACHED(mp, ip)) {
+                               error = xfs_qm_dqattach(ip, 0);
+                               if (error)
+                                       return error;
+                       }
+               }
+       }
+       return 0;
+}
+
+void
+xfs_qm_vop_create_dqattach(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       struct xfs_dquot        *udqp,
+       struct xfs_dquot        *gdqp)
+{
+       struct xfs_mount        *mp = tp->t_mountp;
+
+       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+               return;
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT(XFS_IS_QUOTA_RUNNING(mp));
+
+       if (udqp) {
+               xfs_dqlock(udqp);
+               XFS_DQHOLD(udqp);
+               xfs_dqunlock(udqp);
+               ASSERT(ip->i_udquot == NULL);
+               ip->i_udquot = udqp;
+               ASSERT(XFS_IS_UQUOTA_ON(mp));
+               ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
+               xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
+       }
+       if (gdqp) {
+               xfs_dqlock(gdqp);
+               XFS_DQHOLD(gdqp);
+               xfs_dqunlock(gdqp);
+               ASSERT(ip->i_gdquot == NULL);
+               ip->i_gdquot = gdqp;
+               ASSERT(XFS_IS_OQUOTA_ON(mp));
+               ASSERT((XFS_IS_GQUOTA_ON(mp) ?
+                       ip->i_d.di_gid : xfs_get_projid(ip)) ==
+                               be32_to_cpu(gdqp->q_core.d_id));
+               xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
+       }
+}
+
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
new file mode 100644 (file)
index 0000000..43b9abe
--- /dev/null
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QM_H__
+#define __XFS_QM_H__
+
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_quota_priv.h"
+#include "xfs_qm_stats.h"
+
+struct xfs_qm;
+struct xfs_inode;
+
+extern uint            ndquot;
+extern struct mutex    xfs_Gqm_lock;
+extern struct xfs_qm   *xfs_Gqm;
+extern kmem_zone_t     *qm_dqzone;
+extern kmem_zone_t     *qm_dqtrxzone;
+
+/*
+ * Used in xfs_qm_sync called by xfs_sync to count the max times that it can
+ * iterate over the mountpt's dquot list in one call.
+ */
+#define XFS_QM_SYNC_MAX_RESTARTS       7
+
+/*
+ * Ditto, for xfs_qm_dqreclaim_one.
+ */
+#define XFS_QM_RECLAIM_MAX_RESTARTS    4
+
+/*
+ * Ideal ratio of free to in use dquots. Quota manager makes an attempt
+ * to keep this balance.
+ */
+#define XFS_QM_DQFREE_RATIO            2
+
+/*
+ * Dquot hashtable constants/threshold values.
+ */
+#define XFS_QM_HASHSIZE_LOW            (PAGE_SIZE / sizeof(xfs_dqhash_t))
+#define XFS_QM_HASHSIZE_HIGH           ((PAGE_SIZE * 4) / sizeof(xfs_dqhash_t))
+
+/*
+ * This defines the unit of allocation of dquots.
+ * Currently, it is just one file system block, and a 4K blk contains 30
+ * (136 * 30 = 4080) dquots. It's probably not worth trying to make
+ * this more dynamic.
+ * XXXsup However, if this number is changed, we have to make sure that we don't
+ * implicitly assume that we do allocations in chunks of a single filesystem
+ * block in the dquot/xqm code.
+ */
+#define XFS_DQUOT_CLUSTER_SIZE_FSB     (xfs_filblks_t)1
+
+typedef xfs_dqhash_t   xfs_dqlist_t;
+
+/*
+ * Quota Manager (global) structure. Lives only in core.
+ */
+typedef struct xfs_qm {
+       xfs_dqlist_t    *qm_usr_dqhtable;/* udquot hash table */
+       xfs_dqlist_t    *qm_grp_dqhtable;/* gdquot hash table */
+       uint             qm_dqhashmask;  /* # buckets in dq hashtab - 1 */
+       struct list_head qm_dqfrlist;    /* freelist of dquots */
+       struct mutex     qm_dqfrlist_lock;
+       int              qm_dqfrlist_cnt;
+       atomic_t         qm_totaldquots; /* total incore dquots */
+       uint             qm_nrefs;       /* file systems with quota on */
+       int              qm_dqfree_ratio;/* ratio of free to inuse dquots */
+       kmem_zone_t     *qm_dqzone;      /* dquot mem-alloc zone */
+       kmem_zone_t     *qm_dqtrxzone;   /* t_dqinfo of transactions */
+} xfs_qm_t;
+
+/*
+ * Various quota information for individual filesystems.
+ * The mount structure keeps a pointer to this.
+ */
+typedef struct xfs_quotainfo {
+       xfs_inode_t     *qi_uquotaip;    /* user quota inode */
+       xfs_inode_t     *qi_gquotaip;    /* group quota inode */
+       struct list_head qi_dqlist;      /* all dquots in filesys */
+       struct mutex     qi_dqlist_lock;
+       int              qi_dquots;
+       int              qi_dqreclaims;  /* a change here indicates
+                                           a removal in the dqlist */
+       time_t           qi_btimelimit;  /* limit for blks timer */
+       time_t           qi_itimelimit;  /* limit for inodes timer */
+       time_t           qi_rtbtimelimit;/* limit for rt blks timer */
+       xfs_qwarncnt_t   qi_bwarnlimit;  /* limit for blks warnings */
+       xfs_qwarncnt_t   qi_iwarnlimit;  /* limit for inodes warnings */
+       xfs_qwarncnt_t   qi_rtbwarnlimit;/* limit for rt blks warnings */
+       struct mutex     qi_quotaofflock;/* to serialize quotaoff */
+       xfs_filblks_t    qi_dqchunklen;  /* # BBs in a chunk of dqs */
+       uint             qi_dqperchunk;  /* # ondisk dqs in above chunk */
+       xfs_qcnt_t       qi_bhardlimit;  /* default data blk hard limit */
+       xfs_qcnt_t       qi_bsoftlimit;  /* default data blk soft limit */
+       xfs_qcnt_t       qi_ihardlimit;  /* default inode count hard limit */
+       xfs_qcnt_t       qi_isoftlimit;  /* default inode count soft limit */
+       xfs_qcnt_t       qi_rtbhardlimit;/* default realtime blk hard limit */
+       xfs_qcnt_t       qi_rtbsoftlimit;/* default realtime blk soft limit */
+} xfs_quotainfo_t;
+
+
+extern void    xfs_trans_mod_dquot(xfs_trans_t *, xfs_dquot_t *, uint, long);
+extern int     xfs_trans_reserve_quota_bydquots(xfs_trans_t *, xfs_mount_t *,
+                       xfs_dquot_t *, xfs_dquot_t *, long, long, uint);
+extern void    xfs_trans_dqjoin(xfs_trans_t *, xfs_dquot_t *);
+extern void    xfs_trans_log_dquot(xfs_trans_t *, xfs_dquot_t *);
+
+/*
+ * We keep the usr and grp dquots separately so that locking will be easier
+ * to do at commit time. All transactions that we know of at this point
+ * affect no more than two dquots of one type. Hence, the TRANS_MAXDQS value.
+ */
+#define XFS_QM_TRANS_MAXDQS            2
+typedef struct xfs_dquot_acct {
+       xfs_dqtrx_t     dqa_usrdquots[XFS_QM_TRANS_MAXDQS];
+       xfs_dqtrx_t     dqa_grpdquots[XFS_QM_TRANS_MAXDQS];
+} xfs_dquot_acct_t;
+
+/*
+ * Users are allowed to have a usage exceeding their softlimit for
+ * a period this long.
+ */
+#define XFS_QM_BTIMELIMIT      (7 * 24*60*60)          /* 1 week */
+#define XFS_QM_RTBTIMELIMIT    (7 * 24*60*60)          /* 1 week */
+#define XFS_QM_ITIMELIMIT      (7 * 24*60*60)          /* 1 week */
+
+#define XFS_QM_BWARNLIMIT      5
+#define XFS_QM_IWARNLIMIT      5
+#define XFS_QM_RTBWARNLIMIT    5
+
+extern void            xfs_qm_destroy_quotainfo(xfs_mount_t *);
+extern int             xfs_qm_quotacheck(xfs_mount_t *);
+extern int             xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
+
+/* dquot stuff */
+extern boolean_t       xfs_qm_dqalloc_incore(xfs_dquot_t **);
+extern int             xfs_qm_dqpurge_all(xfs_mount_t *, uint);
+extern void            xfs_qm_dqrele_all_inodes(xfs_mount_t *, uint);
+
+/* quota ops */
+extern int             xfs_qm_scall_trunc_qfiles(xfs_mount_t *, uint);
+extern int             xfs_qm_scall_getquota(xfs_mount_t *, xfs_dqid_t, uint,
+                                       fs_disk_quota_t *);
+extern int             xfs_qm_scall_setqlim(xfs_mount_t *, xfs_dqid_t, uint,
+                                       fs_disk_quota_t *);
+extern int             xfs_qm_scall_getqstat(xfs_mount_t *, fs_quota_stat_t *);
+extern int             xfs_qm_scall_quotaon(xfs_mount_t *, uint);
+extern int             xfs_qm_scall_quotaoff(xfs_mount_t *, uint);
+
+#endif /* __XFS_QM_H__ */
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
new file mode 100644 (file)
index 0000000..a0a829a
--- /dev/null
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_qm.h"
+
+
+STATIC void
+xfs_fill_statvfs_from_dquot(
+       struct kstatfs          *statp,
+       xfs_disk_dquot_t        *dp)
+{
+       __uint64_t              limit;
+
+       limit = dp->d_blk_softlimit ?
+               be64_to_cpu(dp->d_blk_softlimit) :
+               be64_to_cpu(dp->d_blk_hardlimit);
+       if (limit && statp->f_blocks > limit) {
+               statp->f_blocks = limit;
+               statp->f_bfree = statp->f_bavail =
+                       (statp->f_blocks > be64_to_cpu(dp->d_bcount)) ?
+                        (statp->f_blocks - be64_to_cpu(dp->d_bcount)) : 0;
+       }
+
+       limit = dp->d_ino_softlimit ?
+               be64_to_cpu(dp->d_ino_softlimit) :
+               be64_to_cpu(dp->d_ino_hardlimit);
+       if (limit && statp->f_files > limit) {
+               statp->f_files = limit;
+               statp->f_ffree =
+                       (statp->f_files > be64_to_cpu(dp->d_icount)) ?
+                        (statp->f_ffree - be64_to_cpu(dp->d_icount)) : 0;
+       }
+}
+
+
+/*
+ * Directory tree accounting is implemented using project quotas, where
+ * the project identifier is inherited from parent directories.
+ * A statvfs (df, etc.) of a directory that is using project quota should
+ * return a statvfs of the project, not the entire filesystem.
+ * This makes such trees appear as if they are filesystems in themselves.
+ */
+void
+xfs_qm_statvfs(
+       xfs_inode_t             *ip,
+       struct kstatfs          *statp)
+{
+       xfs_mount_t             *mp = ip->i_mount;
+       xfs_dquot_t             *dqp;
+
+       if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) {
+               xfs_fill_statvfs_from_dquot(statp, &dqp->q_core);
+               xfs_qm_dqput(dqp);
+       }
+}
+
+int
+xfs_qm_newmount(
+       xfs_mount_t     *mp,
+       uint            *needquotamount,
+       uint            *quotaflags)
+{
+       uint            quotaondisk;
+       uint            uquotaondisk = 0, gquotaondisk = 0, pquotaondisk = 0;
+
+       quotaondisk = xfs_sb_version_hasquota(&mp->m_sb) &&
+                               (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_ACCT);
+
+       if (quotaondisk) {
+               uquotaondisk = mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT;
+               pquotaondisk = mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT;
+               gquotaondisk = mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT;
+       }
+
+       /*
+        * If the device itself is read-only, we can't allow
+        * the user to change the state of quota on the mount -
+        * this would generate a transaction on the ro device,
+        * which would lead to an I/O error and shutdown
+        */
+
+       if (((uquotaondisk && !XFS_IS_UQUOTA_ON(mp)) ||
+           (!uquotaondisk &&  XFS_IS_UQUOTA_ON(mp)) ||
+            (pquotaondisk && !XFS_IS_PQUOTA_ON(mp)) ||
+           (!pquotaondisk &&  XFS_IS_PQUOTA_ON(mp)) ||
+            (gquotaondisk && !XFS_IS_GQUOTA_ON(mp)) ||
+           (!gquotaondisk &&  XFS_IS_OQUOTA_ON(mp)))  &&
+           xfs_dev_is_read_only(mp, "changing quota state")) {
+               xfs_warn(mp, "please mount with%s%s%s%s.",
+                       (!quotaondisk ? "out quota" : ""),
+                       (uquotaondisk ? " usrquota" : ""),
+                       (pquotaondisk ? " prjquota" : ""),
+                       (gquotaondisk ? " grpquota" : ""));
+               return XFS_ERROR(EPERM);
+       }
+
+       if (XFS_IS_QUOTA_ON(mp) || quotaondisk) {
+               /*
+                * Call mount_quotas at this point only if we won't have to do
+                * a quotacheck.
+                */
+               if (quotaondisk && !XFS_QM_NEED_QUOTACHECK(mp)) {
+                       /*
+                        * If an error occurred, qm_mount_quotas code
+                        * has already disabled quotas. So, just finish
+                        * mounting, and get on with the boring life
+                        * without disk quotas.
+                        */
+                       xfs_qm_mount_quotas(mp);
+               } else {
+                       /*
+                        * Clear the quota flags, but remember them. This
+                        * is so that the quota code doesn't get invoked
+                        * before we're ready. This can happen when an
+                        * inode goes inactive and wants to free blocks,
+                        * or via xfs_log_mount_finish.
+                        */
+                       *needquotamount = B_TRUE;
+                       *quotaflags = mp->m_qflags;
+                       mp->m_qflags = 0;
+               }
+       }
+
+       return 0;
+}
+
+void __init
+xfs_qm_init(void)
+{
+       printk(KERN_INFO "SGI XFS Quota Management subsystem\n");
+       mutex_init(&xfs_Gqm_lock);
+       xfs_qm_init_procfs();
+}
+
+void __exit
+xfs_qm_exit(void)
+{
+       xfs_qm_cleanup_procfs();
+       if (qm_dqzone)
+               kmem_zone_destroy(qm_dqzone);
+       if (qm_dqtrxzone)
+               kmem_zone_destroy(qm_dqtrxzone);
+}
diff --git a/fs/xfs/xfs_qm_stats.c b/fs/xfs/xfs_qm_stats.c
new file mode 100644 (file)
index 0000000..8671a0b
--- /dev/null
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_qm.h"
+
+struct xqmstats xqmstats;
+
+static int xqm_proc_show(struct seq_file *m, void *v)
+{
+       /* maximum; incore; ratio free to inuse; freelist */
+       seq_printf(m, "%d\t%d\t%d\t%u\n",
+                       ndquot,
+                       xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0,
+                       xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0,
+                       xfs_Gqm? xfs_Gqm->qm_dqfrlist_cnt : 0);
+       return 0;
+}
+
+static int xqm_proc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, xqm_proc_show, NULL);
+}
+
+static const struct file_operations xqm_proc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = xqm_proc_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int xqmstat_proc_show(struct seq_file *m, void *v)
+{
+       /* quota performance statistics */
+       seq_printf(m, "qm %u %u %u %u %u %u %u %u\n",
+                       xqmstats.xs_qm_dqreclaims,
+                       xqmstats.xs_qm_dqreclaim_misses,
+                       xqmstats.xs_qm_dquot_dups,
+                       xqmstats.xs_qm_dqcachemisses,
+                       xqmstats.xs_qm_dqcachehits,
+                       xqmstats.xs_qm_dqwants,
+                       xqmstats.xs_qm_dqshake_reclaims,
+                       xqmstats.xs_qm_dqinact_reclaims);
+       return 0;
+}
+
+static int xqmstat_proc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, xqmstat_proc_show, NULL);
+}
+
+static const struct file_operations xqmstat_proc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = xqmstat_proc_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+void
+xfs_qm_init_procfs(void)
+{
+       proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops);
+       proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops);
+}
+
+void
+xfs_qm_cleanup_procfs(void)
+{
+       remove_proc_entry("fs/xfs/xqm", NULL);
+       remove_proc_entry("fs/xfs/xqmstat", NULL);
+}
diff --git a/fs/xfs/xfs_qm_stats.h b/fs/xfs/xfs_qm_stats.h
new file mode 100644 (file)
index 0000000..5b964fc
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2002 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QM_STATS_H__
+#define __XFS_QM_STATS_H__
+
+#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
+
+/*
+ * XQM global statistics
+ */
+struct xqmstats {
+       __uint32_t              xs_qm_dqreclaims;
+       __uint32_t              xs_qm_dqreclaim_misses;
+       __uint32_t              xs_qm_dquot_dups;
+       __uint32_t              xs_qm_dqcachemisses;
+       __uint32_t              xs_qm_dqcachehits;
+       __uint32_t              xs_qm_dqwants;
+       __uint32_t              xs_qm_dqshake_reclaims;
+       __uint32_t              xs_qm_dqinact_reclaims;
+};
+
+extern struct xqmstats xqmstats;
+
+# define XQM_STATS_INC(count)  ( (count)++ )
+
+extern void xfs_qm_init_procfs(void);
+extern void xfs_qm_cleanup_procfs(void);
+
+#else
+
+# define XQM_STATS_INC(count)  do { } while (0)
+
+static inline void xfs_qm_init_procfs(void) { };
+static inline void xfs_qm_cleanup_procfs(void) { };
+
+#endif
+
+#endif /* __XFS_QM_STATS_H__ */
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
new file mode 100644 (file)
index 0000000..609246f
--- /dev/null
@@ -0,0 +1,906 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/capability.h>
+
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_utils.h"
+#include "xfs_qm.h"
+#include "xfs_trace.h"
+
+STATIC int     xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
+STATIC int     xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
+                                       uint);
+STATIC uint    xfs_qm_export_flags(uint);
+STATIC uint    xfs_qm_export_qtype_flags(uint);
+STATIC void    xfs_qm_export_dquot(xfs_mount_t *, xfs_disk_dquot_t *,
+                                       fs_disk_quota_t *);
+
+
+/*
+ * Turn off quota accounting and/or enforcement for all udquots and/or
+ * gdquots. Called only at unmount time.
+ *
+ * This assumes that there are no dquots of this file system cached
+ * incore, and modifies the ondisk dquot directly. Therefore, for example,
+ * it is an error to call this twice, without purging the cache.
+ */
+int
+xfs_qm_scall_quotaoff(
+       xfs_mount_t             *mp,
+       uint                    flags)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       uint                    dqtype;
+       int                     error;
+       uint                    inactivate_flags;
+       xfs_qoff_logitem_t      *qoffstart;
+       int                     nculprits;
+
+       /*
+        * No file system can have quotas enabled on disk but not in core.
+        * Note that quota utilities (like quotaoff) _expect_
+        * errno == EEXIST here.
+        */
+       if ((mp->m_qflags & flags) == 0)
+               return XFS_ERROR(EEXIST);
+       error = 0;
+
+       flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+
+       /*
+        * We don't want to deal with two quotaoffs messing up each other,
+        * so we're going to serialize it. quotaoff isn't exactly a performance
+        * critical thing.
+        * If quotaoff, then we must be dealing with the root filesystem.
+        */
+       ASSERT(q);
+       mutex_lock(&q->qi_quotaofflock);
+
+       /*
+        * If we're just turning off quota enforcement, change mp and go.
+        */
+       if ((flags & XFS_ALL_QUOTA_ACCT) == 0) {
+               mp->m_qflags &= ~(flags);
+
+               spin_lock(&mp->m_sb_lock);
+               mp->m_sb.sb_qflags = mp->m_qflags;
+               spin_unlock(&mp->m_sb_lock);
+               mutex_unlock(&q->qi_quotaofflock);
+
+               /* XXX what to do if error ? Revert back to old vals incore ? */
+               error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS);
+               return (error);
+       }
+
+       dqtype = 0;
+       inactivate_flags = 0;
+       /*
+        * If accounting is off, we must turn enforcement off, clear the
+        * quota 'CHKD' certificate to make it known that we have to
+        * do a quotacheck the next time this quota is turned on.
+        */
+       if (flags & XFS_UQUOTA_ACCT) {
+               dqtype |= XFS_QMOPT_UQUOTA;
+               flags |= (XFS_UQUOTA_CHKD | XFS_UQUOTA_ENFD);
+               inactivate_flags |= XFS_UQUOTA_ACTIVE;
+       }
+       if (flags & XFS_GQUOTA_ACCT) {
+               dqtype |= XFS_QMOPT_GQUOTA;
+               flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+               inactivate_flags |= XFS_GQUOTA_ACTIVE;
+       } else if (flags & XFS_PQUOTA_ACCT) {
+               dqtype |= XFS_QMOPT_PQUOTA;
+               flags |= (XFS_OQUOTA_CHKD | XFS_OQUOTA_ENFD);
+               inactivate_flags |= XFS_PQUOTA_ACTIVE;
+       }
+
+       /*
+        * Nothing to do?  Don't complain. This happens when we're just
+        * turning off quota enforcement.
+        */
+       if ((mp->m_qflags & flags) == 0)
+               goto out_unlock;
+
+       /*
+        * Write the LI_QUOTAOFF log record, and do SB changes atomically,
+        * and synchronously. If we fail to write, we should abort the
+        * operation as it cannot be recovered safely if we crash.
+        */
+       error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
+       if (error)
+               goto out_unlock;
+
+       /*
+        * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
+        * to take care of the race between dqget and quotaoff. We don't take
+        * any special locks to reset these bits. All processes need to check
+        * these bits *after* taking inode lock(s) to see if the particular
+        * quota type is in the process of being turned off. If *ACTIVE, it is
+        * guaranteed that all dquot structures and all quotainode ptrs will all
+        * stay valid as long as that inode is kept locked.
+        *
+        * There is no turning back after this.
+        */
+       mp->m_qflags &= ~inactivate_flags;
+
+       /*
+        * Give back all the dquot reference(s) held by inodes.
+        * Here we go thru every single incore inode in this file system, and
+        * do a dqrele on the i_udquot/i_gdquot that it may have.
+        * Essentially, as long as somebody has an inode locked, this guarantees
+        * that quotas will not be turned off. This is handy because in a
+        * transaction once we lock the inode(s) and check for quotaon, we can
+        * depend on the quota inodes (and other things) being valid as long as
+        * we keep the lock(s).
+        */
+       xfs_qm_dqrele_all_inodes(mp, flags);
+
+       /*
+        * Next we make the changes in the quota flag in the mount struct.
+        * This isn't protected by a particular lock directly, because we
+        * don't want to take a mrlock every time we depend on quotas being on.
+        */
+       mp->m_qflags &= ~(flags);
+
+       /*
+        * Go through all the dquots of this file system and purge them,
+        * according to what was turned off. We may not be able to get rid
+        * of all dquots, because dquots can have temporary references that
+        * are not attached to inodes. eg. xfs_setattr, xfs_create.
+        * So, if we couldn't purge all the dquots from the filesystem,
+        * we can't get rid of the incore data structures.
+        */
+       while ((nculprits = xfs_qm_dqpurge_all(mp, dqtype)))
+               delay(10 * nculprits);
+
+       /*
+        * Transactions that had started before ACTIVE state bit was cleared
+        * could have logged many dquots, so they'd have higher LSNs than
+        * the first QUOTAOFF log record does. If we happen to crash when
+        * the tail of the log has gone past the QUOTAOFF record, but
+        * before the last dquot modification, those dquots __will__
+        * recover, and that's not good.
+        *
+        * So, we have QUOTAOFF start and end logitems; the start
+        * logitem won't get overwritten until the end logitem appears...
+        */
+       error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+       if (error) {
+               /* We're screwed now. Shutdown is the only option. */
+               xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+               goto out_unlock;
+       }
+
+       /*
+        * If quotas is completely disabled, close shop.
+        */
+       if (((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET1) ||
+           ((flags & XFS_MOUNT_QUOTA_ALL) == XFS_MOUNT_QUOTA_SET2)) {
+               mutex_unlock(&q->qi_quotaofflock);
+               xfs_qm_destroy_quotainfo(mp);
+               return (0);
+       }
+
+       /*
+        * Release our quotainode references if we don't need them anymore.
+        */
+       if ((dqtype & XFS_QMOPT_UQUOTA) && q->qi_uquotaip) {
+               IRELE(q->qi_uquotaip);
+               q->qi_uquotaip = NULL;
+       }
+       if ((dqtype & (XFS_QMOPT_GQUOTA|XFS_QMOPT_PQUOTA)) && q->qi_gquotaip) {
+               IRELE(q->qi_gquotaip);
+               q->qi_gquotaip = NULL;
+       }
+
+out_unlock:
+       mutex_unlock(&q->qi_quotaofflock);
+       return error;
+}
+
+STATIC int
+xfs_qm_scall_trunc_qfile(
+       struct xfs_mount        *mp,
+       xfs_ino_t               ino)
+{
+       struct xfs_inode        *ip;
+       struct xfs_trans        *tp;
+       int                     error;
+
+       if (ino == NULLFSINO)
+               return 0;
+
+       error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
+       if (error)
+               return error;
+
+       xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE);
+       error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
+                                 XFS_TRANS_PERM_LOG_RES,
+                                 XFS_ITRUNCATE_LOG_COUNT);
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+               goto out_put;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_trans_ijoin(tp, ip);
+
+       error = xfs_itruncate_data(&tp, ip, 0);
+       if (error) {
+               xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES |
+                                    XFS_TRANS_ABORT);
+               goto out_unlock;
+       }
+
+       xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+       error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
+
+out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+out_put:
+       IRELE(ip);
+       return error;
+}
+
+int
+xfs_qm_scall_trunc_qfiles(
+       xfs_mount_t     *mp,
+       uint            flags)
+{
+       int             error = 0, error2 = 0;
+
+       if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
+               xfs_debug(mp, "%s: flags=%x m_qflags=%x\n",
+                       __func__, flags, mp->m_qflags);
+               return XFS_ERROR(EINVAL);
+       }
+
+       if (flags & XFS_DQ_USER)
+               error = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_uquotino);
+       if (flags & (XFS_DQ_GROUP|XFS_DQ_PROJ))
+               error2 = xfs_qm_scall_trunc_qfile(mp, mp->m_sb.sb_gquotino);
+
+       return error ? error : error2;
+}
+
+/*
+ * Switch on (a given) quota enforcement for a filesystem.  This takes
+ * effect immediately.
+ * (Switching on quota accounting must be done at mount time.)
+ */
+int
+xfs_qm_scall_quotaon(
+       xfs_mount_t     *mp,
+       uint            flags)
+{
+       int             error;
+       uint            qf;
+       __int64_t       sbflags;
+
+       flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
+       /*
+        * Switching on quota accounting must be done at mount time.
+        */
+       flags &= ~(XFS_ALL_QUOTA_ACCT);
+
+       sbflags = 0;
+
+       if (flags == 0) {
+               xfs_debug(mp, "%s: zero flags, m_qflags=%x\n",
+                       __func__, mp->m_qflags);
+               return XFS_ERROR(EINVAL);
+       }
+
+       /* No fs can turn on quotas with a delayed effect */
+       ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
+
+       /*
+        * Can't enforce without accounting. We check the superblock
+        * qflags here instead of m_qflags because rootfs can have
+        * quota acct on ondisk without m_qflags' knowing.
+        */
+       if (((flags & XFS_UQUOTA_ACCT) == 0 &&
+           (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
+           (flags & XFS_UQUOTA_ENFD))
+           ||
+           ((flags & XFS_PQUOTA_ACCT) == 0 &&
+           (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
+           (flags & XFS_GQUOTA_ACCT) == 0 &&
+           (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
+           (flags & XFS_OQUOTA_ENFD))) {
+               xfs_debug(mp,
+                       "%s: Can't enforce without acct, flags=%x sbflags=%x\n",
+                       __func__, flags, mp->m_sb.sb_qflags);
+               return XFS_ERROR(EINVAL);
+       }
+       /*
+        * If everything's up to-date incore, then don't waste time.
+        */
+       if ((mp->m_qflags & flags) == flags)
+               return XFS_ERROR(EEXIST);
+
+       /*
+        * Change sb_qflags on disk but not incore mp->qflags
+        * if this is the root filesystem.
+        */
+       spin_lock(&mp->m_sb_lock);
+       qf = mp->m_sb.sb_qflags;
+       mp->m_sb.sb_qflags = qf | flags;
+       spin_unlock(&mp->m_sb_lock);
+
+       /*
+        * There's nothing to change if it's the same.
+        */
+       if ((qf & flags) == flags && sbflags == 0)
+               return XFS_ERROR(EEXIST);
+       sbflags |= XFS_SB_QFLAGS;
+
+       if ((error = xfs_qm_write_sb_changes(mp, sbflags)))
+               return (error);
+       /*
+        * If we aren't trying to switch on quota enforcement, we are done.
+        */
+       if  (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) !=
+            (mp->m_qflags & XFS_UQUOTA_ACCT)) ||
+            ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
+            (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
+            ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
+            (mp->m_qflags & XFS_GQUOTA_ACCT)) ||
+           (flags & XFS_ALL_QUOTA_ENFD) == 0)
+               return (0);
+
+       if (! XFS_IS_QUOTA_RUNNING(mp))
+               return XFS_ERROR(ESRCH);
+
+       /*
+        * Switch on quota enforcement in core.
+        */
+       mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+       mp->m_qflags |= (flags & XFS_ALL_QUOTA_ENFD);
+       mutex_unlock(&mp->m_quotainfo->qi_quotaofflock);
+
+       return (0);
+}
+
+
+/*
+ * Return quota status information, such as uquota-off, enforcements, etc.
+ */
+int
+xfs_qm_scall_getqstat(
+       struct xfs_mount        *mp,
+       struct fs_quota_stat    *out)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       struct xfs_inode        *uip, *gip;
+       boolean_t               tempuqip, tempgqip;
+
+       uip = gip = NULL;
+       tempuqip = tempgqip = B_FALSE;
+       memset(out, 0, sizeof(fs_quota_stat_t));
+
+       out->qs_version = FS_QSTAT_VERSION;
+       if (!xfs_sb_version_hasquota(&mp->m_sb)) {
+               out->qs_uquota.qfs_ino = NULLFSINO;
+               out->qs_gquota.qfs_ino = NULLFSINO;
+               return (0);
+       }
+       out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
+                                                       (XFS_ALL_QUOTA_ACCT|
+                                                        XFS_ALL_QUOTA_ENFD));
+       out->qs_pad = 0;
+       out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino;
+       out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
+
+       if (q) {
+               uip = q->qi_uquotaip;
+               gip = q->qi_gquotaip;
+       }
+       if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
+               if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
+                                       0, 0, &uip) == 0)
+                       tempuqip = B_TRUE;
+       }
+       if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) {
+               if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
+                                       0, 0, &gip) == 0)
+                       tempgqip = B_TRUE;
+       }
+       if (uip) {
+               out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
+               out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
+               if (tempuqip)
+                       IRELE(uip);
+       }
+       if (gip) {
+               out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
+               out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
+               if (tempgqip)
+                       IRELE(gip);
+       }
+       if (q) {
+               out->qs_incoredqs = q->qi_dquots;
+               out->qs_btimelimit = q->qi_btimelimit;
+               out->qs_itimelimit = q->qi_itimelimit;
+               out->qs_rtbtimelimit = q->qi_rtbtimelimit;
+               out->qs_bwarnlimit = q->qi_bwarnlimit;
+               out->qs_iwarnlimit = q->qi_iwarnlimit;
+       }
+       return 0;
+}
+
+#define XFS_DQ_MASK \
+       (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK)
+
+/*
+ * Adjust quota limits, and start/stop timers accordingly.
+ */
+int
+xfs_qm_scall_setqlim(
+       xfs_mount_t             *mp,
+       xfs_dqid_t              id,
+       uint                    type,
+       fs_disk_quota_t         *newlim)
+{
+       struct xfs_quotainfo    *q = mp->m_quotainfo;
+       xfs_disk_dquot_t        *ddq;
+       xfs_dquot_t             *dqp;
+       xfs_trans_t             *tp;
+       int                     error;
+       xfs_qcnt_t              hard, soft;
+
+       if (newlim->d_fieldmask & ~XFS_DQ_MASK)
+               return EINVAL;
+       if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0)
+               return 0;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM);
+       if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128,
+                                     0, 0, XFS_DEFAULT_LOG_COUNT))) {
+               xfs_trans_cancel(tp, 0);
+               return (error);
+       }
+
+       /*
+        * We don't want to race with a quotaoff so take the quotaoff lock.
+        * (We don't hold an inode lock, so there's nothing else to stop
+        * a quotaoff from happening). (XXXThis doesn't currently happen
+        * because we take the vfslock before calling xfs_qm_sysent).
+        */
+       mutex_lock(&q->qi_quotaofflock);
+
+       /*
+        * Get the dquot (locked), and join it to the transaction.
+        * Allocate the dquot if this doesn't exist.
+        */
+       if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) {
+               xfs_trans_cancel(tp, XFS_TRANS_ABORT);
+               ASSERT(error != ENOENT);
+               goto out_unlock;
+       }
+       xfs_trans_dqjoin(tp, dqp);
+       ddq = &dqp->q_core;
+
+       /*
+        * Make sure that hardlimits are >= soft limits before changing.
+        */
+       hard = (newlim->d_fieldmask & FS_DQ_BHARD) ?
+               (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) :
+                       be64_to_cpu(ddq->d_blk_hardlimit);
+       soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ?
+               (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) :
+                       be64_to_cpu(ddq->d_blk_softlimit);
+       if (hard == 0 || hard >= soft) {
+               ddq->d_blk_hardlimit = cpu_to_be64(hard);
+               ddq->d_blk_softlimit = cpu_to_be64(soft);
+               if (id == 0) {
+                       q->qi_bhardlimit = hard;
+                       q->qi_bsoftlimit = soft;
+               }
+       } else {
+               xfs_debug(mp, "blkhard %Ld < blksoft %Ld\n", hard, soft);
+       }
+       hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ?
+               (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) :
+                       be64_to_cpu(ddq->d_rtb_hardlimit);
+       soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ?
+               (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) :
+                       be64_to_cpu(ddq->d_rtb_softlimit);
+       if (hard == 0 || hard >= soft) {
+               ddq->d_rtb_hardlimit = cpu_to_be64(hard);
+               ddq->d_rtb_softlimit = cpu_to_be64(soft);
+               if (id == 0) {
+                       q->qi_rtbhardlimit = hard;
+                       q->qi_rtbsoftlimit = soft;
+               }
+       } else {
+               xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld\n", hard, soft);
+       }
+
+       hard = (newlim->d_fieldmask & FS_DQ_IHARD) ?
+               (xfs_qcnt_t) newlim->d_ino_hardlimit :
+                       be64_to_cpu(ddq->d_ino_hardlimit);
+       soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ?
+               (xfs_qcnt_t) newlim->d_ino_softlimit :
+                       be64_to_cpu(ddq->d_ino_softlimit);
+       if (hard == 0 || hard >= soft) {
+               ddq->d_ino_hardlimit = cpu_to_be64(hard);
+               ddq->d_ino_softlimit = cpu_to_be64(soft);
+               if (id == 0) {
+                       q->qi_ihardlimit = hard;
+                       q->qi_isoftlimit = soft;
+               }
+       } else {
+               xfs_debug(mp, "ihard %Ld < isoft %Ld\n", hard, soft);
+       }
+
+       /*
+        * Update warnings counter(s) if requested
+        */
+       if (newlim->d_fieldmask & FS_DQ_BWARNS)
+               ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns);
+       if (newlim->d_fieldmask & FS_DQ_IWARNS)
+               ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns);
+       if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
+               ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns);
+
+       if (id == 0) {
+               /*
+                * Timelimits for the super user set the relative time
+                * the other users can be over quota for this file system.
+                * If it is zero a default is used.  Ditto for the default
+                * soft and hard limit values (already done, above), and
+                * for warnings.
+                */
+               if (newlim->d_fieldmask & FS_DQ_BTIMER) {
+                       q->qi_btimelimit = newlim->d_btimer;
+                       ddq->d_btimer = cpu_to_be32(newlim->d_btimer);
+               }
+               if (newlim->d_fieldmask & FS_DQ_ITIMER) {
+                       q->qi_itimelimit = newlim->d_itimer;
+                       ddq->d_itimer = cpu_to_be32(newlim->d_itimer);
+               }
+               if (newlim->d_fieldmask & FS_DQ_RTBTIMER) {
+                       q->qi_rtbtimelimit = newlim->d_rtbtimer;
+                       ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer);
+               }
+               if (newlim->d_fieldmask & FS_DQ_BWARNS)
+                       q->qi_bwarnlimit = newlim->d_bwarns;
+               if (newlim->d_fieldmask & FS_DQ_IWARNS)
+                       q->qi_iwarnlimit = newlim->d_iwarns;
+               if (newlim->d_fieldmask & FS_DQ_RTBWARNS)
+                       q->qi_rtbwarnlimit = newlim->d_rtbwarns;
+       } else {
+               /*
+                * If the user is now over quota, start the timelimit.
+                * The user will not be 'warned'.
+                * Note that we keep the timers ticking, whether enforcement
+                * is on or off. We don't really want to bother with iterating
+                * over all ondisk dquots and turning the timers on/off.
+                */
+               xfs_qm_adjust_dqtimers(mp, ddq);
+       }
+       dqp->dq_flags |= XFS_DQ_DIRTY;
+       xfs_trans_log_dquot(tp, dqp);
+
+       error = xfs_trans_commit(tp, 0);
+       xfs_qm_dqrele(dqp);
+
+ out_unlock:
+       mutex_unlock(&q->qi_quotaofflock);
+       return error;
+}
+
+int
+xfs_qm_scall_getquota(
+       xfs_mount_t     *mp,
+       xfs_dqid_t      id,
+       uint            type,
+       fs_disk_quota_t *out)
+{
+       xfs_dquot_t     *dqp;
+       int             error;
+
+       /*
+        * Try to get the dquot. We don't want it allocated on disk, so
+        * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't
+        * exist, we'll get ENOENT back.
+        */
+       if ((error = xfs_qm_dqget(mp, NULL, id, type, 0, &dqp))) {
+               return (error);
+       }
+
+       /*
+        * If everything's NULL, this dquot doesn't quite exist as far as
+        * our utility programs are concerned.
+        */
+       if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) {
+               xfs_qm_dqput(dqp);
+               return XFS_ERROR(ENOENT);
+       }
+       /*
+        * Convert the disk dquot to the exportable format
+        */
+       xfs_qm_export_dquot(mp, &dqp->q_core, out);
+       xfs_qm_dqput(dqp);
+       return (error ? XFS_ERROR(EFAULT) : 0);
+}
+
+
+STATIC int
+xfs_qm_log_quotaoff_end(
+       xfs_mount_t             *mp,
+       xfs_qoff_logitem_t      *startqoff,
+       uint                    flags)
+{
+       xfs_trans_t             *tp;
+       int                     error;
+       xfs_qoff_logitem_t      *qoffi;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END);
+
+       if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2,
+                                     0, 0, XFS_DEFAULT_LOG_COUNT))) {
+               xfs_trans_cancel(tp, 0);
+               return (error);
+       }
+
+       qoffi = xfs_trans_get_qoff_item(tp, startqoff,
+                                       flags & XFS_ALL_QUOTA_ACCT);
+       xfs_trans_log_quotaoff_item(tp, qoffi);
+
+       /*
+        * We have to make sure that the transaction is secure on disk before we
+        * return and actually stop quota accounting. So, make it synchronous.
+        * We don't care about quotoff's performance.
+        */
+       xfs_trans_set_sync(tp);
+       error = xfs_trans_commit(tp, 0);
+       return (error);
+}
+
+
+STATIC int
+xfs_qm_log_quotaoff(
+       xfs_mount_t            *mp,
+       xfs_qoff_logitem_t     **qoffstartp,
+       uint                   flags)
+{
+       xfs_trans_t            *tp;
+       int                     error;
+       xfs_qoff_logitem_t     *qoffi=NULL;
+       uint                    oldsbqflag=0;
+
+       tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF);
+       if ((error = xfs_trans_reserve(tp, 0,
+                                     sizeof(xfs_qoff_logitem_t) * 2 +
+                                     mp->m_sb.sb_sectsize + 128,
+                                     0,
+                                     0,
+                                     XFS_DEFAULT_LOG_COUNT))) {
+               goto error0;
+       }
+
+       qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
+       xfs_trans_log_quotaoff_item(tp, qoffi);
+
+       spin_lock(&mp->m_sb_lock);
+       oldsbqflag = mp->m_sb.sb_qflags;
+       mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
+       spin_unlock(&mp->m_sb_lock);
+
+       xfs_mod_sb(tp, XFS_SB_QFLAGS);
+
+       /*
+        * We have to make sure that the transaction is secure on disk before we
+        * return and actually stop quota accounting. So, make it synchronous.
+        * We don't care about quotoff's performance.
+        */
+       xfs_trans_set_sync(tp);
+       error = xfs_trans_commit(tp, 0);
+
+error0:
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               /*
+                * No one else is modifying sb_qflags, so this is OK.
+                * We still hold the quotaofflock.
+                */
+               spin_lock(&mp->m_sb_lock);
+               mp->m_sb.sb_qflags = oldsbqflag;
+               spin_unlock(&mp->m_sb_lock);
+       }
+       *qoffstartp = qoffi;
+       return (error);
+}
+
+
+/*
+ * Translate an internal style on-disk-dquot to the exportable format.
+ * The main differences are that the counters/limits are all in Basic
+ * Blocks (BBs) instead of the internal FSBs, and all on-disk data has
+ * to be converted to the native endianness.
+ */
+STATIC void
+xfs_qm_export_dquot(
+       xfs_mount_t             *mp,
+       xfs_disk_dquot_t        *src,
+       struct fs_disk_quota    *dst)
+{
+       memset(dst, 0, sizeof(*dst));
+       dst->d_version = FS_DQUOT_VERSION;  /* different from src->d_version */
+       dst->d_flags = xfs_qm_export_qtype_flags(src->d_flags);
+       dst->d_id = be32_to_cpu(src->d_id);
+       dst->d_blk_hardlimit =
+               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_hardlimit));
+       dst->d_blk_softlimit =
+               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_blk_softlimit));
+       dst->d_ino_hardlimit = be64_to_cpu(src->d_ino_hardlimit);
+       dst->d_ino_softlimit = be64_to_cpu(src->d_ino_softlimit);
+       dst->d_bcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_bcount));
+       dst->d_icount = be64_to_cpu(src->d_icount);
+       dst->d_btimer = be32_to_cpu(src->d_btimer);
+       dst->d_itimer = be32_to_cpu(src->d_itimer);
+       dst->d_iwarns = be16_to_cpu(src->d_iwarns);
+       dst->d_bwarns = be16_to_cpu(src->d_bwarns);
+       dst->d_rtb_hardlimit =
+               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_hardlimit));
+       dst->d_rtb_softlimit =
+               XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtb_softlimit));
+       dst->d_rtbcount = XFS_FSB_TO_BB(mp, be64_to_cpu(src->d_rtbcount));
+       dst->d_rtbtimer = be32_to_cpu(src->d_rtbtimer);
+       dst->d_rtbwarns = be16_to_cpu(src->d_rtbwarns);
+
+       /*
+        * Internally, we don't reset all the timers when quota enforcement
+        * gets turned off. No need to confuse the user level code,
+        * so return zeroes in that case.
+        */
+       if ((!XFS_IS_UQUOTA_ENFORCED(mp) && src->d_flags == XFS_DQ_USER) ||
+           (!XFS_IS_OQUOTA_ENFORCED(mp) &&
+                       (src->d_flags & (XFS_DQ_PROJ | XFS_DQ_GROUP)))) {
+               dst->d_btimer = 0;
+               dst->d_itimer = 0;
+               dst->d_rtbtimer = 0;
+       }
+
+#ifdef DEBUG
+       if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) ||
+            (XFS_IS_OQUOTA_ENFORCED(mp) &&
+                       (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) &&
+           dst->d_id != 0) {
+               if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) &&
+                   (dst->d_blk_softlimit > 0)) {
+                       ASSERT(dst->d_btimer != 0);
+               }
+               if (((int) dst->d_icount >= (int) dst->d_ino_softlimit) &&
+                   (dst->d_ino_softlimit > 0)) {
+                       ASSERT(dst->d_itimer != 0);
+               }
+       }
+#endif
+}
+
+STATIC uint
+xfs_qm_export_qtype_flags(
+       uint flags)
+{
+       /*
+        * Can't be more than one, or none.
+        */
+       ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
+               (FS_PROJ_QUOTA | FS_USER_QUOTA));
+       ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
+               (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
+       ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
+               (FS_USER_QUOTA | FS_GROUP_QUOTA));
+       ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
+
+       return (flags & XFS_DQ_USER) ?
+               FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
+                       FS_PROJ_QUOTA : FS_GROUP_QUOTA;
+}
+
+STATIC uint
+xfs_qm_export_flags(
+       uint flags)
+{
+       uint uflags;
+
+       uflags = 0;
+       if (flags & XFS_UQUOTA_ACCT)
+               uflags |= FS_QUOTA_UDQ_ACCT;
+       if (flags & XFS_PQUOTA_ACCT)
+               uflags |= FS_QUOTA_PDQ_ACCT;
+       if (flags & XFS_GQUOTA_ACCT)
+               uflags |= FS_QUOTA_GDQ_ACCT;
+       if (flags & XFS_UQUOTA_ENFD)
+               uflags |= FS_QUOTA_UDQ_ENFD;
+       if (flags & (XFS_OQUOTA_ENFD)) {
+               uflags |= (flags & XFS_GQUOTA_ACCT) ?
+                       FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD;
+       }
+       return (uflags);
+}
+
+
+STATIC int
+xfs_dqrele_inode(
+       struct xfs_inode        *ip,
+       struct xfs_perag        *pag,
+       int                     flags)
+{
+       /* skip quota inodes */
+       if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
+           ip == ip->i_mount->m_quotainfo->qi_gquotaip) {
+               ASSERT(ip->i_udquot == NULL);
+               ASSERT(ip->i_gdquot == NULL);
+               return 0;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
+               xfs_qm_dqrele(ip->i_udquot);
+               ip->i_udquot = NULL;
+       }
+       if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+               xfs_qm_dqrele(ip->i_gdquot);
+               ip->i_gdquot = NULL;
+       }
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       return 0;
+}
+
+
+/*
+ * Go thru all the inodes in the file system, releasing their dquots.
+ *
+ * Note that the mount structure gets modified to indicate that quotas are off
+ * AFTER this, in the case of quotaoff.
+ */
+void
+xfs_qm_dqrele_all_inodes(
+       struct xfs_mount *mp,
+       uint             flags)
+{
+       ASSERT(mp->m_quotainfo);
+       xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
+}
diff --git a/fs/xfs/xfs_quota_priv.h b/fs/xfs/xfs_quota_priv.h
new file mode 100644 (file)
index 0000000..94a3d92
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2000-2003 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_QUOTA_PRIV_H__
+#define __XFS_QUOTA_PRIV_H__
+
+/*
+ * Number of bmaps that we ask from bmapi when doing a quotacheck.
+ * We make this restriction to keep the memory usage to a minimum.
+ */
+#define XFS_DQITER_MAP_SIZE    10
+
+/*
+ * Hash into a bucket in the dquot hash table, based on <mp, id>.
+ */
+#define XFS_DQ_HASHVAL(mp, id) (((__psunsigned_t)(mp) + \
+                                (__psunsigned_t)(id)) & \
+                               (xfs_Gqm->qm_dqhashmask - 1))
+#define XFS_DQ_HASH(mp, id, type)   (type == XFS_DQ_USER ? \
+                                    (xfs_Gqm->qm_usr_dqhtable + \
+                                     XFS_DQ_HASHVAL(mp, id)) : \
+                                    (xfs_Gqm->qm_grp_dqhtable + \
+                                     XFS_DQ_HASHVAL(mp, id)))
+#define XFS_IS_DQUOT_UNINITIALIZED(dqp) ( \
+       !dqp->q_core.d_blk_hardlimit && \
+       !dqp->q_core.d_blk_softlimit && \
+       !dqp->q_core.d_rtb_hardlimit && \
+       !dqp->q_core.d_rtb_softlimit && \
+       !dqp->q_core.d_ino_hardlimit && \
+       !dqp->q_core.d_ino_softlimit && \
+       !dqp->q_core.d_bcount && \
+       !dqp->q_core.d_rtbcount && \
+       !dqp->q_core.d_icount)
+
+#define DQFLAGTO_TYPESTR(d)    (((d)->dq_flags & XFS_DQ_USER) ? "USR" : \
+                                (((d)->dq_flags & XFS_DQ_GROUP) ? "GRP" : \
+                                (((d)->dq_flags & XFS_DQ_PROJ) ? "PRJ":"???")))
+
+#endif /* __XFS_QUOTA_PRIV_H__ */
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
new file mode 100644 (file)
index 0000000..7e76f53
--- /dev/null
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2008, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_log.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_quota.h"
+#include "xfs_trans.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_qm.h"
+#include <linux/quota.h>
+
+
+STATIC int
+xfs_quota_type(int type)
+{
+       switch (type) {
+       case USRQUOTA:
+               return XFS_DQ_USER;
+       case GRPQUOTA:
+               return XFS_DQ_GROUP;
+       default:
+               return XFS_DQ_PROJ;
+       }
+}
+
+STATIC int
+xfs_fs_get_xstate(
+       struct super_block      *sb,
+       struct fs_quota_stat    *fqs)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       return -xfs_qm_scall_getqstat(mp, fqs);
+}
+
+STATIC int
+xfs_fs_set_xstate(
+       struct super_block      *sb,
+       unsigned int            uflags,
+       int                     op)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+       unsigned int            flags = 0;
+
+       if (sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+
+       if (uflags & FS_QUOTA_UDQ_ACCT)
+               flags |= XFS_UQUOTA_ACCT;
+       if (uflags & FS_QUOTA_PDQ_ACCT)
+               flags |= XFS_PQUOTA_ACCT;
+       if (uflags & FS_QUOTA_GDQ_ACCT)
+               flags |= XFS_GQUOTA_ACCT;
+       if (uflags & FS_QUOTA_UDQ_ENFD)
+               flags |= XFS_UQUOTA_ENFD;
+       if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD))
+               flags |= XFS_OQUOTA_ENFD;
+
+       switch (op) {
+       case Q_XQUOTAON:
+               return -xfs_qm_scall_quotaon(mp, flags);
+       case Q_XQUOTAOFF:
+               if (!XFS_IS_QUOTA_ON(mp))
+                       return -EINVAL;
+               return -xfs_qm_scall_quotaoff(mp, flags);
+       case Q_XQUOTARM:
+               if (XFS_IS_QUOTA_ON(mp))
+                       return -EINVAL;
+               return -xfs_qm_scall_trunc_qfiles(mp, flags);
+       }
+
+       return -EINVAL;
+}
+
+STATIC int
+xfs_fs_get_dqblk(
+       struct super_block      *sb,
+       int                     type,
+       qid_t                   id,
+       struct fs_disk_quota    *fdq)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       if (!XFS_IS_QUOTA_ON(mp))
+               return -ESRCH;
+
+       return -xfs_qm_scall_getquota(mp, id, xfs_quota_type(type), fdq);
+}
+
+STATIC int
+xfs_fs_set_dqblk(
+       struct super_block      *sb,
+       int                     type,
+       qid_t                   id,
+       struct fs_disk_quota    *fdq)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       if (sb->s_flags & MS_RDONLY)
+               return -EROFS;
+       if (!XFS_IS_QUOTA_RUNNING(mp))
+               return -ENOSYS;
+       if (!XFS_IS_QUOTA_ON(mp))
+               return -ESRCH;
+
+       return -xfs_qm_scall_setqlim(mp, id, xfs_quota_type(type), fdq);
+}
+
+const struct quotactl_ops xfs_quotactl_operations = {
+       .get_xstate             = xfs_fs_get_xstate,
+       .set_xstate             = xfs_fs_set_xstate,
+       .get_dqblk              = xfs_fs_get_dqblk,
+       .set_dqblk              = xfs_fs_set_dqblk,
+};
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
new file mode 100644 (file)
index 0000000..76fdc58
--- /dev/null
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include <linux/proc_fs.h>
+
+DEFINE_PER_CPU(struct xfsstats, xfsstats);
+
+static int xfs_stat_proc_show(struct seq_file *m, void *v)
+{
+       int             c, i, j, val;
+       __uint64_t      xs_xstrat_bytes = 0;
+       __uint64_t      xs_write_bytes = 0;
+       __uint64_t      xs_read_bytes = 0;
+
+       static const struct xstats_entry {
+               char    *desc;
+               int     endpoint;
+       } xstats[] = {
+               { "extent_alloc",       XFSSTAT_END_EXTENT_ALLOC        },
+               { "abt",                XFSSTAT_END_ALLOC_BTREE         },
+               { "blk_map",            XFSSTAT_END_BLOCK_MAPPING       },
+               { "bmbt",               XFSSTAT_END_BLOCK_MAP_BTREE     },
+               { "dir",                XFSSTAT_END_DIRECTORY_OPS       },
+               { "trans",              XFSSTAT_END_TRANSACTIONS        },
+               { "ig",                 XFSSTAT_END_INODE_OPS           },
+               { "log",                XFSSTAT_END_LOG_OPS             },
+               { "push_ail",           XFSSTAT_END_TAIL_PUSHING        },
+               { "xstrat",             XFSSTAT_END_WRITE_CONVERT       },
+               { "rw",                 XFSSTAT_END_READ_WRITE_OPS      },
+               { "attr",               XFSSTAT_END_ATTRIBUTE_OPS       },
+               { "icluster",           XFSSTAT_END_INODE_CLUSTER       },
+               { "vnodes",             XFSSTAT_END_VNODE_OPS           },
+               { "buf",                XFSSTAT_END_BUF                 },
+               { "abtb2",              XFSSTAT_END_ABTB_V2             },
+               { "abtc2",              XFSSTAT_END_ABTC_V2             },
+               { "bmbt2",              XFSSTAT_END_BMBT_V2             },
+               { "ibt2",               XFSSTAT_END_IBT_V2              },
+       };
+
+       /* Loop over all stats groups */
+       for (i=j = 0; i < ARRAY_SIZE(xstats); i++) {
+               seq_printf(m, "%s", xstats[i].desc);
+               /* inner loop does each group */
+               while (j < xstats[i].endpoint) {
+                       val = 0;
+                       /* sum over all cpus */
+                       for_each_possible_cpu(c)
+                               val += *(((__u32*)&per_cpu(xfsstats, c) + j));
+                       seq_printf(m, " %u", val);
+                       j++;
+               }
+               seq_putc(m, '\n');
+       }
+       /* extra precision counters */
+       for_each_possible_cpu(i) {
+               xs_xstrat_bytes += per_cpu(xfsstats, i).xs_xstrat_bytes;
+               xs_write_bytes += per_cpu(xfsstats, i).xs_write_bytes;
+               xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes;
+       }
+
+       seq_printf(m, "xpc %Lu %Lu %Lu\n",
+                       xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
+       seq_printf(m, "debug %u\n",
+#if defined(DEBUG)
+               1);
+#else
+               0);
+#endif
+       return 0;
+}
+
+static int xfs_stat_proc_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, xfs_stat_proc_show, NULL);
+}
+
+static const struct file_operations xfs_stat_proc_fops = {
+       .owner          = THIS_MODULE,
+       .open           = xfs_stat_proc_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+int
+xfs_init_procfs(void)
+{
+       if (!proc_mkdir("fs/xfs", NULL))
+               goto out;
+
+       if (!proc_create("fs/xfs/stat", 0, NULL,
+                        &xfs_stat_proc_fops))
+               goto out_remove_entry;
+       return 0;
+
+ out_remove_entry:
+       remove_proc_entry("fs/xfs", NULL);
+ out:
+       return -ENOMEM;
+}
+
+void
+xfs_cleanup_procfs(void)
+{
+       remove_proc_entry("fs/xfs/stat", NULL);
+       remove_proc_entry("fs/xfs", NULL);
+}
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
new file mode 100644 (file)
index 0000000..736854b
--- /dev/null
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2000,2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_STATS_H__
+#define __XFS_STATS_H__
+
+
+#if defined(CONFIG_PROC_FS) && !defined(XFS_STATS_OFF)
+
+#include <linux/percpu.h>
+
+/*
+ * XFS global statistics
+ */
+struct xfsstats {
+# define XFSSTAT_END_EXTENT_ALLOC      4
+       __uint32_t              xs_allocx;
+       __uint32_t              xs_allocb;
+       __uint32_t              xs_freex;
+       __uint32_t              xs_freeb;
+# define XFSSTAT_END_ALLOC_BTREE       (XFSSTAT_END_EXTENT_ALLOC+4)
+       __uint32_t              xs_abt_lookup;
+       __uint32_t              xs_abt_compare;
+       __uint32_t              xs_abt_insrec;
+       __uint32_t              xs_abt_delrec;
+# define XFSSTAT_END_BLOCK_MAPPING     (XFSSTAT_END_ALLOC_BTREE+7)
+       __uint32_t              xs_blk_mapr;
+       __uint32_t              xs_blk_mapw;
+       __uint32_t              xs_blk_unmap;
+       __uint32_t              xs_add_exlist;
+       __uint32_t              xs_del_exlist;
+       __uint32_t              xs_look_exlist;
+       __uint32_t              xs_cmp_exlist;
+# define XFSSTAT_END_BLOCK_MAP_BTREE   (XFSSTAT_END_BLOCK_MAPPING+4)
+       __uint32_t              xs_bmbt_lookup;
+       __uint32_t              xs_bmbt_compare;
+       __uint32_t              xs_bmbt_insrec;
+       __uint32_t              xs_bmbt_delrec;
+# define XFSSTAT_END_DIRECTORY_OPS     (XFSSTAT_END_BLOCK_MAP_BTREE+4)
+       __uint32_t              xs_dir_lookup;
+       __uint32_t              xs_dir_create;
+       __uint32_t              xs_dir_remove;
+       __uint32_t              xs_dir_getdents;
+# define XFSSTAT_END_TRANSACTIONS      (XFSSTAT_END_DIRECTORY_OPS+3)
+       __uint32_t              xs_trans_sync;
+       __uint32_t              xs_trans_async;
+       __uint32_t              xs_trans_empty;
+# define XFSSTAT_END_INODE_OPS         (XFSSTAT_END_TRANSACTIONS+7)
+       __uint32_t              xs_ig_attempts;
+       __uint32_t              xs_ig_found;
+       __uint32_t              xs_ig_frecycle;
+       __uint32_t              xs_ig_missed;
+       __uint32_t              xs_ig_dup;
+       __uint32_t              xs_ig_reclaims;
+       __uint32_t              xs_ig_attrchg;
+# define XFSSTAT_END_LOG_OPS           (XFSSTAT_END_INODE_OPS+5)
+       __uint32_t              xs_log_writes;
+       __uint32_t              xs_log_blocks;
+       __uint32_t              xs_log_noiclogs;
+       __uint32_t              xs_log_force;
+       __uint32_t              xs_log_force_sleep;
+# define XFSSTAT_END_TAIL_PUSHING      (XFSSTAT_END_LOG_OPS+10)
+       __uint32_t              xs_try_logspace;
+       __uint32_t              xs_sleep_logspace;
+       __uint32_t              xs_push_ail;
+       __uint32_t              xs_push_ail_success;
+       __uint32_t              xs_push_ail_pushbuf;
+       __uint32_t              xs_push_ail_pinned;
+       __uint32_t              xs_push_ail_locked;
+       __uint32_t              xs_push_ail_flushing;
+       __uint32_t              xs_push_ail_restarts;
+       __uint32_t              xs_push_ail_flush;
+# define XFSSTAT_END_WRITE_CONVERT     (XFSSTAT_END_TAIL_PUSHING+2)
+       __uint32_t              xs_xstrat_quick;
+       __uint32_t              xs_xstrat_split;
+# define XFSSTAT_END_READ_WRITE_OPS    (XFSSTAT_END_WRITE_CONVERT+2)
+       __uint32_t              xs_write_calls;
+       __uint32_t              xs_read_calls;
+# define XFSSTAT_END_ATTRIBUTE_OPS     (XFSSTAT_END_READ_WRITE_OPS+4)
+       __uint32_t              xs_attr_get;
+       __uint32_t              xs_attr_set;
+       __uint32_t              xs_attr_remove;
+       __uint32_t              xs_attr_list;
+# define XFSSTAT_END_INODE_CLUSTER     (XFSSTAT_END_ATTRIBUTE_OPS+3)
+       __uint32_t              xs_iflush_count;
+       __uint32_t              xs_icluster_flushcnt;
+       __uint32_t              xs_icluster_flushinode;
+# define XFSSTAT_END_VNODE_OPS         (XFSSTAT_END_INODE_CLUSTER+8)
+       __uint32_t              vn_active;      /* # vnodes not on free lists */
+       __uint32_t              vn_alloc;       /* # times vn_alloc called */
+       __uint32_t              vn_get;         /* # times vn_get called */
+       __uint32_t              vn_hold;        /* # times vn_hold called */
+       __uint32_t              vn_rele;        /* # times vn_rele called */
+       __uint32_t              vn_reclaim;     /* # times vn_reclaim called */
+       __uint32_t              vn_remove;      /* # times vn_remove called */
+       __uint32_t              vn_free;        /* # times vn_free called */
+#define XFSSTAT_END_BUF                        (XFSSTAT_END_VNODE_OPS+9)
+       __uint32_t              xb_get;
+       __uint32_t              xb_create;
+       __uint32_t              xb_get_locked;
+       __uint32_t              xb_get_locked_waited;
+       __uint32_t              xb_busy_locked;
+       __uint32_t              xb_miss_locked;
+       __uint32_t              xb_page_retries;
+       __uint32_t              xb_page_found;
+       __uint32_t              xb_get_read;
+/* Version 2 btree counters */
+#define XFSSTAT_END_ABTB_V2            (XFSSTAT_END_BUF+15)
+       __uint32_t              xs_abtb_2_lookup;
+       __uint32_t              xs_abtb_2_compare;
+       __uint32_t              xs_abtb_2_insrec;
+       __uint32_t              xs_abtb_2_delrec;
+       __uint32_t              xs_abtb_2_newroot;
+       __uint32_t              xs_abtb_2_killroot;
+       __uint32_t              xs_abtb_2_increment;
+       __uint32_t              xs_abtb_2_decrement;
+       __uint32_t              xs_abtb_2_lshift;
+       __uint32_t              xs_abtb_2_rshift;
+       __uint32_t              xs_abtb_2_split;
+       __uint32_t              xs_abtb_2_join;
+       __uint32_t              xs_abtb_2_alloc;
+       __uint32_t              xs_abtb_2_free;
+       __uint32_t              xs_abtb_2_moves;
+#define XFSSTAT_END_ABTC_V2            (XFSSTAT_END_ABTB_V2+15)
+       __uint32_t              xs_abtc_2_lookup;
+       __uint32_t              xs_abtc_2_compare;
+       __uint32_t              xs_abtc_2_insrec;
+       __uint32_t              xs_abtc_2_delrec;
+       __uint32_t              xs_abtc_2_newroot;
+       __uint32_t              xs_abtc_2_killroot;
+       __uint32_t              xs_abtc_2_increment;
+       __uint32_t              xs_abtc_2_decrement;
+       __uint32_t              xs_abtc_2_lshift;
+       __uint32_t              xs_abtc_2_rshift;
+       __uint32_t              xs_abtc_2_split;
+       __uint32_t              xs_abtc_2_join;
+       __uint32_t              xs_abtc_2_alloc;
+       __uint32_t              xs_abtc_2_free;
+       __uint32_t              xs_abtc_2_moves;
+#define XFSSTAT_END_BMBT_V2            (XFSSTAT_END_ABTC_V2+15)
+       __uint32_t              xs_bmbt_2_lookup;
+       __uint32_t              xs_bmbt_2_compare;
+       __uint32_t              xs_bmbt_2_insrec;
+       __uint32_t              xs_bmbt_2_delrec;
+       __uint32_t              xs_bmbt_2_newroot;
+       __uint32_t              xs_bmbt_2_killroot;
+       __uint32_t              xs_bmbt_2_increment;
+       __uint32_t              xs_bmbt_2_decrement;
+       __uint32_t              xs_bmbt_2_lshift;
+       __uint32_t              xs_bmbt_2_rshift;
+       __uint32_t              xs_bmbt_2_split;
+       __uint32_t              xs_bmbt_2_join;
+       __uint32_t              xs_bmbt_2_alloc;
+       __uint32_t              xs_bmbt_2_free;
+       __uint32_t              xs_bmbt_2_moves;
+#define XFSSTAT_END_IBT_V2             (XFSSTAT_END_BMBT_V2+15)
+       __uint32_t              xs_ibt_2_lookup;
+       __uint32_t              xs_ibt_2_compare;
+       __uint32_t              xs_ibt_2_insrec;
+       __uint32_t              xs_ibt_2_delrec;
+       __uint32_t              xs_ibt_2_newroot;
+       __uint32_t              xs_ibt_2_killroot;
+       __uint32_t              xs_ibt_2_increment;
+       __uint32_t              xs_ibt_2_decrement;
+       __uint32_t              xs_ibt_2_lshift;
+       __uint32_t              xs_ibt_2_rshift;
+       __uint32_t              xs_ibt_2_split;
+       __uint32_t              xs_ibt_2_join;
+       __uint32_t              xs_ibt_2_alloc;
+       __uint32_t              xs_ibt_2_free;
+       __uint32_t              xs_ibt_2_moves;
+/* Extra precision counters */
+       __uint64_t              xs_xstrat_bytes;
+       __uint64_t              xs_write_bytes;
+       __uint64_t              xs_read_bytes;
+};
+
+DECLARE_PER_CPU(struct xfsstats, xfsstats);
+
+/*
+ * We don't disable preempt, not too worried about poking the
+ * wrong CPU's stat for now (also aggregated before reporting).
+ */
+#define XFS_STATS_INC(v)       (per_cpu(xfsstats, current_cpu()).v++)
+#define XFS_STATS_DEC(v)       (per_cpu(xfsstats, current_cpu()).v--)
+#define XFS_STATS_ADD(v, inc)  (per_cpu(xfsstats, current_cpu()).v += (inc))
+
+extern int xfs_init_procfs(void);
+extern void xfs_cleanup_procfs(void);
+
+
+#else  /* !CONFIG_PROC_FS */
+
+# define XFS_STATS_INC(count)
+# define XFS_STATS_DEC(count)
+# define XFS_STATS_ADD(count, inc)
+
+static inline int xfs_init_procfs(void)
+{
+       return 0;
+}
+
+static inline void xfs_cleanup_procfs(void)
+{
+}
+
+#endif /* !CONFIG_PROC_FS */
+
+#endif /* __XFS_STATS_H__ */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
new file mode 100644 (file)
index 0000000..9a72dda
--- /dev/null
@@ -0,0 +1,1773 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_itable.h"
+#include "xfs_fsops.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_utils.h"
+#include "xfs_vnodeops.h"
+#include "xfs_log_priv.h"
+#include "xfs_trans_priv.h"
+#include "xfs_filestream.h"
+#include "xfs_da_btree.h"
+#include "xfs_extfree_item.h"
+#include "xfs_mru_cache.h"
+#include "xfs_inode_item.h"
+#include "xfs_sync.h"
+#include "xfs_trace.h"
+
+#include <linux/namei.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/mount.h>
+#include <linux/mempool.h>
+#include <linux/writeback.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/parser.h>
+
+static const struct super_operations xfs_super_operations;
+static kmem_zone_t *xfs_ioend_zone;
+mempool_t *xfs_ioend_pool;
+
+#define MNTOPT_LOGBUFS "logbufs"       /* number of XFS log buffers */
+#define MNTOPT_LOGBSIZE        "logbsize"      /* size of XFS log buffers */
+#define MNTOPT_LOGDEV  "logdev"        /* log device */
+#define MNTOPT_RTDEV   "rtdev"         /* realtime I/O device */
+#define MNTOPT_BIOSIZE "biosize"       /* log2 of preferred buffered io size */
+#define MNTOPT_WSYNC   "wsync"         /* safe-mode nfs compatible mount */
+#define MNTOPT_NOALIGN "noalign"       /* turn off stripe alignment */
+#define MNTOPT_SWALLOC "swalloc"       /* turn on stripe width allocation */
+#define MNTOPT_SUNIT   "sunit"         /* data volume stripe unit */
+#define MNTOPT_SWIDTH  "swidth"        /* data volume stripe width */
+#define MNTOPT_NOUUID  "nouuid"        /* ignore filesystem UUID */
+#define MNTOPT_MTPT    "mtpt"          /* filesystem mount point */
+#define MNTOPT_GRPID   "grpid"         /* group-ID from parent directory */
+#define MNTOPT_NOGRPID "nogrpid"       /* group-ID from current process */
+#define MNTOPT_BSDGROUPS    "bsdgroups"    /* group-ID from parent directory */
+#define MNTOPT_SYSVGROUPS   "sysvgroups"   /* group-ID from current process */
+#define MNTOPT_ALLOCSIZE    "allocsize"    /* preferred allocation size */
+#define MNTOPT_NORECOVERY   "norecovery"   /* don't run XFS recovery */
+#define MNTOPT_BARRIER "barrier"       /* use writer barriers for log write and
+                                        * unwritten extent conversion */
+#define MNTOPT_NOBARRIER "nobarrier"   /* .. disable */
+#define MNTOPT_64BITINODE   "inode64"  /* inodes can be allocated anywhere */
+#define MNTOPT_IKEEP   "ikeep"         /* do not free empty inode clusters */
+#define MNTOPT_NOIKEEP "noikeep"       /* free empty inode clusters */
+#define MNTOPT_LARGEIO    "largeio"    /* report large I/O sizes in stat() */
+#define MNTOPT_NOLARGEIO   "nolargeio" /* do not report large I/O sizes
+                                        * in stat(). */
+#define MNTOPT_ATTR2   "attr2"         /* do use attr2 attribute format */
+#define MNTOPT_NOATTR2 "noattr2"       /* do not use attr2 attribute format */
+#define MNTOPT_FILESTREAM  "filestreams" /* use filestreams allocator */
+#define MNTOPT_QUOTA   "quota"         /* disk quotas (user) */
+#define MNTOPT_NOQUOTA "noquota"       /* no quotas */
+#define MNTOPT_USRQUOTA        "usrquota"      /* user quota enabled */
+#define MNTOPT_GRPQUOTA        "grpquota"      /* group quota enabled */
+#define MNTOPT_PRJQUOTA        "prjquota"      /* project quota enabled */
+#define MNTOPT_UQUOTA  "uquota"        /* user quota (IRIX variant) */
+#define MNTOPT_GQUOTA  "gquota"        /* group quota (IRIX variant) */
+#define MNTOPT_PQUOTA  "pquota"        /* project quota (IRIX variant) */
+#define MNTOPT_UQUOTANOENF "uqnoenforce"/* user quota limit enforcement */
+#define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */
+#define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */
+#define MNTOPT_QUOTANOENF  "qnoenforce"        /* same as uqnoenforce */
+#define MNTOPT_DELAYLOG    "delaylog"  /* Delayed logging enabled */
+#define MNTOPT_NODELAYLOG  "nodelaylog"        /* Delayed logging disabled */
+#define MNTOPT_DISCARD    "discard"    /* Discard unused blocks */
+#define MNTOPT_NODISCARD   "nodiscard" /* Do not discard unused blocks */
+
+/*
+ * Table driven mount option parser.
+ *
+ * Currently only used for remount, but it will be used for mount
+ * in the future, too.
+ */
+enum {
+       Opt_barrier, Opt_nobarrier, Opt_err
+};
+
+static const match_table_t tokens = {
+       {Opt_barrier, "barrier"},
+       {Opt_nobarrier, "nobarrier"},
+       {Opt_err, NULL}
+};
+
+
+STATIC unsigned long
+suffix_strtoul(char *s, char **endp, unsigned int base)
+{
+       int     last, shift_left_factor = 0;
+       char    *value = s;
+
+       last = strlen(value) - 1;
+       if (value[last] == 'K' || value[last] == 'k') {
+               shift_left_factor = 10;
+               value[last] = '\0';
+       }
+       if (value[last] == 'M' || value[last] == 'm') {
+               shift_left_factor = 20;
+               value[last] = '\0';
+       }
+       if (value[last] == 'G' || value[last] == 'g') {
+               shift_left_factor = 30;
+               value[last] = '\0';
+       }
+
+       return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
+}
+
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock has _not_ yet been read in.
+ *
+ * Note that this function leaks the various device name allocations on
+ * failure.  The caller takes care of them.
+ */
+STATIC int
+xfs_parseargs(
+       struct xfs_mount        *mp,
+       char                    *options)
+{
+       struct super_block      *sb = mp->m_super;
+       char                    *this_char, *value, *eov;
+       int                     dsunit = 0;
+       int                     dswidth = 0;
+       int                     iosize = 0;
+       __uint8_t               iosizelog = 0;
+
+       /*
+        * set up the mount name first so all the errors will refer to the
+        * correct device.
+        */
+       mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
+       if (!mp->m_fsname)
+               return ENOMEM;
+       mp->m_fsname_len = strlen(mp->m_fsname) + 1;
+
+       /*
+        * Copy binary VFS mount flags we are interested in.
+        */
+       if (sb->s_flags & MS_RDONLY)
+               mp->m_flags |= XFS_MOUNT_RDONLY;
+       if (sb->s_flags & MS_DIRSYNC)
+               mp->m_flags |= XFS_MOUNT_DIRSYNC;
+       if (sb->s_flags & MS_SYNCHRONOUS)
+               mp->m_flags |= XFS_MOUNT_WSYNC;
+
+       /*
+        * Set some default flags that could be cleared by the mount option
+        * parsing.
+        */
+       mp->m_flags |= XFS_MOUNT_BARRIER;
+       mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+       mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+       mp->m_flags |= XFS_MOUNT_DELAYLOG;
+
+       /*
+        * These can be overridden by the mount option parsing.
+        */
+       mp->m_logbufs = -1;
+       mp->m_logbsize = -1;
+
+       if (!options)
+               goto done;
+
+       while ((this_char = strsep(&options, ",")) != NULL) {
+               if (!*this_char)
+                       continue;
+               if ((value = strchr(this_char, '=')) != NULL)
+                       *value++ = 0;
+
+               if (!strcmp(this_char, MNTOPT_LOGBUFS)) {
+                       if (!value || !*value) {
+                               xfs_warn(mp, "%s option requires an argument",
+                                       this_char);
+                               return EINVAL;
+                       }
+                       mp->m_logbufs = simple_strtoul(value, &eov, 10);
+               } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
+                       if (!value || !*value) {
+                               xfs_warn(mp, "%s option requires an argument",
+                                       this_char);
+                               return EINVAL;
+                       }
+                       mp->m_logbsize = suffix_strtoul(value, &eov, 10);
+               } else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
+                       if (!value || !*value) {
+                               xfs_warn(mp, "%s option requires an argument",
+                                       this_char);
+                               return EINVAL;
+                       }
+                       mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+                       if (!mp->m_logname)
+                               return ENOMEM;
+               } else if (!strcmp(this_char, MNTOPT_MTPT)) {
+                       xfs_warn(mp, "%s option not allowed on this system",
+                               this_char);
+                       return EINVAL;
+               } else if (!strcmp(this_char, MNTOPT_RTDEV)) {
+                       if (!value || !*value) {
+                               xfs_warn(mp, "%s option requires an argument",
+                                       this_char);
+                               return EINVAL;
+                       }
+                       mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+                       if (!mp->m_rtname)
+                               return ENOMEM;
+               } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
+                       if (!value || !*value) {
+                               xfs_warn(mp, "%s option requires an argument",
+                                       this_char);
+                               return EINVAL;
+                       }
+                       iosize = simple_strtoul(value, &eov, 10);
+                       iosizelog = ffs(iosize) - 1;
+               } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
+                       if (!value || !*value) {
+                               xfs_warn(mp, "%s option requires an argument",
+                                       this_char);
+                               return EINVAL;
+                       }
+                       iosize = suffix_strtoul(value, &eov, 10);
+                       iosizelog = ffs(iosize) - 1;
+               } else if (!strcmp(this_char, MNTOPT_GRPID) ||
+                          !strcmp(this_char, MNTOPT_BSDGROUPS)) {
+                       mp->m_flags |= XFS_MOUNT_GRPID;
+               } else if (!strcmp(this_char, MNTOPT_NOGRPID) ||
+                          !strcmp(this_char, MNTOPT_SYSVGROUPS)) {
+                       mp->m_flags &= ~XFS_MOUNT_GRPID;
+               } else if (!strcmp(this_char, MNTOPT_WSYNC)) {
+                       mp->m_flags |= XFS_MOUNT_WSYNC;
+               } else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
+                       mp->m_flags |= XFS_MOUNT_NORECOVERY;
+               } else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
+                       mp->m_flags |= XFS_MOUNT_NOALIGN;
+               } else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
+                       mp->m_flags |= XFS_MOUNT_SWALLOC;
+               } else if (!strcmp(this_char, MNTOPT_SUNIT)) {
+                       if (!value || !*value) {
+                               xfs_warn(mp, "%s option requires an argument",
+                                       this_char);
+                               return EINVAL;
+                       }
+                       dsunit = simple_strtoul(value, &eov, 10);
+               } else if (!strcmp(this_char, MNTOPT_SWIDTH)) {
+                       if (!value || !*value) {
+                               xfs_warn(mp, "%s option requires an argument",
+                                       this_char);
+                               return EINVAL;
+                       }
+                       dswidth = simple_strtoul(value, &eov, 10);
+               } else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
+                       mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+#if !XFS_BIG_INUMS
+                       xfs_warn(mp, "%s option not allowed on this system",
+                               this_char);
+                       return EINVAL;
+#endif
+               } else if (!strcmp(this_char, MNTOPT_NOUUID)) {
+                       mp->m_flags |= XFS_MOUNT_NOUUID;
+               } else if (!strcmp(this_char, MNTOPT_BARRIER)) {
+                       mp->m_flags |= XFS_MOUNT_BARRIER;
+               } else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
+                       mp->m_flags &= ~XFS_MOUNT_BARRIER;
+               } else if (!strcmp(this_char, MNTOPT_IKEEP)) {
+                       mp->m_flags |= XFS_MOUNT_IKEEP;
+               } else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
+                       mp->m_flags &= ~XFS_MOUNT_IKEEP;
+               } else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
+                       mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
+               } else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
+                       mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+               } else if (!strcmp(this_char, MNTOPT_ATTR2)) {
+                       mp->m_flags |= XFS_MOUNT_ATTR2;
+               } else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
+                       mp->m_flags &= ~XFS_MOUNT_ATTR2;
+                       mp->m_flags |= XFS_MOUNT_NOATTR2;
+               } else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
+                       mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+               } else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
+                       mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+                                         XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+                                         XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+                                         XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
+               } else if (!strcmp(this_char, MNTOPT_QUOTA) ||
+                          !strcmp(this_char, MNTOPT_UQUOTA) ||
+                          !strcmp(this_char, MNTOPT_USRQUOTA)) {
+                       mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+                                        XFS_UQUOTA_ENFD);
+               } else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
+                          !strcmp(this_char, MNTOPT_UQUOTANOENF)) {
+                       mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+                       mp->m_qflags &= ~XFS_UQUOTA_ENFD;
+               } else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
+                          !strcmp(this_char, MNTOPT_PRJQUOTA)) {
+                       mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+                                        XFS_OQUOTA_ENFD);
+               } else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
+                       mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+               } else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
+                          !strcmp(this_char, MNTOPT_GRPQUOTA)) {
+                       mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+                                        XFS_OQUOTA_ENFD);
+               } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
+                       mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+                       mp->m_qflags &= ~XFS_OQUOTA_ENFD;
+               } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
+                       mp->m_flags |= XFS_MOUNT_DELAYLOG;
+               } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
+                       mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
+               } else if (!strcmp(this_char, MNTOPT_DISCARD)) {
+                       mp->m_flags |= XFS_MOUNT_DISCARD;
+               } else if (!strcmp(this_char, MNTOPT_NODISCARD)) {
+                       mp->m_flags &= ~XFS_MOUNT_DISCARD;
+               } else if (!strcmp(this_char, "ihashsize")) {
+                       xfs_warn(mp,
+       "ihashsize no longer used, option is deprecated.");
+               } else if (!strcmp(this_char, "osyncisdsync")) {
+                       xfs_warn(mp,
+       "osyncisdsync has no effect, option is deprecated.");
+               } else if (!strcmp(this_char, "osyncisosync")) {
+                       xfs_warn(mp,
+       "osyncisosync has no effect, option is deprecated.");
+               } else if (!strcmp(this_char, "irixsgid")) {
+                       xfs_warn(mp,
+       "irixsgid is now a sysctl(2) variable, option is deprecated.");
+               } else {
+                       xfs_warn(mp, "unknown mount option [%s].", this_char);
+                       return EINVAL;
+               }
+       }
+
+       /*
+        * no recovery flag requires a read-only mount
+        */
+       if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
+           !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+               xfs_warn(mp, "no-recovery mounts must be read-only.");
+               return EINVAL;
+       }
+
+       if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
+               xfs_warn(mp,
+       "sunit and swidth options incompatible with the noalign option");
+               return EINVAL;
+       }
+
+       if ((mp->m_flags & XFS_MOUNT_DISCARD) &&
+           !(mp->m_flags & XFS_MOUNT_DELAYLOG)) {
+               xfs_warn(mp,
+       "the discard option is incompatible with the nodelaylog option");
+               return EINVAL;
+       }
+
+#ifndef CONFIG_XFS_QUOTA
+       if (XFS_IS_QUOTA_RUNNING(mp)) {
+               xfs_warn(mp, "quota support not available in this kernel.");
+               return EINVAL;
+       }
+#endif
+
+       if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+           (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
+               xfs_warn(mp, "cannot mount with both project and group quota");
+               return EINVAL;
+       }
+
+       if ((dsunit && !dswidth) || (!dsunit && dswidth)) {
+               xfs_warn(mp, "sunit and swidth must be specified together");
+               return EINVAL;
+       }
+
+       if (dsunit && (dswidth % dsunit != 0)) {
+               xfs_warn(mp,
+       "stripe width (%d) must be a multiple of the stripe unit (%d)",
+                       dswidth, dsunit);
+               return EINVAL;
+       }
+
+done:
+       if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+               /*
+                * At this point the superblock has not been read
+                * in, therefore we do not know the block size.
+                * Before the mount call ends we will convert
+                * these to FSBs.
+                */
+               if (dsunit) {
+                       mp->m_dalign = dsunit;
+                       mp->m_flags |= XFS_MOUNT_RETERR;
+               }
+
+               if (dswidth)
+                       mp->m_swidth = dswidth;
+       }
+
+       if (mp->m_logbufs != -1 &&
+           mp->m_logbufs != 0 &&
+           (mp->m_logbufs < XLOG_MIN_ICLOGS ||
+            mp->m_logbufs > XLOG_MAX_ICLOGS)) {
+               xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]",
+                       mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+               return XFS_ERROR(EINVAL);
+       }
+       if (mp->m_logbsize != -1 &&
+           mp->m_logbsize !=  0 &&
+           (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
+            mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
+            !is_power_of_2(mp->m_logbsize))) {
+               xfs_warn(mp,
+                       "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+                       mp->m_logbsize);
+               return XFS_ERROR(EINVAL);
+       }
+
+       if (iosizelog) {
+               if (iosizelog > XFS_MAX_IO_LOG ||
+                   iosizelog < XFS_MIN_IO_LOG) {
+                       xfs_warn(mp, "invalid log iosize: %d [not %d-%d]",
+                               iosizelog, XFS_MIN_IO_LOG,
+                               XFS_MAX_IO_LOG);
+                       return XFS_ERROR(EINVAL);
+               }
+
+               mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+               mp->m_readio_log = iosizelog;
+               mp->m_writeio_log = iosizelog;
+       }
+
+       return 0;
+}
+
+struct proc_xfs_info {
+       int     flag;
+       char    *str;
+};
+
+STATIC int
+xfs_showargs(
+       struct xfs_mount        *mp,
+       struct seq_file         *m)
+{
+       static struct proc_xfs_info xfs_info_set[] = {
+               /* the few simple ones we can get from the mount struct */
+               { XFS_MOUNT_IKEEP,              "," MNTOPT_IKEEP },
+               { XFS_MOUNT_WSYNC,              "," MNTOPT_WSYNC },
+               { XFS_MOUNT_NOALIGN,            "," MNTOPT_NOALIGN },
+               { XFS_MOUNT_SWALLOC,            "," MNTOPT_SWALLOC },
+               { XFS_MOUNT_NOUUID,             "," MNTOPT_NOUUID },
+               { XFS_MOUNT_NORECOVERY,         "," MNTOPT_NORECOVERY },
+               { XFS_MOUNT_ATTR2,              "," MNTOPT_ATTR2 },
+               { XFS_MOUNT_FILESTREAMS,        "," MNTOPT_FILESTREAM },
+               { XFS_MOUNT_GRPID,              "," MNTOPT_GRPID },
+               { XFS_MOUNT_DELAYLOG,           "," MNTOPT_DELAYLOG },
+               { XFS_MOUNT_DISCARD,            "," MNTOPT_DISCARD },
+               { 0, NULL }
+       };
+       static struct proc_xfs_info xfs_info_unset[] = {
+               /* the few simple ones we can get from the mount struct */
+               { XFS_MOUNT_COMPAT_IOSIZE,      "," MNTOPT_LARGEIO },
+               { XFS_MOUNT_BARRIER,            "," MNTOPT_NOBARRIER },
+               { XFS_MOUNT_SMALL_INUMS,        "," MNTOPT_64BITINODE },
+               { 0, NULL }
+       };
+       struct proc_xfs_info    *xfs_infop;
+
+       for (xfs_infop = xfs_info_set; xfs_infop->flag; xfs_infop++) {
+               if (mp->m_flags & xfs_infop->flag)
+                       seq_puts(m, xfs_infop->str);
+       }
+       for (xfs_infop = xfs_info_unset; xfs_infop->flag; xfs_infop++) {
+               if (!(mp->m_flags & xfs_infop->flag))
+                       seq_puts(m, xfs_infop->str);
+       }
+
+       if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)
+               seq_printf(m, "," MNTOPT_ALLOCSIZE "=%dk",
+                               (int)(1 << mp->m_writeio_log) >> 10);
+
+       if (mp->m_logbufs > 0)
+               seq_printf(m, "," MNTOPT_LOGBUFS "=%d", mp->m_logbufs);
+       if (mp->m_logbsize > 0)
+               seq_printf(m, "," MNTOPT_LOGBSIZE "=%dk", mp->m_logbsize >> 10);
+
+       if (mp->m_logname)
+               seq_printf(m, "," MNTOPT_LOGDEV "=%s", mp->m_logname);
+       if (mp->m_rtname)
+               seq_printf(m, "," MNTOPT_RTDEV "=%s", mp->m_rtname);
+
+       if (mp->m_dalign > 0)
+               seq_printf(m, "," MNTOPT_SUNIT "=%d",
+                               (int)XFS_FSB_TO_BB(mp, mp->m_dalign));
+       if (mp->m_swidth > 0)
+               seq_printf(m, "," MNTOPT_SWIDTH "=%d",
+                               (int)XFS_FSB_TO_BB(mp, mp->m_swidth));
+
+       if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
+               seq_puts(m, "," MNTOPT_USRQUOTA);
+       else if (mp->m_qflags & XFS_UQUOTA_ACCT)
+               seq_puts(m, "," MNTOPT_UQUOTANOENF);
+
+       /* Either project or group quotas can be active, not both */
+
+       if (mp->m_qflags & XFS_PQUOTA_ACCT) {
+               if (mp->m_qflags & XFS_OQUOTA_ENFD)
+                       seq_puts(m, "," MNTOPT_PRJQUOTA);
+               else
+                       seq_puts(m, "," MNTOPT_PQUOTANOENF);
+       } else if (mp->m_qflags & XFS_GQUOTA_ACCT) {
+               if (mp->m_qflags & XFS_OQUOTA_ENFD)
+                       seq_puts(m, "," MNTOPT_GRPQUOTA);
+               else
+                       seq_puts(m, "," MNTOPT_GQUOTANOENF);
+       }
+
+       if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT))
+               seq_puts(m, "," MNTOPT_NOQUOTA);
+
+       return 0;
+}
+__uint64_t
+xfs_max_file_offset(
+       unsigned int            blockshift)
+{
+       unsigned int            pagefactor = 1;
+       unsigned int            bitshift = BITS_PER_LONG - 1;
+
+       /* Figure out maximum filesize, on Linux this can depend on
+        * the filesystem blocksize (on 32 bit platforms).
+        * __block_write_begin does this in an [unsigned] long...
+        *      page->index << (PAGE_CACHE_SHIFT - bbits)
+        * So, for page sized blocks (4K on 32 bit platforms),
+        * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
+        *      (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+        * but for smaller blocksizes it is less (bbits = log2 bsize).
+        * Note1: get_block_t takes a long (implicit cast from above)
+        * Note2: The Large Block Device (LBD and HAVE_SECTOR_T) patch
+        * can optionally convert the [unsigned] long from above into
+        * an [unsigned] long long.
+        */
+
+#if BITS_PER_LONG == 32
+# if defined(CONFIG_LBDAF)
+       ASSERT(sizeof(sector_t) == 8);
+       pagefactor = PAGE_CACHE_SIZE;
+       bitshift = BITS_PER_LONG;
+# else
+       pagefactor = PAGE_CACHE_SIZE >> (PAGE_CACHE_SHIFT - blockshift);
+# endif
+#endif
+
+       return (((__uint64_t)pagefactor) << bitshift) - 1;
+}
+
+STATIC int
+xfs_blkdev_get(
+       xfs_mount_t             *mp,
+       const char              *name,
+       struct block_device     **bdevp)
+{
+       int                     error = 0;
+
+       *bdevp = blkdev_get_by_path(name, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+                                   mp);
+       if (IS_ERR(*bdevp)) {
+               error = PTR_ERR(*bdevp);
+               xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error);
+       }
+
+       return -error;
+}
+
+STATIC void
+xfs_blkdev_put(
+       struct block_device     *bdev)
+{
+       if (bdev)
+               blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+}
+
+void
+xfs_blkdev_issue_flush(
+       xfs_buftarg_t           *buftarg)
+{
+       blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL);
+}
+
+STATIC void
+xfs_close_devices(
+       struct xfs_mount        *mp)
+{
+       if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+               struct block_device *logdev = mp->m_logdev_targp->bt_bdev;
+               xfs_free_buftarg(mp, mp->m_logdev_targp);
+               xfs_blkdev_put(logdev);
+       }
+       if (mp->m_rtdev_targp) {
+               struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev;
+               xfs_free_buftarg(mp, mp->m_rtdev_targp);
+               xfs_blkdev_put(rtdev);
+       }
+       xfs_free_buftarg(mp, mp->m_ddev_targp);
+}
+
+/*
+ * The file system configurations are:
+ *     (1) device (partition) with data and internal log
+ *     (2) logical volume with data and log subvolumes.
+ *     (3) logical volume with data, log, and realtime subvolumes.
+ *
+ * We only have to handle opening the log and realtime volumes here if
+ * they are present.  The data subvolume has already been opened by
+ * get_sb_bdev() and is stored in sb->s_bdev.
+ */
+STATIC int
+xfs_open_devices(
+       struct xfs_mount        *mp)
+{
+       struct block_device     *ddev = mp->m_super->s_bdev;
+       struct block_device     *logdev = NULL, *rtdev = NULL;
+       int                     error;
+
+       /*
+        * Open real time and log devices - order is important.
+        */
+       if (mp->m_logname) {
+               error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
+               if (error)
+                       goto out;
+       }
+
+       if (mp->m_rtname) {
+               error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
+               if (error)
+                       goto out_close_logdev;
+
+               if (rtdev == ddev || rtdev == logdev) {
+                       xfs_warn(mp,
+       "Cannot mount filesystem with identical rtdev and ddev/logdev.");
+                       error = EINVAL;
+                       goto out_close_rtdev;
+               }
+       }
+
+       /*
+        * Setup xfs_mount buffer target pointers
+        */
+       error = ENOMEM;
+       mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname);
+       if (!mp->m_ddev_targp)
+               goto out_close_rtdev;
+
+       if (rtdev) {
+               mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1,
+                                                       mp->m_fsname);
+               if (!mp->m_rtdev_targp)
+                       goto out_free_ddev_targ;
+       }
+
+       if (logdev && logdev != ddev) {
+               mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1,
+                                                       mp->m_fsname);
+               if (!mp->m_logdev_targp)
+                       goto out_free_rtdev_targ;
+       } else {
+               mp->m_logdev_targp = mp->m_ddev_targp;
+       }
+
+       return 0;
+
+ out_free_rtdev_targ:
+       if (mp->m_rtdev_targp)
+               xfs_free_buftarg(mp, mp->m_rtdev_targp);
+ out_free_ddev_targ:
+       xfs_free_buftarg(mp, mp->m_ddev_targp);
+ out_close_rtdev:
+       if (rtdev)
+               xfs_blkdev_put(rtdev);
+ out_close_logdev:
+       if (logdev && logdev != ddev)
+               xfs_blkdev_put(logdev);
+ out:
+       return error;
+}
+
+/*
+ * Setup xfs_mount buffer target pointers based on superblock
+ */
+STATIC int
+xfs_setup_devices(
+       struct xfs_mount        *mp)
+{
+       int                     error;
+
+       error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_blocksize,
+                                   mp->m_sb.sb_sectsize);
+       if (error)
+               return error;
+
+       if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) {
+               unsigned int    log_sector_size = BBSIZE;
+
+               if (xfs_sb_version_hassector(&mp->m_sb))
+                       log_sector_size = mp->m_sb.sb_logsectsize;
+               error = xfs_setsize_buftarg(mp->m_logdev_targp,
+                                           mp->m_sb.sb_blocksize,
+                                           log_sector_size);
+               if (error)
+                       return error;
+       }
+       if (mp->m_rtdev_targp) {
+               error = xfs_setsize_buftarg(mp->m_rtdev_targp,
+                                           mp->m_sb.sb_blocksize,
+                                           mp->m_sb.sb_sectsize);
+               if (error)
+                       return error;
+       }
+
+       return 0;
+}
+
+/* Catch misguided souls that try to use this interface on XFS */
+STATIC struct inode *
+xfs_fs_alloc_inode(
+       struct super_block      *sb)
+{
+       BUG();
+       return NULL;
+}
+
+/*
+ * Now that the generic code is guaranteed not to be accessing
+ * the linux inode, we can reclaim the inode.
+ */
+STATIC void
+xfs_fs_destroy_inode(
+       struct inode            *inode)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+
+       trace_xfs_destroy_inode(ip);
+
+       XFS_STATS_INC(vn_reclaim);
+
+       /* bad inode, get out here ASAP */
+       if (is_bad_inode(inode))
+               goto out_reclaim;
+
+       xfs_ioend_wait(ip);
+
+       ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+
+       /*
+        * We should never get here with one of the reclaim flags already set.
+        */
+       ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIMABLE));
+       ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM));
+
+       /*
+        * We always use background reclaim here because even if the
+        * inode is clean, it still may be under IO and hence we have
+        * to take the flush lock. The background reclaim path handles
+        * this more efficiently than we can here, so simply let background
+        * reclaim tear down all inodes.
+        */
+out_reclaim:
+       xfs_inode_set_reclaim_tag(ip);
+}
+
+/*
+ * Slab object creation initialisation for the XFS inode.
+ * This covers only the idempotent fields in the XFS inode;
+ * all other fields need to be initialised on allocation
+ * from the slab. This avoids the need to repeatedly initialise
+ * fields in the xfs inode that left in the initialise state
+ * when freeing the inode.
+ */
+STATIC void
+xfs_fs_inode_init_once(
+       void                    *inode)
+{
+       struct xfs_inode        *ip = inode;
+
+       memset(ip, 0, sizeof(struct xfs_inode));
+
+       /* vfs inode */
+       inode_init_once(VFS_I(ip));
+
+       /* xfs inode */
+       atomic_set(&ip->i_iocount, 0);
+       atomic_set(&ip->i_pincount, 0);
+       spin_lock_init(&ip->i_flags_lock);
+       init_waitqueue_head(&ip->i_ipin_wait);
+       /*
+        * Because we want to use a counting completion, complete
+        * the flush completion once to allow a single access to
+        * the flush completion without blocking.
+        */
+       init_completion(&ip->i_flush);
+       complete(&ip->i_flush);
+
+       mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+                    "xfsino", ip->i_ino);
+}
+
+/*
+ * Dirty the XFS inode when mark_inode_dirty_sync() is called so that
+ * we catch unlogged VFS level updates to the inode.
+ *
+ * We need the barrier() to maintain correct ordering between unlogged
+ * updates and the transaction commit code that clears the i_update_core
+ * field. This requires all updates to be completed before marking the
+ * inode dirty.
+ */
+STATIC void
+xfs_fs_dirty_inode(
+       struct inode    *inode,
+       int             flags)
+{
+       barrier();
+       XFS_I(inode)->i_update_core = 1;
+}
+
+STATIC int
+xfs_log_inode(
+       struct xfs_inode        *ip)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       struct xfs_trans        *tp;
+       int                     error;
+
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+       error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0);
+
+       if (error) {
+               xfs_trans_cancel(tp, 0);
+               /* we need to return with the lock hold shared */
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+               return error;
+       }
+
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+       /*
+        * Note - it's possible that we might have pushed ourselves out of the
+        * way during trans_reserve which would flush the inode.  But there's
+        * no guarantee that the inode buffer has actually gone out yet (it's
+        * delwri).  Plus the buffer could be pinned anyway if it's part of
+        * an inode in another recent transaction.  So we play it safe and
+        * fire off the transaction anyway.
+        */
+       xfs_trans_ijoin(tp, ip);
+       xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+       error = xfs_trans_commit(tp, 0);
+       xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
+
+       return error;
+}
+
+STATIC int
+xfs_fs_write_inode(
+       struct inode            *inode,
+       struct writeback_control *wbc)
+{
+       struct xfs_inode        *ip = XFS_I(inode);
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     error = EAGAIN;
+
+       trace_xfs_write_inode(ip);
+
+       if (XFS_FORCED_SHUTDOWN(mp))
+               return XFS_ERROR(EIO);
+
+       if (wbc->sync_mode == WB_SYNC_ALL) {
+               /*
+                * Make sure the inode has made it it into the log.  Instead
+                * of forcing it all the way to stable storage using a
+                * synchronous transaction we let the log force inside the
+                * ->sync_fs call do that for thus, which reduces the number
+                * of synchronous log foces dramatically.
+                */
+               xfs_ioend_wait(ip);
+               xfs_ilock(ip, XFS_ILOCK_SHARED);
+               if (ip->i_update_core) {
+                       error = xfs_log_inode(ip);
+                       if (error)
+                               goto out_unlock;
+               }
+       } else {
+               /*
+                * We make this non-blocking if the inode is contended, return
+                * EAGAIN to indicate to the caller that they did not succeed.
+                * This prevents the flush path from blocking on inodes inside
+                * another operation right now, they get caught later by
+                * xfs_sync.
+                */
+               if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
+                       goto out;
+
+               if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip))
+                       goto out_unlock;
+
+               /*
+                * Now we have the flush lock and the inode is not pinned, we
+                * can check if the inode is really clean as we know that
+                * there are no pending transaction completions, it is not
+                * waiting on the delayed write queue and there is no IO in
+                * progress.
+                */
+               if (xfs_inode_clean(ip)) {
+                       xfs_ifunlock(ip);
+                       error = 0;
+                       goto out_unlock;
+               }
+               error = xfs_iflush(ip, SYNC_TRYLOCK);
+       }
+
+ out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ out:
+       /*
+        * if we failed to write out the inode then mark
+        * it dirty again so we'll try again later.
+        */
+       if (error)
+               xfs_mark_inode_dirty_sync(ip);
+       return -error;
+}
+
+STATIC void
+xfs_fs_evict_inode(
+       struct inode            *inode)
+{
+       xfs_inode_t             *ip = XFS_I(inode);
+
+       trace_xfs_evict_inode(ip);
+
+       truncate_inode_pages(&inode->i_data, 0);
+       end_writeback(inode);
+       XFS_STATS_INC(vn_rele);
+       XFS_STATS_INC(vn_remove);
+       XFS_STATS_DEC(vn_active);
+
+       /*
+        * The iolock is used by the file system to coordinate reads,
+        * writes, and block truncates.  Up to this point the lock
+        * protected concurrent accesses by users of the inode.  But
+        * from here forward we're doing some final processing of the
+        * inode because we're done with it, and although we reuse the
+        * iolock for protection it is really a distinct lock class
+        * (in the lockdep sense) from before.  To keep lockdep happy
+        * (and basically indicate what we are doing), we explicitly
+        * re-init the iolock here.
+        */
+       ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+       mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+       lockdep_set_class_and_name(&ip->i_iolock.mr_lock,
+                       &xfs_iolock_reclaimable, "xfs_iolock_reclaimable");
+
+       xfs_inactive(ip);
+}
+
+STATIC void
+xfs_free_fsname(
+       struct xfs_mount        *mp)
+{
+       kfree(mp->m_fsname);
+       kfree(mp->m_rtname);
+       kfree(mp->m_logname);
+}
+
+STATIC void
+xfs_fs_put_super(
+       struct super_block      *sb)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       xfs_syncd_stop(mp);
+
+       /*
+        * Blow away any referenced inode in the filestreams cache.
+        * This can and will cause log traffic as inodes go inactive
+        * here.
+        */
+       xfs_filestream_unmount(mp);
+
+       XFS_bflush(mp->m_ddev_targp);
+
+       xfs_unmountfs(mp);
+       xfs_freesb(mp);
+       xfs_icsb_destroy_counters(mp);
+       xfs_close_devices(mp);
+       xfs_free_fsname(mp);
+       kfree(mp);
+}
+
+STATIC int
+xfs_fs_sync_fs(
+       struct super_block      *sb,
+       int                     wait)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+       int                     error;
+
+       /*
+        * Not much we can do for the first async pass.  Writing out the
+        * superblock would be counter-productive as we are going to redirty
+        * when writing out other data and metadata (and writing out a single
+        * block is quite fast anyway).
+        *
+        * Try to asynchronously kick off quota syncing at least.
+        */
+       if (!wait) {
+               xfs_qm_sync(mp, SYNC_TRYLOCK);
+               return 0;
+       }
+
+       error = xfs_quiesce_data(mp);
+       if (error)
+               return -error;
+
+       if (laptop_mode) {
+               /*
+                * The disk must be active because we're syncing.
+                * We schedule xfssyncd now (now that the disk is
+                * active) instead of later (when it might not be).
+                */
+               flush_delayed_work_sync(&mp->m_sync_work);
+       }
+
+       return 0;
+}
+
+STATIC int
+xfs_fs_statfs(
+       struct dentry           *dentry,
+       struct kstatfs          *statp)
+{
+       struct xfs_mount        *mp = XFS_M(dentry->d_sb);
+       xfs_sb_t                *sbp = &mp->m_sb;
+       struct xfs_inode        *ip = XFS_I(dentry->d_inode);
+       __uint64_t              fakeinos, id;
+       xfs_extlen_t            lsize;
+       __int64_t               ffree;
+
+       statp->f_type = XFS_SB_MAGIC;
+       statp->f_namelen = MAXNAMELEN - 1;
+
+       id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
+       statp->f_fsid.val[0] = (u32)id;
+       statp->f_fsid.val[1] = (u32)(id >> 32);
+
+       xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT);
+
+       spin_lock(&mp->m_sb_lock);
+       statp->f_bsize = sbp->sb_blocksize;
+       lsize = sbp->sb_logstart ? sbp->sb_logblocks : 0;
+       statp->f_blocks = sbp->sb_dblocks - lsize;
+       statp->f_bfree = statp->f_bavail =
+                               sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp);
+       fakeinos = statp->f_bfree << sbp->sb_inopblog;
+       statp->f_files =
+           MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER);
+       if (mp->m_maxicount)
+               statp->f_files = min_t(typeof(statp->f_files),
+                                       statp->f_files,
+                                       mp->m_maxicount);
+
+       /* make sure statp->f_ffree does not underflow */
+       ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+       statp->f_ffree = max_t(__int64_t, ffree, 0);
+
+       spin_unlock(&mp->m_sb_lock);
+
+       if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
+           ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))) ==
+                             (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD))
+               xfs_qm_statvfs(ip, statp);
+       return 0;
+}
+
+STATIC void
+xfs_save_resvblks(struct xfs_mount *mp)
+{
+       __uint64_t resblks = 0;
+
+       mp->m_resblks_save = mp->m_resblks;
+       xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC void
+xfs_restore_resvblks(struct xfs_mount *mp)
+{
+       __uint64_t resblks;
+
+       if (mp->m_resblks_save) {
+               resblks = mp->m_resblks_save;
+               mp->m_resblks_save = 0;
+       } else
+               resblks = xfs_default_resblks(mp);
+
+       xfs_reserve_blocks(mp, &resblks, NULL);
+}
+
+STATIC int
+xfs_fs_remount(
+       struct super_block      *sb,
+       int                     *flags,
+       char                    *options)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+       substring_t             args[MAX_OPT_ARGS];
+       char                    *p;
+       int                     error;
+
+       while ((p = strsep(&options, ",")) != NULL) {
+               int token;
+
+               if (!*p)
+                       continue;
+
+               token = match_token(p, tokens, args);
+               switch (token) {
+               case Opt_barrier:
+                       mp->m_flags |= XFS_MOUNT_BARRIER;
+                       break;
+               case Opt_nobarrier:
+                       mp->m_flags &= ~XFS_MOUNT_BARRIER;
+                       break;
+               default:
+                       /*
+                        * Logically we would return an error here to prevent
+                        * users from believing they might have changed
+                        * mount options using remount which can't be changed.
+                        *
+                        * But unfortunately mount(8) adds all options from
+                        * mtab and fstab to the mount arguments in some cases
+                        * so we can't blindly reject options, but have to
+                        * check for each specified option if it actually
+                        * differs from the currently set option and only
+                        * reject it if that's the case.
+                        *
+                        * Until that is implemented we return success for
+                        * every remount request, and silently ignore all
+                        * options that we can't actually change.
+                        */
+#if 0
+                       xfs_info(mp,
+               "mount option \"%s\" not supported for remount\n", p);
+                       return -EINVAL;
+#else
+                       break;
+#endif
+               }
+       }
+
+       /* ro -> rw */
+       if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) {
+               mp->m_flags &= ~XFS_MOUNT_RDONLY;
+
+               /*
+                * If this is the first remount to writeable state we
+                * might have some superblock changes to update.
+                */
+               if (mp->m_update_flags) {
+                       error = xfs_mount_log_sb(mp, mp->m_update_flags);
+                       if (error) {
+                               xfs_warn(mp, "failed to write sb changes");
+                               return error;
+                       }
+                       mp->m_update_flags = 0;
+               }
+
+               /*
+                * Fill out the reserve pool if it is empty. Use the stashed
+                * value if it is non-zero, otherwise go with the default.
+                */
+               xfs_restore_resvblks(mp);
+       }
+
+       /* rw -> ro */
+       if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+               /*
+                * After we have synced the data but before we sync the
+                * metadata, we need to free up the reserve block pool so that
+                * the used block count in the superblock on disk is correct at
+                * the end of the remount. Stash the current reserve pool size
+                * so that if we get remounted rw, we can return it to the same
+                * size.
+                */
+
+               xfs_quiesce_data(mp);
+               xfs_save_resvblks(mp);
+               xfs_quiesce_attr(mp);
+               mp->m_flags |= XFS_MOUNT_RDONLY;
+       }
+
+       return 0;
+}
+
+/*
+ * Second stage of a freeze. The data is already frozen so we only
+ * need to take care of the metadata. Once that's done write a dummy
+ * record to dirty the log in case of a crash while frozen.
+ */
+STATIC int
+xfs_fs_freeze(
+       struct super_block      *sb)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       xfs_save_resvblks(mp);
+       xfs_quiesce_attr(mp);
+       return -xfs_fs_log_dummy(mp);
+}
+
+STATIC int
+xfs_fs_unfreeze(
+       struct super_block      *sb)
+{
+       struct xfs_mount        *mp = XFS_M(sb);
+
+       xfs_restore_resvblks(mp);
+       return 0;
+}
+
+STATIC int
+xfs_fs_show_options(
+       struct seq_file         *m,
+       struct vfsmount         *mnt)
+{
+       return -xfs_showargs(XFS_M(mnt->mnt_sb), m);
+}
+
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock _has_ now been read in.
+ */
+STATIC int
+xfs_finish_flags(
+       struct xfs_mount        *mp)
+{
+       int                     ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
+
+       /* Fail a mount where the logbuf is smaller than the log stripe */
+       if (xfs_sb_version_haslogv2(&mp->m_sb)) {
+               if (mp->m_logbsize <= 0 &&
+                   mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
+                       mp->m_logbsize = mp->m_sb.sb_logsunit;
+               } else if (mp->m_logbsize > 0 &&
+                          mp->m_logbsize < mp->m_sb.sb_logsunit) {
+                       xfs_warn(mp,
+               "logbuf size must be greater than or equal to log stripe size");
+                       return XFS_ERROR(EINVAL);
+               }
+       } else {
+               /* Fail a mount if the logbuf is larger than 32K */
+               if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
+                       xfs_warn(mp,
+               "logbuf size for version 1 logs must be 16K or 32K");
+                       return XFS_ERROR(EINVAL);
+               }
+       }
+
+       /*
+        * mkfs'ed attr2 will turn on attr2 mount unless explicitly
+        * told by noattr2 to turn it off
+        */
+       if (xfs_sb_version_hasattr2(&mp->m_sb) &&
+           !(mp->m_flags & XFS_MOUNT_NOATTR2))
+               mp->m_flags |= XFS_MOUNT_ATTR2;
+
+       /*
+        * prohibit r/w mounts of read-only filesystems
+        */
+       if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) {
+               xfs_warn(mp,
+                       "cannot mount a read-only filesystem as read-write");
+               return XFS_ERROR(EROFS);
+       }
+
+       return 0;
+}
+
+STATIC int
+xfs_fs_fill_super(
+       struct super_block      *sb,
+       void                    *data,
+       int                     silent)
+{
+       struct inode            *root;
+       struct xfs_mount        *mp = NULL;
+       int                     flags = 0, error = ENOMEM;
+
+       mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
+       if (!mp)
+               goto out;
+
+       spin_lock_init(&mp->m_sb_lock);
+       mutex_init(&mp->m_growlock);
+       atomic_set(&mp->m_active_trans, 0);
+
+       mp->m_super = sb;
+       sb->s_fs_info = mp;
+
+       error = xfs_parseargs(mp, (char *)data);
+       if (error)
+               goto out_free_fsname;
+
+       sb_min_blocksize(sb, BBSIZE);
+       sb->s_xattr = xfs_xattr_handlers;
+       sb->s_export_op = &xfs_export_operations;
+#ifdef CONFIG_XFS_QUOTA
+       sb->s_qcop = &xfs_quotactl_operations;
+#endif
+       sb->s_op = &xfs_super_operations;
+
+       if (silent)
+               flags |= XFS_MFSI_QUIET;
+
+       error = xfs_open_devices(mp);
+       if (error)
+               goto out_free_fsname;
+
+       error = xfs_icsb_init_counters(mp);
+       if (error)
+               goto out_close_devices;
+
+       error = xfs_readsb(mp, flags);
+       if (error)
+               goto out_destroy_counters;
+
+       error = xfs_finish_flags(mp);
+       if (error)
+               goto out_free_sb;
+
+       error = xfs_setup_devices(mp);
+       if (error)
+               goto out_free_sb;
+
+       error = xfs_filestream_mount(mp);
+       if (error)
+               goto out_free_sb;
+
+       /*
+        * we must configure the block size in the superblock before we run the
+        * full mount process as the mount process can lookup and cache inodes.
+        * For the same reason we must also initialise the syncd and register
+        * the inode cache shrinker so that inodes can be reclaimed during
+        * operations like a quotacheck that iterate all inodes in the
+        * filesystem.
+        */
+       sb->s_magic = XFS_SB_MAGIC;
+       sb->s_blocksize = mp->m_sb.sb_blocksize;
+       sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
+       sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
+       sb->s_time_gran = 1;
+       set_posix_acl_flag(sb);
+
+       error = xfs_mountfs(mp);
+       if (error)
+               goto out_filestream_unmount;
+
+       error = xfs_syncd_init(mp);
+       if (error)
+               goto out_unmount;
+
+       root = igrab(VFS_I(mp->m_rootip));
+       if (!root) {
+               error = ENOENT;
+               goto out_syncd_stop;
+       }
+       if (is_bad_inode(root)) {
+               error = EINVAL;
+               goto out_syncd_stop;
+       }
+       sb->s_root = d_alloc_root(root);
+       if (!sb->s_root) {
+               error = ENOMEM;
+               goto out_iput;
+       }
+
+       return 0;
+
+ out_filestream_unmount:
+       xfs_filestream_unmount(mp);
+ out_free_sb:
+       xfs_freesb(mp);
+ out_destroy_counters:
+       xfs_icsb_destroy_counters(mp);
+ out_close_devices:
+       xfs_close_devices(mp);
+ out_free_fsname:
+       xfs_free_fsname(mp);
+       kfree(mp);
+ out:
+       return -error;
+
+ out_iput:
+       iput(root);
+ out_syncd_stop:
+       xfs_syncd_stop(mp);
+ out_unmount:
+       /*
+        * Blow away any referenced inode in the filestreams cache.
+        * This can and will cause log traffic as inodes go inactive
+        * here.
+        */
+       xfs_filestream_unmount(mp);
+
+       XFS_bflush(mp->m_ddev_targp);
+
+       xfs_unmountfs(mp);
+       goto out_free_sb;
+}
+
+STATIC struct dentry *
+xfs_fs_mount(
+       struct file_system_type *fs_type,
+       int                     flags,
+       const char              *dev_name,
+       void                    *data)
+{
+       return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
+}
+
+static int
+xfs_fs_nr_cached_objects(
+       struct super_block      *sb)
+{
+       return xfs_reclaim_inodes_count(XFS_M(sb));
+}
+
+static void
+xfs_fs_free_cached_objects(
+       struct super_block      *sb,
+       int                     nr_to_scan)
+{
+       xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+}
+
+static const struct super_operations xfs_super_operations = {
+       .alloc_inode            = xfs_fs_alloc_inode,
+       .destroy_inode          = xfs_fs_destroy_inode,
+       .dirty_inode            = xfs_fs_dirty_inode,
+       .write_inode            = xfs_fs_write_inode,
+       .evict_inode            = xfs_fs_evict_inode,
+       .put_super              = xfs_fs_put_super,
+       .sync_fs                = xfs_fs_sync_fs,
+       .freeze_fs              = xfs_fs_freeze,
+       .unfreeze_fs            = xfs_fs_unfreeze,
+       .statfs                 = xfs_fs_statfs,
+       .remount_fs             = xfs_fs_remount,
+       .show_options           = xfs_fs_show_options,
+       .nr_cached_objects      = xfs_fs_nr_cached_objects,
+       .free_cached_objects    = xfs_fs_free_cached_objects,
+};
+
+static struct file_system_type xfs_fs_type = {
+       .owner                  = THIS_MODULE,
+       .name                   = "xfs",
+       .mount                  = xfs_fs_mount,
+       .kill_sb                = kill_block_super,
+       .fs_flags               = FS_REQUIRES_DEV,
+};
+
+STATIC int __init
+xfs_init_zones(void)
+{
+
+       xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
+       if (!xfs_ioend_zone)
+               goto out;
+
+       xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
+                                                 xfs_ioend_zone);
+       if (!xfs_ioend_pool)
+               goto out_destroy_ioend_zone;
+
+       xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
+                                               "xfs_log_ticket");
+       if (!xfs_log_ticket_zone)
+               goto out_destroy_ioend_pool;
+
+       xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
+                                               "xfs_bmap_free_item");
+       if (!xfs_bmap_free_item_zone)
+               goto out_destroy_log_ticket_zone;
+
+       xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
+                                               "xfs_btree_cur");
+       if (!xfs_btree_cur_zone)
+               goto out_destroy_bmap_free_item_zone;
+
+       xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
+                                               "xfs_da_state");
+       if (!xfs_da_state_zone)
+               goto out_destroy_btree_cur_zone;
+
+       xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
+       if (!xfs_dabuf_zone)
+               goto out_destroy_da_state_zone;
+
+       xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
+       if (!xfs_ifork_zone)
+               goto out_destroy_dabuf_zone;
+
+       xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
+       if (!xfs_trans_zone)
+               goto out_destroy_ifork_zone;
+
+       xfs_log_item_desc_zone =
+               kmem_zone_init(sizeof(struct xfs_log_item_desc),
+                              "xfs_log_item_desc");
+       if (!xfs_log_item_desc_zone)
+               goto out_destroy_trans_zone;
+
+       /*
+        * The size of the zone allocated buf log item is the maximum
+        * size possible under XFS.  This wastes a little bit of memory,
+        * but it is much faster.
+        */
+       xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
+                               (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
+                                 NBWORD) * sizeof(int))), "xfs_buf_item");
+       if (!xfs_buf_item_zone)
+               goto out_destroy_log_item_desc_zone;
+
+       xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+                       ((XFS_EFD_MAX_FAST_EXTENTS - 1) *
+                                sizeof(xfs_extent_t))), "xfs_efd_item");
+       if (!xfs_efd_zone)
+               goto out_destroy_buf_item_zone;
+
+       xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+                       ((XFS_EFI_MAX_FAST_EXTENTS - 1) *
+                               sizeof(xfs_extent_t))), "xfs_efi_item");
+       if (!xfs_efi_zone)
+               goto out_destroy_efd_zone;
+
+       xfs_inode_zone =
+               kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
+                       KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
+                       xfs_fs_inode_init_once);
+       if (!xfs_inode_zone)
+               goto out_destroy_efi_zone;
+
+       xfs_ili_zone =
+               kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
+                                       KM_ZONE_SPREAD, NULL);
+       if (!xfs_ili_zone)
+               goto out_destroy_inode_zone;
+
+       return 0;
+
+ out_destroy_inode_zone:
+       kmem_zone_destroy(xfs_inode_zone);
+ out_destroy_efi_zone:
+       kmem_zone_destroy(xfs_efi_zone);
+ out_destroy_efd_zone:
+       kmem_zone_destroy(xfs_efd_zone);
+ out_destroy_buf_item_zone:
+       kmem_zone_destroy(xfs_buf_item_zone);
+ out_destroy_log_item_desc_zone:
+       kmem_zone_destroy(xfs_log_item_desc_zone);
+ out_destroy_trans_zone:
+       kmem_zone_destroy(xfs_trans_zone);
+ out_destroy_ifork_zone:
+       kmem_zone_destroy(xfs_ifork_zone);
+ out_destroy_dabuf_zone:
+       kmem_zone_destroy(xfs_dabuf_zone);
+ out_destroy_da_state_zone:
+       kmem_zone_destroy(xfs_da_state_zone);
+ out_destroy_btree_cur_zone:
+       kmem_zone_destroy(xfs_btree_cur_zone);
+ out_destroy_bmap_free_item_zone:
+       kmem_zone_destroy(xfs_bmap_free_item_zone);
+ out_destroy_log_ticket_zone:
+       kmem_zone_destroy(xfs_log_ticket_zone);
+ out_destroy_ioend_pool:
+       mempool_destroy(xfs_ioend_pool);
+ out_destroy_ioend_zone:
+       kmem_zone_destroy(xfs_ioend_zone);
+ out:
+       return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_zones(void)
+{
+       kmem_zone_destroy(xfs_ili_zone);
+       kmem_zone_destroy(xfs_inode_zone);
+       kmem_zone_destroy(xfs_efi_zone);
+       kmem_zone_destroy(xfs_efd_zone);
+       kmem_zone_destroy(xfs_buf_item_zone);
+       kmem_zone_destroy(xfs_log_item_desc_zone);
+       kmem_zone_destroy(xfs_trans_zone);
+       kmem_zone_destroy(xfs_ifork_zone);
+       kmem_zone_destroy(xfs_dabuf_zone);
+       kmem_zone_destroy(xfs_da_state_zone);
+       kmem_zone_destroy(xfs_btree_cur_zone);
+       kmem_zone_destroy(xfs_bmap_free_item_zone);
+       kmem_zone_destroy(xfs_log_ticket_zone);
+       mempool_destroy(xfs_ioend_pool);
+       kmem_zone_destroy(xfs_ioend_zone);
+
+}
+
+STATIC int __init
+xfs_init_workqueues(void)
+{
+       /*
+        * max_active is set to 8 to give enough concurency to allow
+        * multiple work operations on each CPU to run. This allows multiple
+        * filesystems to be running sync work concurrently, and scales with
+        * the number of CPUs in the system.
+        */
+       xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
+       if (!xfs_syncd_wq)
+               goto out;
+
+       xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
+       if (!xfs_ail_wq)
+               goto out_destroy_syncd;
+
+       return 0;
+
+out_destroy_syncd:
+       destroy_workqueue(xfs_syncd_wq);
+out:
+       return -ENOMEM;
+}
+
+STATIC void
+xfs_destroy_workqueues(void)
+{
+       destroy_workqueue(xfs_ail_wq);
+       destroy_workqueue(xfs_syncd_wq);
+}
+
+STATIC int __init
+init_xfs_fs(void)
+{
+       int                     error;
+
+       printk(KERN_INFO XFS_VERSION_STRING " with "
+                        XFS_BUILD_OPTIONS " enabled\n");
+
+       xfs_ioend_init();
+       xfs_dir_startup();
+
+       error = xfs_init_zones();
+       if (error)
+               goto out;
+
+       error = xfs_init_workqueues();
+       if (error)
+               goto out_destroy_zones;
+
+       error = xfs_mru_cache_init();
+       if (error)
+               goto out_destroy_wq;
+
+       error = xfs_filestream_init();
+       if (error)
+               goto out_mru_cache_uninit;
+
+       error = xfs_buf_init();
+       if (error)
+               goto out_filestream_uninit;
+
+       error = xfs_init_procfs();
+       if (error)
+               goto out_buf_terminate;
+
+       error = xfs_sysctl_register();
+       if (error)
+               goto out_cleanup_procfs;
+
+       vfs_initquota();
+
+       error = register_filesystem(&xfs_fs_type);
+       if (error)
+               goto out_sysctl_unregister;
+       return 0;
+
+ out_sysctl_unregister:
+       xfs_sysctl_unregister();
+ out_cleanup_procfs:
+       xfs_cleanup_procfs();
+ out_buf_terminate:
+       xfs_buf_terminate();
+ out_filestream_uninit:
+       xfs_filestream_uninit();
+ out_mru_cache_uninit:
+       xfs_mru_cache_uninit();
+ out_destroy_wq:
+       xfs_destroy_workqueues();
+ out_destroy_zones:
+       xfs_destroy_zones();
+ out:
+       return error;
+}
+
+STATIC void __exit
+exit_xfs_fs(void)
+{
+       vfs_exitquota();
+       unregister_filesystem(&xfs_fs_type);
+       xfs_sysctl_unregister();
+       xfs_cleanup_procfs();
+       xfs_buf_terminate();
+       xfs_filestream_uninit();
+       xfs_mru_cache_uninit();
+       xfs_destroy_workqueues();
+       xfs_destroy_zones();
+}
+
+module_init(init_xfs_fs);
+module_exit(exit_xfs_fs);
+
+MODULE_AUTHOR("Silicon Graphics, Inc.");
+MODULE_DESCRIPTION(XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled");
+MODULE_LICENSE("GPL");
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
new file mode 100644 (file)
index 0000000..50a3266
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SUPER_H__
+#define __XFS_SUPER_H__
+
+#include <linux/exportfs.h>
+
+#ifdef CONFIG_XFS_QUOTA
+extern void xfs_qm_init(void);
+extern void xfs_qm_exit(void);
+# define vfs_initquota()       xfs_qm_init()
+# define vfs_exitquota()       xfs_qm_exit()
+#else
+# define vfs_initquota()       do { } while (0)
+# define vfs_exitquota()       do { } while (0)
+#endif
+
+#ifdef CONFIG_XFS_POSIX_ACL
+# define XFS_ACL_STRING                "ACLs, "
+# define set_posix_acl_flag(sb)        ((sb)->s_flags |= MS_POSIXACL)
+#else
+# define XFS_ACL_STRING
+# define set_posix_acl_flag(sb)        do { } while (0)
+#endif
+
+#define XFS_SECURITY_STRING    "security attributes, "
+
+#ifdef CONFIG_XFS_RT
+# define XFS_REALTIME_STRING   "realtime, "
+#else
+# define XFS_REALTIME_STRING
+#endif
+
+#if XFS_BIG_BLKNOS
+# if XFS_BIG_INUMS
+#  define XFS_BIGFS_STRING     "large block/inode numbers, "
+# else
+#  define XFS_BIGFS_STRING     "large block numbers, "
+# endif
+#else
+# define XFS_BIGFS_STRING
+#endif
+
+#ifdef DEBUG
+# define XFS_DBG_STRING                "debug"
+#else
+# define XFS_DBG_STRING                "no debug"
+#endif
+
+#define XFS_VERSION_STRING     "SGI XFS"
+#define XFS_BUILD_OPTIONS      XFS_ACL_STRING \
+                               XFS_SECURITY_STRING \
+                               XFS_REALTIME_STRING \
+                               XFS_BIGFS_STRING \
+                               XFS_DBG_STRING /* DBG must be last */
+
+struct xfs_inode;
+struct xfs_mount;
+struct xfs_buftarg;
+struct block_device;
+
+extern __uint64_t xfs_max_file_offset(unsigned int);
+
+extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
+
+extern const struct export_operations xfs_export_operations;
+extern const struct xattr_handler *xfs_xattr_handlers[];
+extern const struct quotactl_ops xfs_quotactl_operations;
+
+#define XFS_M(sb)              ((struct xfs_mount *)((sb)->s_fs_info))
+
+#endif /* __XFS_SUPER_H__ */
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
new file mode 100644 (file)
index 0000000..4604f90
--- /dev/null
@@ -0,0 +1,1065 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_fsops.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+struct workqueue_struct        *xfs_syncd_wq;  /* sync workqueue */
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH       32
+
+STATIC int
+xfs_inode_ag_walk_grab(
+       struct xfs_inode        *ip)
+{
+       struct inode            *inode = VFS_I(ip);
+
+       ASSERT(rcu_read_lock_held());
+
+       /*
+        * check for stale RCU freed inode
+        *
+        * If the inode has been reallocated, it doesn't matter if it's not in
+        * the AG we are walking - we are walking for writeback, so if it
+        * passes all the "valid inode" checks and is dirty, then we'll write
+        * it back anyway.  If it has been reallocated and still being
+        * initialised, the XFS_INEW check below will catch it.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (!ip->i_ino)
+               goto out_unlock_noent;
+
+       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+               goto out_unlock_noent;
+       spin_unlock(&ip->i_flags_lock);
+
+       /* nothing to sync during shutdown */
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return EFSCORRUPTED;
+
+       /* If we can't grab the inode, it must on it's way to reclaim. */
+       if (!igrab(inode))
+               return ENOENT;
+
+       if (is_bad_inode(inode)) {
+               IRELE(ip);
+               return ENOENT;
+       }
+
+       /* inode is valid */
+       return 0;
+
+out_unlock_noent:
+       spin_unlock(&ip->i_flags_lock);
+       return ENOENT;
+}
+
+STATIC int
+xfs_inode_ag_walk(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag,
+       int                     (*execute)(struct xfs_inode *ip,
+                                          struct xfs_perag *pag, int flags),
+       int                     flags)
+{
+       uint32_t                first_index;
+       int                     last_error = 0;
+       int                     skipped;
+       int                     done;
+       int                     nr_found;
+
+restart:
+       done = 0;
+       skipped = 0;
+       first_index = 0;
+       nr_found = 0;
+       do {
+               struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+               int             error = 0;
+               int             i;
+
+               rcu_read_lock();
+               nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                                       (void **)batch, first_index,
+                                       XFS_LOOKUP_BATCH);
+               if (!nr_found) {
+                       rcu_read_unlock();
+                       break;
+               }
+
+               /*
+                * Grab the inodes before we drop the lock. if we found
+                * nothing, nr == 0 and the loop will be skipped.
+                */
+               for (i = 0; i < nr_found; i++) {
+                       struct xfs_inode *ip = batch[i];
+
+                       if (done || xfs_inode_ag_walk_grab(ip))
+                               batch[i] = NULL;
+
+                       /*
+                        * Update the index for the next lookup. Catch
+                        * overflows into the next AG range which can occur if
+                        * we have inodes in the last block of the AG and we
+                        * are currently pointing to the last inode.
+                        *
+                        * Because we may see inodes that are from the wrong AG
+                        * due to RCU freeing and reallocation, only update the
+                        * index if it lies in this AG. It was a race that lead
+                        * us to see this inode, so another lookup from the
+                        * same index will not find it again.
+                        */
+                       if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                               continue;
+                       first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+                       if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+                               done = 1;
+               }
+
+               /* unlock now we've grabbed the inodes. */
+               rcu_read_unlock();
+
+               for (i = 0; i < nr_found; i++) {
+                       if (!batch[i])
+                               continue;
+                       error = execute(batch[i], pag, flags);
+                       IRELE(batch[i]);
+                       if (error == EAGAIN) {
+                               skipped++;
+                               continue;
+                       }
+                       if (error && last_error != EFSCORRUPTED)
+                               last_error = error;
+               }
+
+               /* bail out if the filesystem is corrupted.  */
+               if (error == EFSCORRUPTED)
+                       break;
+
+               cond_resched();
+
+       } while (nr_found && !done);
+
+       if (skipped) {
+               delay(1);
+               goto restart;
+       }
+       return last_error;
+}
+
+int
+xfs_inode_ag_iterator(
+       struct xfs_mount        *mp,
+       int                     (*execute)(struct xfs_inode *ip,
+                                          struct xfs_perag *pag, int flags),
+       int                     flags)
+{
+       struct xfs_perag        *pag;
+       int                     error = 0;
+       int                     last_error = 0;
+       xfs_agnumber_t          ag;
+
+       ag = 0;
+       while ((pag = xfs_perag_get(mp, ag))) {
+               ag = pag->pag_agno + 1;
+               error = xfs_inode_ag_walk(mp, pag, execute, flags);
+               xfs_perag_put(pag);
+               if (error) {
+                       last_error = error;
+                       if (error == EFSCORRUPTED)
+                               break;
+               }
+       }
+       return XFS_ERROR(last_error);
+}
+
+STATIC int
+xfs_sync_inode_data(
+       struct xfs_inode        *ip,
+       struct xfs_perag        *pag,
+       int                     flags)
+{
+       struct inode            *inode = VFS_I(ip);
+       struct address_space *mapping = inode->i_mapping;
+       int                     error = 0;
+
+       if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+               goto out_wait;
+
+       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
+               if (flags & SYNC_TRYLOCK)
+                       goto out_wait;
+               xfs_ilock(ip, XFS_IOLOCK_SHARED);
+       }
+
+       error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
+                               0 : XBF_ASYNC, FI_NONE);
+       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+
+ out_wait:
+       if (flags & SYNC_WAIT)
+               xfs_ioend_wait(ip);
+       return error;
+}
+
+STATIC int
+xfs_sync_inode_attr(
+       struct xfs_inode        *ip,
+       struct xfs_perag        *pag,
+       int                     flags)
+{
+       int                     error = 0;
+
+       xfs_ilock(ip, XFS_ILOCK_SHARED);
+       if (xfs_inode_clean(ip))
+               goto out_unlock;
+       if (!xfs_iflock_nowait(ip)) {
+               if (!(flags & SYNC_WAIT))
+                       goto out_unlock;
+               xfs_iflock(ip);
+       }
+
+       if (xfs_inode_clean(ip)) {
+               xfs_ifunlock(ip);
+               goto out_unlock;
+       }
+
+       error = xfs_iflush(ip, flags);
+
+       /*
+        * We don't want to try again on non-blocking flushes that can't run
+        * again immediately. If an inode really must be written, then that's
+        * what the SYNC_WAIT flag is for.
+        */
+       if (error == EAGAIN) {
+               ASSERT(!(flags & SYNC_WAIT));
+               error = 0;
+       }
+
+ out_unlock:
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
+       return error;
+}
+
+/*
+ * Write out pagecache data for the whole filesystem.
+ */
+STATIC int
+xfs_sync_data(
+       struct xfs_mount        *mp,
+       int                     flags)
+{
+       int                     error;
+
+       ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
+
+       error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
+       if (error)
+               return XFS_ERROR(error);
+
+       xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
+       return 0;
+}
+
+/*
+ * Write out inode metadata (attributes) for the whole filesystem.
+ */
+STATIC int
+xfs_sync_attr(
+       struct xfs_mount        *mp,
+       int                     flags)
+{
+       ASSERT((flags & ~SYNC_WAIT) == 0);
+
+       return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags);
+}
+
+STATIC int
+xfs_sync_fsdata(
+       struct xfs_mount        *mp)
+{
+       struct xfs_buf          *bp;
+
+       /*
+        * If the buffer is pinned then push on the log so we won't get stuck
+        * waiting in the write for someone, maybe ourselves, to flush the log.
+        *
+        * Even though we just pushed the log above, we did not have the
+        * superblock buffer locked at that point so it can become pinned in
+        * between there and here.
+        */
+       bp = xfs_getsb(mp, 0);
+       if (xfs_buf_ispinned(bp))
+               xfs_log_force(mp, 0);
+
+       return xfs_bwrite(mp, bp);
+}
+
+/*
+ * When remounting a filesystem read-only or freezing the filesystem, we have
+ * two phases to execute. This first phase is syncing the data before we
+ * quiesce the filesystem, and the second is flushing all the inodes out after
+ * we've waited for all the transactions created by the first phase to
+ * complete. The second phase ensures that the inodes are written to their
+ * location on disk rather than just existing in transactions in the log. This
+ * means after a quiesce there is no log replay required to write the inodes to
+ * disk (this is the main difference between a sync and a quiesce).
+ */
+/*
+ * First stage of freeze - no writers will make progress now we are here,
+ * so we flush delwri and delalloc buffers here, then wait for all I/O to
+ * complete.  Data is frozen at that point. Metadata is not frozen,
+ * transactions can still occur here so don't bother flushing the buftarg
+ * because it'll just get dirty again.
+ */
+int
+xfs_quiesce_data(
+       struct xfs_mount        *mp)
+{
+       int                     error, error2 = 0;
+
+       xfs_qm_sync(mp, SYNC_TRYLOCK);
+       xfs_qm_sync(mp, SYNC_WAIT);
+
+       /* force out the newly dirtied log buffers */
+       xfs_log_force(mp, XFS_LOG_SYNC);
+
+       /* write superblock and hoover up shutdown errors */
+       error = xfs_sync_fsdata(mp);
+
+       /* make sure all delwri buffers are written out */
+       xfs_flush_buftarg(mp->m_ddev_targp, 1);
+
+       /* mark the log as covered if needed */
+       if (xfs_log_need_covered(mp))
+               error2 = xfs_fs_log_dummy(mp);
+
+       /* flush data-only devices */
+       if (mp->m_rtdev_targp)
+               XFS_bflush(mp->m_rtdev_targp);
+
+       return error ? error : error2;
+}
+
+STATIC void
+xfs_quiesce_fs(
+       struct xfs_mount        *mp)
+{
+       int     count = 0, pincount;
+
+       xfs_reclaim_inodes(mp, 0);
+       xfs_flush_buftarg(mp->m_ddev_targp, 0);
+
+       /*
+        * This loop must run at least twice.  The first instance of the loop
+        * will flush most meta data but that will generate more meta data
+        * (typically directory updates).  Which then must be flushed and
+        * logged before we can write the unmount record. We also so sync
+        * reclaim of inodes to catch any that the above delwri flush skipped.
+        */
+       do {
+               xfs_reclaim_inodes(mp, SYNC_WAIT);
+               xfs_sync_attr(mp, SYNC_WAIT);
+               pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
+               if (!pincount) {
+                       delay(50);
+                       count++;
+               }
+       } while (count < 2);
+}
+
+/*
+ * Second stage of a quiesce. The data is already synced, now we have to take
+ * care of the metadata. New transactions are already blocked, so we need to
+ * wait for any remaining transactions to drain out before proceeding.
+ */
+void
+xfs_quiesce_attr(
+       struct xfs_mount        *mp)
+{
+       int     error = 0;
+
+       /* wait for all modifications to complete */
+       while (atomic_read(&mp->m_active_trans) > 0)
+               delay(100);
+
+       /* flush inodes and push all remaining buffers out to disk */
+       xfs_quiesce_fs(mp);
+
+       /*
+        * Just warn here till VFS can correctly support
+        * read-only remount without racing.
+        */
+       WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+       /* Push the superblock and write an unmount record */
+       error = xfs_log_sbcount(mp);
+       if (error)
+               xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
+                               "Frozen image may not be consistent.");
+       xfs_log_unmount_write(mp);
+       xfs_unmountfs_writesb(mp);
+}
+
+static void
+xfs_syncd_queue_sync(
+       struct xfs_mount        *mp)
+{
+       queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
+                               msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items, reclaim inodes and sync
+ * disk quotas.  We might need to cover the log to indicate that the
+ * filesystem is idle and not frozen.
+ */
+STATIC void
+xfs_sync_worker(
+       struct work_struct *work)
+{
+       struct xfs_mount *mp = container_of(to_delayed_work(work),
+                                       struct xfs_mount, m_sync_work);
+       int             error;
+
+       if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+               /* dgc: errors ignored here */
+               if (mp->m_super->s_frozen == SB_UNFROZEN &&
+                   xfs_log_need_covered(mp))
+                       error = xfs_fs_log_dummy(mp);
+               else
+                       xfs_log_force(mp, 0);
+               error = xfs_qm_sync(mp, SYNC_TRYLOCK);
+
+               /* start pushing all the metadata that is currently dirty */
+               xfs_ail_push_all(mp->m_ail);
+       }
+
+       /* queue us up again */
+       xfs_syncd_queue_sync(mp);
+}
+
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_syncd_queue_reclaim(
+       struct xfs_mount        *mp)
+{
+
+       /*
+        * We can have inodes enter reclaim after we've shut down the syncd
+        * workqueue during unmount, so don't allow reclaim work to be queued
+        * during unmount.
+        */
+       if (!(mp->m_super->s_flags & MS_ACTIVE))
+               return;
+
+       rcu_read_lock();
+       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+               queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+STATIC void
+xfs_reclaim_worker(
+       struct work_struct *work)
+{
+       struct xfs_mount *mp = container_of(to_delayed_work(work),
+                                       struct xfs_mount, m_reclaim_work);
+
+       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+       xfs_syncd_queue_reclaim(mp);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations.  At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room.
+ *
+ * Queue a new data flush if there isn't one already in progress and
+ * wait for completion of the flush. This means that we only ever have one
+ * inode flush in progress no matter how many ENOSPC events are occurring and
+ * so will prevent the system from bogging down due to every concurrent
+ * ENOSPC event scanning all the active inodes in the system for writeback.
+ */
+void
+xfs_flush_inodes(
+       struct xfs_inode        *ip)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+
+       queue_work(xfs_syncd_wq, &mp->m_flush_work);
+       flush_work_sync(&mp->m_flush_work);
+}
+
+STATIC void
+xfs_flush_worker(
+       struct work_struct *work)
+{
+       struct xfs_mount *mp = container_of(work,
+                                       struct xfs_mount, m_flush_work);
+
+       xfs_sync_data(mp, SYNC_TRYLOCK);
+       xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
+}
+
+int
+xfs_syncd_init(
+       struct xfs_mount        *mp)
+{
+       INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
+       INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
+       INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+
+       xfs_syncd_queue_sync(mp);
+       xfs_syncd_queue_reclaim(mp);
+
+       return 0;
+}
+
+void
+xfs_syncd_stop(
+       struct xfs_mount        *mp)
+{
+       cancel_delayed_work_sync(&mp->m_sync_work);
+       cancel_delayed_work_sync(&mp->m_reclaim_work);
+       cancel_work_sync(&mp->m_flush_work);
+}
+
+void
+__xfs_inode_set_reclaim_tag(
+       struct xfs_perag        *pag,
+       struct xfs_inode        *ip)
+{
+       radix_tree_tag_set(&pag->pag_ici_root,
+                          XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+                          XFS_ICI_RECLAIM_TAG);
+
+       if (!pag->pag_ici_reclaimable) {
+               /* propagate the reclaim tag up into the perag radix tree */
+               spin_lock(&ip->i_mount->m_perag_lock);
+               radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                               XFS_ICI_RECLAIM_TAG);
+               spin_unlock(&ip->i_mount->m_perag_lock);
+
+               /* schedule periodic background inode reclaim */
+               xfs_syncd_queue_reclaim(ip->i_mount);
+
+               trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+                                                       -1, _RET_IP_);
+       }
+       pag->pag_ici_reclaimable++;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+       xfs_inode_t     *ip)
+{
+       struct xfs_mount *mp = ip->i_mount;
+       struct xfs_perag *pag;
+
+       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+       spin_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       __xfs_inode_set_reclaim_tag(pag, ip);
+       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+       spin_unlock(&ip->i_flags_lock);
+       spin_unlock(&pag->pag_ici_lock);
+       xfs_perag_put(pag);
+}
+
+STATIC void
+__xfs_inode_clear_reclaim(
+       xfs_perag_t     *pag,
+       xfs_inode_t     *ip)
+{
+       pag->pag_ici_reclaimable--;
+       if (!pag->pag_ici_reclaimable) {
+               /* clear the reclaim tag from the perag radix tree */
+               spin_lock(&ip->i_mount->m_perag_lock);
+               radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                               XFS_ICI_RECLAIM_TAG);
+               spin_unlock(&ip->i_mount->m_perag_lock);
+               trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+                                                       -1, _RET_IP_);
+       }
+}
+
+void
+__xfs_inode_clear_reclaim_tag(
+       xfs_mount_t     *mp,
+       xfs_perag_t     *pag,
+       xfs_inode_t     *ip)
+{
+       radix_tree_tag_clear(&pag->pag_ici_root,
+                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+       __xfs_inode_clear_reclaim(pag, ip);
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+       struct xfs_inode        *ip,
+       int                     flags)
+{
+       ASSERT(rcu_read_lock_held());
+
+       /* quick check for stale RCU freed inode */
+       if (!ip->i_ino)
+               return 1;
+
+       /*
+        * do some unlocked checks first to avoid unnecessary lock traffic.
+        * The first is a flush lock check, the second is a already in reclaim
+        * check. Only do these checks if we are not going to block on locks.
+        */
+       if ((flags & SYNC_TRYLOCK) &&
+           (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) {
+               return 1;
+       }
+
+       /*
+        * The radix tree lock here protects a thread in xfs_iget from racing
+        * with us starting reclaim on the inode.  Once we have the
+        * XFS_IRECLAIM flag set it will not touch us.
+        *
+        * Due to RCU lookup, we may find inodes that have been freed and only
+        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+        * aren't candidates for reclaim at all, so we must check the
+        * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+           __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+               /* not a reclaim candidate. */
+               spin_unlock(&ip->i_flags_lock);
+               return 1;
+       }
+       __xfs_iflags_set(ip, XFS_IRECLAIM);
+       spin_unlock(&ip->i_flags_lock);
+       return 0;
+}
+
+/*
+ * Inodes in different states need to be treated differently, and the return
+ * value of xfs_iflush is not sufficient to get this right. The following table
+ * lists the inode states and the reclaim actions necessary for non-blocking
+ * reclaim:
+ *
+ *
+ *     inode state          iflush ret         required action
+ *      ---------------      ----------         ---------------
+ *     bad                     -               reclaim
+ *     shutdown                EIO             unpin and reclaim
+ *     clean, unpinned         0               reclaim
+ *     stale, unpinned         0               reclaim
+ *     clean, pinned(*)        0               requeue
+ *     stale, pinned           EAGAIN          requeue
+ *     dirty, delwri ok        0               requeue
+ *     dirty, delwri blocked   EAGAIN          requeue
+ *     dirty, sync flush       0               reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * As can be seen from the table, the return value of xfs_iflush() is not
+ * sufficient to correctly decide the reclaim action here. The checks in
+ * xfs_iflush() might look like duplicates, but they are not.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean. The clean inode check needs to be done before flushing
+ * the inode delwri otherwise we would loop forever requeuing clean inodes as
+ * we cannot tell apart a successful delwri flush and a clean inode from the
+ * return value of xfs_iflush().
+ *
+ * Note that because the inode is flushed delayed write by background
+ * writeback, the flush lock may already be held here and waiting on it can
+ * result in very long latencies. Hence for sync reclaims, where we wait on the
+ * flush lock, the caller should push out delayed write inodes first before
+ * trying to reclaim them to minimise the amount of time spent waiting. For
+ * background relaim, we just requeue the inode for the next pass.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *     bad             => reclaim
+ *     shutdown        => unpin and reclaim
+ *     pinned, delwri  => requeue
+ *     pinned, sync    => unpin
+ *     stale           => reclaim
+ *     clean           => reclaim
+ *     dirty, delwri   => flush and requeue
+ *     dirty, sync     => flush, wait and reclaim
+ */
+STATIC int
+xfs_reclaim_inode(
+       struct xfs_inode        *ip,
+       struct xfs_perag        *pag,
+       int                     sync_mode)
+{
+       int     error;
+
+restart:
+       error = 0;
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       if (!xfs_iflock_nowait(ip)) {
+               if (!(sync_mode & SYNC_WAIT))
+                       goto out;
+               xfs_iflock(ip);
+       }
+
+       if (is_bad_inode(VFS_I(ip)))
+               goto reclaim;
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+               xfs_iunpin_wait(ip);
+               goto reclaim;
+       }
+       if (xfs_ipincount(ip)) {
+               if (!(sync_mode & SYNC_WAIT)) {
+                       xfs_ifunlock(ip);
+                       goto out;
+               }
+               xfs_iunpin_wait(ip);
+       }
+       if (xfs_iflags_test(ip, XFS_ISTALE))
+               goto reclaim;
+       if (xfs_inode_clean(ip))
+               goto reclaim;
+
+       /*
+        * Now we have an inode that needs flushing.
+        *
+        * We do a nonblocking flush here even if we are doing a SYNC_WAIT
+        * reclaim as we can deadlock with inode cluster removal.
+        * xfs_ifree_cluster() can lock the inode buffer before it locks the
+        * ip->i_lock, and we are doing the exact opposite here. As a result,
+        * doing a blocking xfs_itobp() to get the cluster buffer will result
+        * in an ABBA deadlock with xfs_ifree_cluster().
+        *
+        * As xfs_ifree_cluser() must gather all inodes that are active in the
+        * cache to mark them stale, if we hit this case we don't actually want
+        * to do IO here - we want the inode marked stale so we can simply
+        * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush,
+        * just unlock the inode, back off and try again. Hopefully the next
+        * pass through will see the stale flag set on the inode.
+        */
+       error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode);
+       if (sync_mode & SYNC_WAIT) {
+               if (error == EAGAIN) {
+                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+                       /* backoff longer than in xfs_ifree_cluster */
+                       delay(2);
+                       goto restart;
+               }
+               xfs_iflock(ip);
+               goto reclaim;
+       }
+
+       /*
+        * When we have to flush an inode but don't have SYNC_WAIT set, we
+        * flush the inode out using a delwri buffer and wait for the next
+        * call into reclaim to find it in a clean state instead of waiting for
+        * it now. We also don't return errors here - if the error is transient
+        * then the next reclaim pass will flush the inode, and if the error
+        * is permanent then the next sync reclaim will reclaim the inode and
+        * pass on the error.
+        */
+       if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+               xfs_warn(ip->i_mount,
+                       "inode 0x%llx background reclaim flush failed with %d",
+                       (long long)ip->i_ino, error);
+       }
+out:
+       xfs_iflags_clear(ip, XFS_IRECLAIM);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       /*
+        * We could return EAGAIN here to make reclaim rescan the inode tree in
+        * a short while. However, this just burns CPU time scanning the tree
+        * waiting for IO to complete and xfssyncd never goes back to the idle
+        * state. Instead, return 0 to let the next scheduled background reclaim
+        * attempt to reclaim the inode again.
+        */
+       return 0;
+
+reclaim:
+       xfs_ifunlock(ip);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       XFS_STATS_INC(xs_ig_reclaims);
+       /*
+        * Remove the inode from the per-AG radix tree.
+        *
+        * Because radix_tree_delete won't complain even if the item was never
+        * added to the tree assert that it's been there before to catch
+        * problems with the inode life time early on.
+        */
+       spin_lock(&pag->pag_ici_lock);
+       if (!radix_tree_delete(&pag->pag_ici_root,
+                               XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+               ASSERT(0);
+       __xfs_inode_clear_reclaim(pag, ip);
+       spin_unlock(&pag->pag_ici_lock);
+
+       /*
+        * Here we do an (almost) spurious inode lock in order to coordinate
+        * with inode cache radix tree lookups.  This is because the lookup
+        * can reference the inodes in the cache without taking references.
+        *
+        * We make that OK here by ensuring that we wait until the inode is
+        * unlocked after the lookup before we go ahead and free it.  We get
+        * both the ilock and the iolock because the code may need to drop the
+        * ilock one but will still hold the iolock.
+        */
+       xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+       xfs_qm_dqdetach(ip);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+       xfs_inode_free(ip);
+       return error;
+
+}
+
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shut down during filesystem unmount reclaim walk leak all the
+ * unreclaimed inodes.
+ */
+int
+xfs_reclaim_inodes_ag(
+       struct xfs_mount        *mp,
+       int                     flags,
+       int                     *nr_to_scan)
+{
+       struct xfs_perag        *pag;
+       int                     error = 0;
+       int                     last_error = 0;
+       xfs_agnumber_t          ag;
+       int                     trylock = flags & SYNC_TRYLOCK;
+       int                     skipped;
+
+restart:
+       ag = 0;
+       skipped = 0;
+       while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+               unsigned long   first_index = 0;
+               int             done = 0;
+               int             nr_found = 0;
+
+               ag = pag->pag_agno + 1;
+
+               if (trylock) {
+                       if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+                               skipped++;
+                               xfs_perag_put(pag);
+                               continue;
+                       }
+                       first_index = pag->pag_ici_reclaim_cursor;
+               } else
+                       mutex_lock(&pag->pag_ici_reclaim_lock);
+
+               do {
+                       struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+                       int     i;
+
+                       rcu_read_lock();
+                       nr_found = radix_tree_gang_lookup_tag(
+                                       &pag->pag_ici_root,
+                                       (void **)batch, first_index,
+                                       XFS_LOOKUP_BATCH,
+                                       XFS_ICI_RECLAIM_TAG);
+                       if (!nr_found) {
+                               done = 1;
+                               rcu_read_unlock();
+                               break;
+                       }
+
+                       /*
+                        * Grab the inodes before we drop the lock. if we found
+                        * nothing, nr == 0 and the loop will be skipped.
+                        */
+                       for (i = 0; i < nr_found; i++) {
+                               struct xfs_inode *ip = batch[i];
+
+                               if (done || xfs_reclaim_inode_grab(ip, flags))
+                                       batch[i] = NULL;
+
+                               /*
+                                * Update the index for the next lookup. Catch
+                                * overflows into the next AG range which can
+                                * occur if we have inodes in the last block of
+                                * the AG and we are currently pointing to the
+                                * last inode.
+                                *
+                                * Because we may see inodes that are from the
+                                * wrong AG due to RCU freeing and
+                                * reallocation, only update the index if it
+                                * lies in this AG. It was a race that lead us
+                                * to see this inode, so another lookup from
+                                * the same index will not find it again.
+                                */
+                               if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                               pag->pag_agno)
+                                       continue;
+                               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+                               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+                                       done = 1;
+                       }
+
+                       /* unlock now we've grabbed the inodes. */
+                       rcu_read_unlock();
+
+                       for (i = 0; i < nr_found; i++) {
+                               if (!batch[i])
+                                       continue;
+                               error = xfs_reclaim_inode(batch[i], pag, flags);
+                               if (error && last_error != EFSCORRUPTED)
+                                       last_error = error;
+                       }
+
+                       *nr_to_scan -= XFS_LOOKUP_BATCH;
+
+                       cond_resched();
+
+               } while (nr_found && !done && *nr_to_scan > 0);
+
+               if (trylock && !done)
+                       pag->pag_ici_reclaim_cursor = first_index;
+               else
+                       pag->pag_ici_reclaim_cursor = 0;
+               mutex_unlock(&pag->pag_ici_reclaim_lock);
+               xfs_perag_put(pag);
+       }
+
+       /*
+        * if we skipped any AG, and we still have scan count remaining, do
+        * another pass this time using blocking reclaim semantics (i.e
+        * waiting on the reclaim locks and ignoring the reclaim cursors). This
+        * ensure that when we get more reclaimers than AGs we block rather
+        * than spin trying to execute reclaim.
+        */
+       if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
+               trylock = 0;
+               goto restart;
+       }
+       return XFS_ERROR(last_error);
+}
+
+int
+xfs_reclaim_inodes(
+       xfs_mount_t     *mp,
+       int             mode)
+{
+       int             nr_to_scan = INT_MAX;
+
+       return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+}
+
+/*
+ * Scan a certain number of inodes for reclaim.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we will throttle the speed of reclaim via doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
+ */
+void
+xfs_reclaim_inodes_nr(
+       struct xfs_mount        *mp,
+       int                     nr_to_scan)
+{
+       /* kick background reclaimer and push the AIL */
+       xfs_syncd_queue_reclaim(mp);
+       xfs_ail_push_all(mp->m_ail);
+
+       xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+}
+
+/*
+ * Return the number of reclaimable inodes in the filesystem for
+ * the shrinker to determine how much to reclaim.
+ */
+int
+xfs_reclaim_inodes_count(
+       struct xfs_mount        *mp)
+{
+       struct xfs_perag        *pag;
+       xfs_agnumber_t          ag = 0;
+       int                     reclaimable = 0;
+
+       while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+               ag = pag->pag_agno + 1;
+               reclaimable += pag->pag_ici_reclaimable;
+               xfs_perag_put(pag);
+       }
+       return reclaimable;
+}
+
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
new file mode 100644 (file)
index 0000000..941202e
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+struct xfs_perag;
+
+#define SYNC_WAIT              0x0001  /* wait for i/o to complete */
+#define SYNC_TRYLOCK           0x0002  /* only try to lock inodes */
+
+extern struct workqueue_struct *xfs_syncd_wq;  /* sync workqueue */
+
+int xfs_syncd_init(struct xfs_mount *mp);
+void xfs_syncd_stop(struct xfs_mount *mp);
+
+int xfs_quiesce_data(struct xfs_mount *mp);
+void xfs_quiesce_attr(struct xfs_mount *mp);
+
+void xfs_flush_inodes(struct xfs_inode *ip);
+
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+int xfs_reclaim_inodes_count(struct xfs_mount *mp);
+void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
+                               struct xfs_inode *ip);
+
+int xfs_sync_inode_grab(struct xfs_inode *ip);
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+       int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
+       int flags);
+
+#endif
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
new file mode 100644 (file)
index 0000000..ee2d2ad
--- /dev/null
@@ -0,0 +1,252 @@
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include <linux/sysctl.h>
+#include <linux/proc_fs.h>
+#include "xfs_error.h"
+
+static struct ctl_table_header *xfs_table_header;
+
+#ifdef CONFIG_PROC_FS
+STATIC int
+xfs_stats_clear_proc_handler(
+       ctl_table       *ctl,
+       int             write,
+       void            __user *buffer,
+       size_t          *lenp,
+       loff_t          *ppos)
+{
+       int             c, ret, *valp = ctl->data;
+       __uint32_t      vn_active;
+
+       ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+
+       if (!ret && write && *valp) {
+               xfs_notice(NULL, "Clearing xfsstats");
+               for_each_possible_cpu(c) {
+                       preempt_disable();
+                       /* save vn_active, it's a universal truth! */
+                       vn_active = per_cpu(xfsstats, c).vn_active;
+                       memset(&per_cpu(xfsstats, c), 0,
+                              sizeof(struct xfsstats));
+                       per_cpu(xfsstats, c).vn_active = vn_active;
+                       preempt_enable();
+               }
+               xfs_stats_clear = 0;
+       }
+
+       return ret;
+}
+
+STATIC int
+xfs_panic_mask_proc_handler(
+       ctl_table       *ctl,
+       int             write,
+       void            __user *buffer,
+       size_t          *lenp,
+       loff_t          *ppos)
+{
+       int             ret, *valp = ctl->data;
+
+       ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos);
+       if (!ret && write) {
+               xfs_panic_mask = *valp;
+#ifdef DEBUG
+               xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
+#endif
+       }
+       return ret;
+}
+#endif /* CONFIG_PROC_FS */
+
+static ctl_table xfs_table[] = {
+       {
+               .procname       = "irix_sgid_inherit",
+               .data           = &xfs_params.sgid_inherit.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.sgid_inherit.min,
+               .extra2         = &xfs_params.sgid_inherit.max
+       },
+       {
+               .procname       = "irix_symlink_mode",
+               .data           = &xfs_params.symlink_mode.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.symlink_mode.min,
+               .extra2         = &xfs_params.symlink_mode.max
+       },
+       {
+               .procname       = "panic_mask",
+               .data           = &xfs_params.panic_mask.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = xfs_panic_mask_proc_handler,
+               .extra1         = &xfs_params.panic_mask.min,
+               .extra2         = &xfs_params.panic_mask.max
+       },
+
+       {
+               .procname       = "error_level",
+               .data           = &xfs_params.error_level.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.error_level.min,
+               .extra2         = &xfs_params.error_level.max
+       },
+       {
+               .procname       = "xfssyncd_centisecs",
+               .data           = &xfs_params.syncd_timer.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.syncd_timer.min,
+               .extra2         = &xfs_params.syncd_timer.max
+       },
+       {
+               .procname       = "inherit_sync",
+               .data           = &xfs_params.inherit_sync.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.inherit_sync.min,
+               .extra2         = &xfs_params.inherit_sync.max
+       },
+       {
+               .procname       = "inherit_nodump",
+               .data           = &xfs_params.inherit_nodump.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.inherit_nodump.min,
+               .extra2         = &xfs_params.inherit_nodump.max
+       },
+       {
+               .procname       = "inherit_noatime",
+               .data           = &xfs_params.inherit_noatim.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.inherit_noatim.min,
+               .extra2         = &xfs_params.inherit_noatim.max
+       },
+       {
+               .procname       = "xfsbufd_centisecs",
+               .data           = &xfs_params.xfs_buf_timer.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.xfs_buf_timer.min,
+               .extra2         = &xfs_params.xfs_buf_timer.max
+       },
+       {
+               .procname       = "age_buffer_centisecs",
+               .data           = &xfs_params.xfs_buf_age.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.xfs_buf_age.min,
+               .extra2         = &xfs_params.xfs_buf_age.max
+       },
+       {
+               .procname       = "inherit_nosymlinks",
+               .data           = &xfs_params.inherit_nosym.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.inherit_nosym.min,
+               .extra2         = &xfs_params.inherit_nosym.max
+       },
+       {
+               .procname       = "rotorstep",
+               .data           = &xfs_params.rotorstep.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.rotorstep.min,
+               .extra2         = &xfs_params.rotorstep.max
+       },
+       {
+               .procname       = "inherit_nodefrag",
+               .data           = &xfs_params.inherit_nodfrg.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.inherit_nodfrg.min,
+               .extra2         = &xfs_params.inherit_nodfrg.max
+       },
+       {
+               .procname       = "filestream_centisecs",
+               .data           = &xfs_params.fstrm_timer.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.fstrm_timer.min,
+               .extra2         = &xfs_params.fstrm_timer.max,
+       },
+       /* please keep this the last entry */
+#ifdef CONFIG_PROC_FS
+       {
+               .procname       = "stats_clear",
+               .data           = &xfs_params.stats_clear.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = xfs_stats_clear_proc_handler,
+               .extra1         = &xfs_params.stats_clear.min,
+               .extra2         = &xfs_params.stats_clear.max
+       },
+#endif /* CONFIG_PROC_FS */
+
+       {}
+};
+
+static ctl_table xfs_dir_table[] = {
+       {
+               .procname       = "xfs",
+               .mode           = 0555,
+               .child          = xfs_table
+       },
+       {}
+};
+
+static ctl_table xfs_root_table[] = {
+       {
+               .procname       = "fs",
+               .mode           = 0555,
+               .child          = xfs_dir_table
+       },
+       {}
+};
+
+int
+xfs_sysctl_register(void)
+{
+       xfs_table_header = register_sysctl_table(xfs_root_table);
+       if (!xfs_table_header)
+               return -ENOMEM;
+       return 0;
+}
+
+void
+xfs_sysctl_unregister(void)
+{
+       unregister_sysctl_table(xfs_table_header);
+}
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
new file mode 100644 (file)
index 0000000..b9937d4
--- /dev/null
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2001-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_SYSCTL_H__
+#define __XFS_SYSCTL_H__
+
+#include <linux/sysctl.h>
+
+/*
+ * Tunable xfs parameters
+ */
+
+typedef struct xfs_sysctl_val {
+       int min;
+       int val;
+       int max;
+} xfs_sysctl_val_t;
+
+typedef struct xfs_param {
+       xfs_sysctl_val_t sgid_inherit;  /* Inherit S_ISGID if process' GID is
+                                        * not a member of parent dir GID. */
+       xfs_sysctl_val_t symlink_mode;  /* Link creat mode affected by umask */
+       xfs_sysctl_val_t panic_mask;    /* bitmask to cause panic on errors. */
+       xfs_sysctl_val_t error_level;   /* Degree of reporting for problems  */
+       xfs_sysctl_val_t syncd_timer;   /* Interval between xfssyncd wakeups */
+       xfs_sysctl_val_t stats_clear;   /* Reset all XFS statistics to zero. */
+       xfs_sysctl_val_t inherit_sync;  /* Inherit the "sync" inode flag. */
+       xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */
+       xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */
+       xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */
+       xfs_sysctl_val_t xfs_buf_age;   /* Metadata buffer age before flush. */
+       xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */
+       xfs_sysctl_val_t rotorstep;     /* inode32 AG rotoring control knob */
+       xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
+       xfs_sysctl_val_t fstrm_timer;   /* Filestream dir-AG assoc'n timeout. */
+} xfs_param_t;
+
+/*
+ * xfs_error_level:
+ *
+ * How much error reporting will be done when internal problems are
+ * encountered.  These problems normally return an EFSCORRUPTED to their
+ * caller, with no other information reported.
+ *
+ * 0   No error reports
+ * 1   Report EFSCORRUPTED errors that will cause a filesystem shutdown
+ * 5   Report all EFSCORRUPTED errors (all of the above errors, plus any
+ *     additional errors that are known to not cause shutdowns)
+ *
+ * xfs_panic_mask bit 0x8 turns the error reports into panics
+ */
+
+enum {
+       /* XFS_REFCACHE_SIZE = 1 */
+       /* XFS_REFCACHE_PURGE = 2 */
+       /* XFS_RESTRICT_CHOWN = 3 */
+       XFS_SGID_INHERIT = 4,
+       XFS_SYMLINK_MODE = 5,
+       XFS_PANIC_MASK = 6,
+       XFS_ERRLEVEL = 7,
+       XFS_SYNCD_TIMER = 8,
+       /* XFS_PROBE_DMAPI = 9 */
+       /* XFS_PROBE_IOOPS = 10 */
+       /* XFS_PROBE_QUOTA = 11 */
+       XFS_STATS_CLEAR = 12,
+       XFS_INHERIT_SYNC = 13,
+       XFS_INHERIT_NODUMP = 14,
+       XFS_INHERIT_NOATIME = 15,
+       XFS_BUF_TIMER = 16,
+       XFS_BUF_AGE = 17,
+       /* XFS_IO_BYPASS = 18 */
+       XFS_INHERIT_NOSYM = 19,
+       XFS_ROTORSTEP = 20,
+       XFS_INHERIT_NODFRG = 21,
+       XFS_FILESTREAM_TIMER = 22,
+};
+
+extern xfs_param_t     xfs_params;
+
+#ifdef CONFIG_SYSCTL
+extern int xfs_sysctl_register(void);
+extern void xfs_sysctl_unregister(void);
+#else
+# define xfs_sysctl_register()         (0)
+# define xfs_sysctl_unregister()       do { } while (0)
+#endif /* CONFIG_SYSCTL */
+
+#endif /* __XFS_SYSCTL_H__ */
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
new file mode 100644 (file)
index 0000000..9010ce8
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_mount.h"
+#include "xfs_ialloc.h"
+#include "xfs_itable.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_log_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_quota.h"
+#include "xfs_iomap.h"
+#include "xfs_aops.h"
+#include "xfs_dquot_item.h"
+#include "xfs_dquot.h"
+#include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "xfs_trace.h"
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
new file mode 100644 (file)
index 0000000..690fc7a
--- /dev/null
@@ -0,0 +1,1746 @@
+/*
+ * Copyright (c) 2009, Christoph Hellwig
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM xfs
+
+#if !defined(_TRACE_XFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_XFS_H
+
+#include <linux/tracepoint.h>
+
+struct xfs_agf;
+struct xfs_alloc_arg;
+struct xfs_attr_list_context;
+struct xfs_buf_log_item;
+struct xfs_da_args;
+struct xfs_da_node_entry;
+struct xfs_dquot;
+struct xlog_ticket;
+struct log;
+struct xlog_recover;
+struct xlog_recover_item;
+struct xfs_buf_log_format;
+struct xfs_inode_log_format;
+
+DECLARE_EVENT_CLASS(xfs_attr_list_class,
+       TP_PROTO(struct xfs_attr_list_context *ctx),
+       TP_ARGS(ctx),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(u32, hashval)
+               __field(u32, blkno)
+               __field(u32, offset)
+               __field(void *, alist)
+               __field(int, bufsize)
+               __field(int, count)
+               __field(int, firstu)
+               __field(int, dupcnt)
+               __field(int, flags)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+               __entry->ino = ctx->dp->i_ino;
+               __entry->hashval = ctx->cursor->hashval;
+               __entry->blkno = ctx->cursor->blkno;
+               __entry->offset = ctx->cursor->offset;
+               __entry->alist = ctx->alist;
+               __entry->bufsize = ctx->bufsize;
+               __entry->count = ctx->count;
+               __entry->firstu = ctx->firstu;
+               __entry->flags = ctx->flags;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+                 "alist 0x%p size %u count %u firstu %u flags %d %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->ino,
+                  __entry->hashval,
+                  __entry->blkno,
+                  __entry->offset,
+                  __entry->dupcnt,
+                  __entry->alist,
+                  __entry->bufsize,
+                  __entry->count,
+                  __entry->firstu,
+                  __entry->flags,
+                  __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS)
+       )
+)
+
+#define DEFINE_ATTR_LIST_EVENT(name) \
+DEFINE_EVENT(xfs_attr_list_class, name, \
+       TP_PROTO(struct xfs_attr_list_context *ctx), \
+       TP_ARGS(ctx))
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf_end);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+
+DECLARE_EVENT_CLASS(xfs_perag_class,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
+                unsigned long caller_ip),
+       TP_ARGS(mp, agno, refcount, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(int, refcount)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->refcount = refcount;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d agno %u refcount %d caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->refcount,
+                 (char *)__entry->caller_ip)
+);
+
+#define DEFINE_PERAG_REF_EVENT(name)   \
+DEFINE_EVENT(xfs_perag_class, name,    \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,       \
+                unsigned long caller_ip),                                      \
+       TP_ARGS(mp, agno, refcount, caller_ip))
+DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
+DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+
+TRACE_EVENT(xfs_attr_list_node_descend,
+       TP_PROTO(struct xfs_attr_list_context *ctx,
+                struct xfs_da_node_entry *btree),
+       TP_ARGS(ctx, btree),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(u32, hashval)
+               __field(u32, blkno)
+               __field(u32, offset)
+               __field(void *, alist)
+               __field(int, bufsize)
+               __field(int, count)
+               __field(int, firstu)
+               __field(int, dupcnt)
+               __field(int, flags)
+               __field(u32, bt_hashval)
+               __field(u32, bt_before)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev;
+               __entry->ino = ctx->dp->i_ino;
+               __entry->hashval = ctx->cursor->hashval;
+               __entry->blkno = ctx->cursor->blkno;
+               __entry->offset = ctx->cursor->offset;
+               __entry->alist = ctx->alist;
+               __entry->bufsize = ctx->bufsize;
+               __entry->count = ctx->count;
+               __entry->firstu = ctx->firstu;
+               __entry->flags = ctx->flags;
+               __entry->bt_hashval = be32_to_cpu(btree->hashval);
+               __entry->bt_before = be32_to_cpu(btree->before);
+       ),
+       TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u "
+                 "alist 0x%p size %u count %u firstu %u flags %d %s "
+                 "node hashval %u, node before %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->ino,
+                  __entry->hashval,
+                  __entry->blkno,
+                  __entry->offset,
+                  __entry->dupcnt,
+                  __entry->alist,
+                  __entry->bufsize,
+                  __entry->count,
+                  __entry->firstu,
+                  __entry->flags,
+                  __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS),
+                  __entry->bt_hashval,
+                  __entry->bt_before)
+);
+
+TRACE_EVENT(xfs_iext_insert,
+       TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx,
+                struct xfs_bmbt_irec *r, int state, unsigned long caller_ip),
+       TP_ARGS(ip, idx, r, state, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(xfs_extnum_t, idx)
+               __field(xfs_fileoff_t, startoff)
+               __field(xfs_fsblock_t, startblock)
+               __field(xfs_filblks_t, blockcount)
+               __field(xfs_exntst_t, state)
+               __field(int, bmap_state)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->idx = idx;
+               __entry->startoff = r->br_startoff;
+               __entry->startblock = r->br_startblock;
+               __entry->blockcount = r->br_blockcount;
+               __entry->state = r->br_state;
+               __entry->bmap_state = state;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+                 "offset %lld block %lld count %lld flag %d caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
+                 (long)__entry->idx,
+                 __entry->startoff,
+                 (__int64_t)__entry->startblock,
+                 __entry->blockcount,
+                 __entry->state,
+                 (char *)__entry->caller_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_bmap_class,
+       TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state,
+                unsigned long caller_ip),
+       TP_ARGS(ip, idx, state, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(xfs_extnum_t, idx)
+               __field(xfs_fileoff_t, startoff)
+               __field(xfs_fsblock_t, startblock)
+               __field(xfs_filblks_t, blockcount)
+               __field(xfs_exntst_t, state)
+               __field(int, bmap_state)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               struct xfs_ifork        *ifp = (state & BMAP_ATTRFORK) ?
+                                               ip->i_afp : &ip->i_df;
+               struct xfs_bmbt_irec    r;
+
+               xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->idx = idx;
+               __entry->startoff = r.br_startoff;
+               __entry->startblock = r.br_startblock;
+               __entry->blockcount = r.br_blockcount;
+               __entry->state = r.br_state;
+               __entry->bmap_state = state;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx state %s idx %ld "
+                 "offset %lld block %lld count %lld flag %d caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS),
+                 (long)__entry->idx,
+                 __entry->startoff,
+                 (__int64_t)__entry->startblock,
+                 __entry->blockcount,
+                 __entry->state,
+                 (char *)__entry->caller_ip)
+)
+
+#define DEFINE_BMAP_EVENT(name) \
+DEFINE_EVENT(xfs_bmap_class, name, \
+       TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \
+                unsigned long caller_ip), \
+       TP_ARGS(ip, idx, state, caller_ip))
+DEFINE_BMAP_EVENT(xfs_iext_remove);
+DEFINE_BMAP_EVENT(xfs_bmap_pre_update);
+DEFINE_BMAP_EVENT(xfs_bmap_post_update);
+DEFINE_BMAP_EVENT(xfs_extlist);
+
+DECLARE_EVENT_CLASS(xfs_buf_class,
+       TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip),
+       TP_ARGS(bp, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_daddr_t, bno)
+               __field(size_t, buffer_length)
+               __field(int, hold)
+               __field(int, pincount)
+               __field(unsigned, lockval)
+               __field(unsigned, flags)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = bp->b_target->bt_dev;
+               __entry->bno = bp->b_bn;
+               __entry->buffer_length = bp->b_buffer_length;
+               __entry->hold = atomic_read(&bp->b_hold);
+               __entry->pincount = atomic_read(&bp->b_pin_count);
+               __entry->lockval = bp->b_sema.count;
+               __entry->flags = bp->b_flags;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+                 "lock %d flags %s caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long long)__entry->bno,
+                 __entry->buffer_length,
+                 __entry->hold,
+                 __entry->pincount,
+                 __entry->lockval,
+                 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+                 (void *)__entry->caller_ip)
+)
+
+#define DEFINE_BUF_EVENT(name) \
+DEFINE_EVENT(xfs_buf_class, name, \
+       TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \
+       TP_ARGS(bp, caller_ip))
+DEFINE_BUF_EVENT(xfs_buf_init);
+DEFINE_BUF_EVENT(xfs_buf_free);
+DEFINE_BUF_EVENT(xfs_buf_hold);
+DEFINE_BUF_EVENT(xfs_buf_rele);
+DEFINE_BUF_EVENT(xfs_buf_iodone);
+DEFINE_BUF_EVENT(xfs_buf_iorequest);
+DEFINE_BUF_EVENT(xfs_buf_bawrite);
+DEFINE_BUF_EVENT(xfs_buf_bdwrite);
+DEFINE_BUF_EVENT(xfs_buf_lock);
+DEFINE_BUF_EVENT(xfs_buf_lock_done);
+DEFINE_BUF_EVENT(xfs_buf_trylock);
+DEFINE_BUF_EVENT(xfs_buf_unlock);
+DEFINE_BUF_EVENT(xfs_buf_iowait);
+DEFINE_BUF_EVENT(xfs_buf_iowait_done);
+DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue);
+DEFINE_BUF_EVENT(xfs_buf_delwri_split);
+DEFINE_BUF_EVENT(xfs_buf_get_uncached);
+DEFINE_BUF_EVENT(xfs_bdstrat_shut);
+DEFINE_BUF_EVENT(xfs_buf_item_relse);
+DEFINE_BUF_EVENT(xfs_buf_item_iodone);
+DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
+DEFINE_BUF_EVENT(xfs_buf_error_relse);
+DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
+DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
+
+/* not really buffer traces, but the buf provides useful information */
+DEFINE_BUF_EVENT(xfs_btree_corrupt);
+DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
+DEFINE_BUF_EVENT(xfs_reset_dqcounts);
+DEFINE_BUF_EVENT(xfs_inode_item_push);
+
+/* pass flags explicitly */
+DECLARE_EVENT_CLASS(xfs_buf_flags_class,
+       TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip),
+       TP_ARGS(bp, flags, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_daddr_t, bno)
+               __field(size_t, buffer_length)
+               __field(int, hold)
+               __field(int, pincount)
+               __field(unsigned, lockval)
+               __field(unsigned, flags)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = bp->b_target->bt_dev;
+               __entry->bno = bp->b_bn;
+               __entry->buffer_length = bp->b_buffer_length;
+               __entry->flags = flags;
+               __entry->hold = atomic_read(&bp->b_hold);
+               __entry->pincount = atomic_read(&bp->b_pin_count);
+               __entry->lockval = bp->b_sema.count;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+                 "lock %d flags %s caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long long)__entry->bno,
+                 __entry->buffer_length,
+                 __entry->hold,
+                 __entry->pincount,
+                 __entry->lockval,
+                 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+                 (void *)__entry->caller_ip)
+)
+
+#define DEFINE_BUF_FLAGS_EVENT(name) \
+DEFINE_EVENT(xfs_buf_flags_class, name, \
+       TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \
+       TP_ARGS(bp, flags, caller_ip))
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_find);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_get);
+DEFINE_BUF_FLAGS_EVENT(xfs_buf_read);
+
+TRACE_EVENT(xfs_buf_ioerror,
+       TP_PROTO(struct xfs_buf *bp, int error, unsigned long caller_ip),
+       TP_ARGS(bp, error, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_daddr_t, bno)
+               __field(size_t, buffer_length)
+               __field(unsigned, flags)
+               __field(int, hold)
+               __field(int, pincount)
+               __field(unsigned, lockval)
+               __field(int, error)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = bp->b_target->bt_dev;
+               __entry->bno = bp->b_bn;
+               __entry->buffer_length = bp->b_buffer_length;
+               __entry->hold = atomic_read(&bp->b_hold);
+               __entry->pincount = atomic_read(&bp->b_pin_count);
+               __entry->lockval = bp->b_sema.count;
+               __entry->error = error;
+               __entry->flags = bp->b_flags;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+                 "lock %d error %d flags %s caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long long)__entry->bno,
+                 __entry->buffer_length,
+                 __entry->hold,
+                 __entry->pincount,
+                 __entry->lockval,
+                 __entry->error,
+                 __print_flags(__entry->flags, "|", XFS_BUF_FLAGS),
+                 (void *)__entry->caller_ip)
+);
+
+DECLARE_EVENT_CLASS(xfs_buf_item_class,
+       TP_PROTO(struct xfs_buf_log_item *bip),
+       TP_ARGS(bip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_daddr_t, buf_bno)
+               __field(size_t, buf_len)
+               __field(int, buf_hold)
+               __field(int, buf_pincount)
+               __field(int, buf_lockval)
+               __field(unsigned, buf_flags)
+               __field(unsigned, bli_recur)
+               __field(int, bli_refcount)
+               __field(unsigned, bli_flags)
+               __field(void *, li_desc)
+               __field(unsigned, li_flags)
+       ),
+       TP_fast_assign(
+               __entry->dev = bip->bli_buf->b_target->bt_dev;
+               __entry->bli_flags = bip->bli_flags;
+               __entry->bli_recur = bip->bli_recur;
+               __entry->bli_refcount = atomic_read(&bip->bli_refcount);
+               __entry->buf_bno = bip->bli_buf->b_bn;
+               __entry->buf_len = bip->bli_buf->b_buffer_length;
+               __entry->buf_flags = bip->bli_buf->b_flags;
+               __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold);
+               __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count);
+               __entry->buf_lockval = bip->bli_buf->b_sema.count;
+               __entry->li_desc = bip->bli_item.li_desc;
+               __entry->li_flags = bip->bli_item.li_flags;
+       ),
+       TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d "
+                 "lock %d flags %s recur %d refcount %d bliflags %s "
+                 "lidesc 0x%p liflags %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 (unsigned long long)__entry->buf_bno,
+                 __entry->buf_len,
+                 __entry->buf_hold,
+                 __entry->buf_pincount,
+                 __entry->buf_lockval,
+                 __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS),
+                 __entry->bli_recur,
+                 __entry->bli_refcount,
+                 __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS),
+                 __entry->li_desc,
+                 __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS))
+)
+
+#define DEFINE_BUF_ITEM_EVENT(name) \
+DEFINE_EVENT(xfs_buf_item_class, name, \
+       TP_PROTO(struct xfs_buf_log_item *bip), \
+       TP_ARGS(bip))
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
+DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
+
+DECLARE_EVENT_CLASS(xfs_lock_class,
+       TP_PROTO(struct xfs_inode *ip, unsigned lock_flags,
+                unsigned long caller_ip),
+       TP_ARGS(ip,  lock_flags, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(int, lock_flags)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->lock_flags = lock_flags;
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS),
+                 (void *)__entry->caller_ip)
+)
+
+#define DEFINE_LOCK_EVENT(name) \
+DEFINE_EVENT(xfs_lock_class, name, \
+       TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \
+                unsigned long caller_ip), \
+       TP_ARGS(ip,  lock_flags, caller_ip))
+DEFINE_LOCK_EVENT(xfs_ilock);
+DEFINE_LOCK_EVENT(xfs_ilock_nowait);
+DEFINE_LOCK_EVENT(xfs_ilock_demote);
+DEFINE_LOCK_EVENT(xfs_iunlock);
+
+DECLARE_EVENT_CLASS(xfs_inode_class,
+       TP_PROTO(struct xfs_inode *ip),
+       TP_ARGS(ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino)
+)
+
+#define DEFINE_INODE_EVENT(name) \
+DEFINE_EVENT(xfs_inode_class, name, \
+       TP_PROTO(struct xfs_inode *ip), \
+       TP_ARGS(ip))
+DEFINE_INODE_EVENT(xfs_iget_skip);
+DEFINE_INODE_EVENT(xfs_iget_reclaim);
+DEFINE_INODE_EVENT(xfs_iget_reclaim_fail);
+DEFINE_INODE_EVENT(xfs_iget_hit);
+DEFINE_INODE_EVENT(xfs_iget_miss);
+
+DEFINE_INODE_EVENT(xfs_getattr);
+DEFINE_INODE_EVENT(xfs_setattr);
+DEFINE_INODE_EVENT(xfs_readlink);
+DEFINE_INODE_EVENT(xfs_alloc_file_space);
+DEFINE_INODE_EVENT(xfs_free_file_space);
+DEFINE_INODE_EVENT(xfs_readdir);
+#ifdef CONFIG_XFS_POSIX_ACL
+DEFINE_INODE_EVENT(xfs_get_acl);
+#endif
+DEFINE_INODE_EVENT(xfs_vm_bmap);
+DEFINE_INODE_EVENT(xfs_file_ioctl);
+DEFINE_INODE_EVENT(xfs_file_compat_ioctl);
+DEFINE_INODE_EVENT(xfs_ioctl_setattr);
+DEFINE_INODE_EVENT(xfs_file_fsync);
+DEFINE_INODE_EVENT(xfs_destroy_inode);
+DEFINE_INODE_EVENT(xfs_write_inode);
+DEFINE_INODE_EVENT(xfs_evict_inode);
+
+DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
+DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+
+DECLARE_EVENT_CLASS(xfs_iref_class,
+       TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
+       TP_ARGS(ip, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(int, count)
+               __field(int, pincount)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->count = atomic_read(&VFS_I(ip)->i_count);
+               __entry->pincount = atomic_read(&ip->i_pincount);
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->count,
+                 __entry->pincount,
+                 (char *)__entry->caller_ip)
+)
+
+#define DEFINE_IREF_EVENT(name) \
+DEFINE_EVENT(xfs_iref_class, name, \
+       TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
+       TP_ARGS(ip, caller_ip))
+DEFINE_IREF_EVENT(xfs_ihold);
+DEFINE_IREF_EVENT(xfs_irele);
+DEFINE_IREF_EVENT(xfs_inode_pin);
+DEFINE_IREF_EVENT(xfs_inode_unpin);
+DEFINE_IREF_EVENT(xfs_inode_unpin_nowait);
+
+DECLARE_EVENT_CLASS(xfs_namespace_class,
+       TP_PROTO(struct xfs_inode *dp, struct xfs_name *name),
+       TP_ARGS(dp, name),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, dp_ino)
+               __dynamic_array(char, name, name->len)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(dp)->i_sb->s_dev;
+               __entry->dp_ino = dp->i_ino;
+               memcpy(__get_str(name), name->name, name->len);
+       ),
+       TP_printk("dev %d:%d dp ino 0x%llx name %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->dp_ino,
+                 __get_str(name))
+)
+
+#define DEFINE_NAMESPACE_EVENT(name) \
+DEFINE_EVENT(xfs_namespace_class, name, \
+       TP_PROTO(struct xfs_inode *dp, struct xfs_name *name), \
+       TP_ARGS(dp, name))
+DEFINE_NAMESPACE_EVENT(xfs_remove);
+DEFINE_NAMESPACE_EVENT(xfs_link);
+DEFINE_NAMESPACE_EVENT(xfs_lookup);
+DEFINE_NAMESPACE_EVENT(xfs_create);
+DEFINE_NAMESPACE_EVENT(xfs_symlink);
+
+TRACE_EVENT(xfs_rename,
+       TP_PROTO(struct xfs_inode *src_dp, struct xfs_inode *target_dp,
+                struct xfs_name *src_name, struct xfs_name *target_name),
+       TP_ARGS(src_dp, target_dp, src_name, target_name),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, src_dp_ino)
+               __field(xfs_ino_t, target_dp_ino)
+               __dynamic_array(char, src_name, src_name->len)
+               __dynamic_array(char, target_name, target_name->len)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(src_dp)->i_sb->s_dev;
+               __entry->src_dp_ino = src_dp->i_ino;
+               __entry->target_dp_ino = target_dp->i_ino;
+               memcpy(__get_str(src_name), src_name->name, src_name->len);
+               memcpy(__get_str(target_name), target_name->name, target_name->len);
+       ),
+       TP_printk("dev %d:%d src dp ino 0x%llx target dp ino 0x%llx"
+                 " src name %s target name %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->src_dp_ino,
+                 __entry->target_dp_ino,
+                 __get_str(src_name),
+                 __get_str(target_name))
+)
+
+DECLARE_EVENT_CLASS(xfs_dquot_class,
+       TP_PROTO(struct xfs_dquot *dqp),
+       TP_ARGS(dqp),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(u32, id)
+               __field(unsigned, flags)
+               __field(unsigned, nrefs)
+               __field(unsigned long long, res_bcount)
+               __field(unsigned long long, bcount)
+               __field(unsigned long long, icount)
+               __field(unsigned long long, blk_hardlimit)
+               __field(unsigned long long, blk_softlimit)
+               __field(unsigned long long, ino_hardlimit)
+               __field(unsigned long long, ino_softlimit)
+       ), \
+       TP_fast_assign(
+               __entry->dev = dqp->q_mount->m_super->s_dev;
+               __entry->id = be32_to_cpu(dqp->q_core.d_id);
+               __entry->flags = dqp->dq_flags;
+               __entry->nrefs = dqp->q_nrefs;
+               __entry->res_bcount = dqp->q_res_bcount;
+               __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount);
+               __entry->icount = be64_to_cpu(dqp->q_core.d_icount);
+               __entry->blk_hardlimit =
+                       be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+               __entry->blk_softlimit =
+                       be64_to_cpu(dqp->q_core.d_blk_softlimit);
+               __entry->ino_hardlimit =
+                       be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+               __entry->ino_softlimit =
+                       be64_to_cpu(dqp->q_core.d_ino_softlimit);
+       ),
+       TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx "
+                 "bcnt 0x%llx bhardlimit 0x%llx bsoftlimit 0x%llx "
+                 "icnt 0x%llx ihardlimit 0x%llx isoftlimit 0x%llx]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->id,
+                 __print_flags(__entry->flags, "|", XFS_DQ_FLAGS),
+                 __entry->nrefs,
+                 __entry->res_bcount,
+                 __entry->bcount,
+                 __entry->blk_hardlimit,
+                 __entry->blk_softlimit,
+                 __entry->icount,
+                 __entry->ino_hardlimit,
+                 __entry->ino_softlimit)
+)
+
+#define DEFINE_DQUOT_EVENT(name) \
+DEFINE_EVENT(xfs_dquot_class, name, \
+       TP_PROTO(struct xfs_dquot *dqp), \
+       TP_ARGS(dqp))
+DEFINE_DQUOT_EVENT(xfs_dqadjust);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_want);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_dirty);
+DEFINE_DQUOT_EVENT(xfs_dqreclaim_unlink);
+DEFINE_DQUOT_EVENT(xfs_dqattach_found);
+DEFINE_DQUOT_EVENT(xfs_dqattach_get);
+DEFINE_DQUOT_EVENT(xfs_dqinit);
+DEFINE_DQUOT_EVENT(xfs_dqreuse);
+DEFINE_DQUOT_EVENT(xfs_dqalloc);
+DEFINE_DQUOT_EVENT(xfs_dqtobp_read);
+DEFINE_DQUOT_EVENT(xfs_dqread);
+DEFINE_DQUOT_EVENT(xfs_dqread_fail);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_found);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_want);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_freelist);
+DEFINE_DQUOT_EVENT(xfs_dqlookup_done);
+DEFINE_DQUOT_EVENT(xfs_dqget_hit);
+DEFINE_DQUOT_EVENT(xfs_dqget_miss);
+DEFINE_DQUOT_EVENT(xfs_dqput);
+DEFINE_DQUOT_EVENT(xfs_dqput_wait);
+DEFINE_DQUOT_EVENT(xfs_dqput_free);
+DEFINE_DQUOT_EVENT(xfs_dqrele);
+DEFINE_DQUOT_EVENT(xfs_dqflush);
+DEFINE_DQUOT_EVENT(xfs_dqflush_force);
+DEFINE_DQUOT_EVENT(xfs_dqflush_done);
+
+DECLARE_EVENT_CLASS(xfs_loggrant_class,
+       TP_PROTO(struct log *log, struct xlog_ticket *tic),
+       TP_ARGS(log, tic),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(unsigned, trans_type)
+               __field(char, ocnt)
+               __field(char, cnt)
+               __field(int, curr_res)
+               __field(int, unit_res)
+               __field(unsigned int, flags)
+               __field(int, reserveq)
+               __field(int, writeq)
+               __field(int, grant_reserve_cycle)
+               __field(int, grant_reserve_bytes)
+               __field(int, grant_write_cycle)
+               __field(int, grant_write_bytes)
+               __field(int, curr_cycle)
+               __field(int, curr_block)
+               __field(xfs_lsn_t, tail_lsn)
+       ),
+       TP_fast_assign(
+               __entry->dev = log->l_mp->m_super->s_dev;
+               __entry->trans_type = tic->t_trans_type;
+               __entry->ocnt = tic->t_ocnt;
+               __entry->cnt = tic->t_cnt;
+               __entry->curr_res = tic->t_curr_res;
+               __entry->unit_res = tic->t_unit_res;
+               __entry->flags = tic->t_flags;
+               __entry->reserveq = list_empty(&log->l_reserveq);
+               __entry->writeq = list_empty(&log->l_writeq);
+               xlog_crack_grant_head(&log->l_grant_reserve_head,
+                               &__entry->grant_reserve_cycle,
+                               &__entry->grant_reserve_bytes);
+               xlog_crack_grant_head(&log->l_grant_write_head,
+                               &__entry->grant_write_cycle,
+                               &__entry->grant_write_bytes);
+               __entry->curr_cycle = log->l_curr_cycle;
+               __entry->curr_block = log->l_curr_block;
+               __entry->tail_lsn = atomic64_read(&log->l_tail_lsn);
+       ),
+       TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u "
+                 "t_unit_res %u t_flags %s reserveq %s "
+                 "writeq %s grant_reserve_cycle %d "
+                 "grant_reserve_bytes %d grant_write_cycle %d "
+                 "grant_write_bytes %d curr_cycle %d curr_block %d "
+                 "tail_cycle %d tail_block %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES),
+                 __entry->ocnt,
+                 __entry->cnt,
+                 __entry->curr_res,
+                 __entry->unit_res,
+                 __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS),
+                 __entry->reserveq ? "empty" : "active",
+                 __entry->writeq ? "empty" : "active",
+                 __entry->grant_reserve_cycle,
+                 __entry->grant_reserve_bytes,
+                 __entry->grant_write_cycle,
+                 __entry->grant_write_bytes,
+                 __entry->curr_cycle,
+                 __entry->curr_block,
+                 CYCLE_LSN(__entry->tail_lsn),
+                 BLOCK_LSN(__entry->tail_lsn)
+       )
+)
+
+#define DEFINE_LOGGRANT_EVENT(name) \
+DEFINE_EVENT(xfs_loggrant_class, name, \
+       TP_PROTO(struct log *log, struct xlog_ticket *tic), \
+       TP_ARGS(log, tic))
+DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm);
+DEFINE_LOGGRANT_EVENT(xfs_log_done_perm);
+DEFINE_LOGGRANT_EVENT(xfs_log_reserve);
+DEFINE_LOGGRANT_EVENT(xfs_log_umount_write);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
+DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+
+DECLARE_EVENT_CLASS(xfs_file_class,
+       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
+       TP_ARGS(ip, count, offset, flags),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(xfs_fsize_t, size)
+               __field(xfs_fsize_t, new_size)
+               __field(loff_t, offset)
+               __field(size_t, count)
+               __field(int, flags)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->size = ip->i_d.di_size;
+               __entry->new_size = ip->i_new_size;
+               __entry->offset = offset;
+               __entry->count = count;
+               __entry->flags = flags;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+                 "offset 0x%llx count 0x%zx ioflags %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->size,
+                 __entry->new_size,
+                 __entry->offset,
+                 __entry->count,
+                 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
+)
+
+#define DEFINE_RW_EVENT(name)          \
+DEFINE_EVENT(xfs_file_class, name,     \
+       TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
+       TP_ARGS(ip, count, offset, flags))
+DEFINE_RW_EVENT(xfs_file_read);
+DEFINE_RW_EVENT(xfs_file_buffered_write);
+DEFINE_RW_EVENT(xfs_file_direct_write);
+DEFINE_RW_EVENT(xfs_file_splice_read);
+DEFINE_RW_EVENT(xfs_file_splice_write);
+
+DECLARE_EVENT_CLASS(xfs_page_class,
+       TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
+       TP_ARGS(inode, page, off),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(pgoff_t, pgoff)
+               __field(loff_t, size)
+               __field(unsigned long, offset)
+               __field(int, delalloc)
+               __field(int, unwritten)
+       ),
+       TP_fast_assign(
+               int delalloc = -1, unwritten = -1;
+
+               if (page_has_buffers(page))
+                       xfs_count_page_state(page, &delalloc, &unwritten);
+               __entry->dev = inode->i_sb->s_dev;
+               __entry->ino = XFS_I(inode)->i_ino;
+               __entry->pgoff = page_offset(page);
+               __entry->size = i_size_read(inode);
+               __entry->offset = off;
+               __entry->delalloc = delalloc;
+               __entry->unwritten = unwritten;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
+                 "delalloc %d unwritten %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->pgoff,
+                 __entry->size,
+                 __entry->offset,
+                 __entry->delalloc,
+                 __entry->unwritten)
+)
+
+#define DEFINE_PAGE_EVENT(name)                \
+DEFINE_EVENT(xfs_page_class, name,     \
+       TP_PROTO(struct inode *inode, struct page *page, unsigned long off),    \
+       TP_ARGS(inode, page, off))
+DEFINE_PAGE_EVENT(xfs_writepage);
+DEFINE_PAGE_EVENT(xfs_releasepage);
+DEFINE_PAGE_EVENT(xfs_invalidatepage);
+
+DECLARE_EVENT_CLASS(xfs_imap_class,
+       TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
+                int type, struct xfs_bmbt_irec *irec),
+       TP_ARGS(ip, offset, count, type, irec),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(loff_t, size)
+               __field(loff_t, new_size)
+               __field(loff_t, offset)
+               __field(size_t, count)
+               __field(int, type)
+               __field(xfs_fileoff_t, startoff)
+               __field(xfs_fsblock_t, startblock)
+               __field(xfs_filblks_t, blockcount)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->size = ip->i_d.di_size;
+               __entry->new_size = ip->i_new_size;
+               __entry->offset = offset;
+               __entry->count = count;
+               __entry->type = type;
+               __entry->startoff = irec ? irec->br_startoff : 0;
+               __entry->startblock = irec ? irec->br_startblock : 0;
+               __entry->blockcount = irec ? irec->br_blockcount : 0;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
+                 "offset 0x%llx count %zd type %s "
+                 "startoff 0x%llx startblock %lld blockcount 0x%llx",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->size,
+                 __entry->new_size,
+                 __entry->offset,
+                 __entry->count,
+                 __print_symbolic(__entry->type, XFS_IO_TYPES),
+                 __entry->startoff,
+                 (__int64_t)__entry->startblock,
+                 __entry->blockcount)
+)
+
+#define DEFINE_IOMAP_EVENT(name)       \
+DEFINE_EVENT(xfs_imap_class, name,     \
+       TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
+                int type, struct xfs_bmbt_irec *irec),         \
+       TP_ARGS(ip, offset, count, type, irec))
+DEFINE_IOMAP_EVENT(xfs_map_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_found);
+DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc);
+
+DECLARE_EVENT_CLASS(xfs_simple_io_class,
+       TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
+       TP_ARGS(ip, offset, count),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(loff_t, isize)
+               __field(loff_t, disize)
+               __field(loff_t, new_size)
+               __field(loff_t, offset)
+               __field(size_t, count)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->isize = ip->i_size;
+               __entry->disize = ip->i_d.di_size;
+               __entry->new_size = ip->i_new_size;
+               __entry->offset = offset;
+               __entry->count = count;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx isize 0x%llx disize 0x%llx new_size 0x%llx "
+                 "offset 0x%llx count %zd",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->isize,
+                 __entry->disize,
+                 __entry->new_size,
+                 __entry->offset,
+                 __entry->count)
+);
+
+#define DEFINE_SIMPLE_IO_EVENT(name)   \
+DEFINE_EVENT(xfs_simple_io_class, name,        \
+       TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),        \
+       TP_ARGS(ip, offset, count))
+DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
+DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
+DEFINE_SIMPLE_IO_EVENT(xfs_get_blocks_notfound);
+DEFINE_SIMPLE_IO_EVENT(xfs_setfilesize);
+
+DECLARE_EVENT_CLASS(xfs_itrunc_class,
+       TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size),
+       TP_ARGS(ip, new_size),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(xfs_fsize_t, size)
+               __field(xfs_fsize_t, new_size)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->size = ip->i_d.di_size;
+               __entry->new_size = new_size;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->size,
+                 __entry->new_size)
+)
+
+#define DEFINE_ITRUNC_EVENT(name) \
+DEFINE_EVENT(xfs_itrunc_class, name, \
+       TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \
+       TP_ARGS(ip, new_size))
+DEFINE_ITRUNC_EVENT(xfs_itruncate_data_start);
+DEFINE_ITRUNC_EVENT(xfs_itruncate_data_end);
+
+TRACE_EVENT(xfs_pagecache_inval,
+       TP_PROTO(struct xfs_inode *ip, xfs_off_t start, xfs_off_t finish),
+       TP_ARGS(ip, start, finish),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(xfs_fsize_t, size)
+               __field(xfs_off_t, start)
+               __field(xfs_off_t, finish)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->size = ip->i_d.di_size;
+               __entry->start = start;
+               __entry->finish = finish;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx start 0x%llx finish 0x%llx",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->size,
+                 __entry->start,
+                 __entry->finish)
+);
+
+TRACE_EVENT(xfs_bunmap,
+       TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len,
+                int flags, unsigned long caller_ip),
+       TP_ARGS(ip, bno, len, flags, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(xfs_fsize_t, size)
+               __field(xfs_fileoff_t, bno)
+               __field(xfs_filblks_t, len)
+               __field(unsigned long, caller_ip)
+               __field(int, flags)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->ino = ip->i_ino;
+               __entry->size = ip->i_d.di_size;
+               __entry->bno = bno;
+               __entry->len = len;
+               __entry->caller_ip = caller_ip;
+               __entry->flags = flags;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx"
+                 "flags %s caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->size,
+                 __entry->bno,
+                 __entry->len,
+                 __print_flags(__entry->flags, "|", XFS_BMAPI_FLAGS),
+                 (void *)__entry->caller_ip)
+
+);
+
+DECLARE_EVENT_CLASS(xfs_busy_class,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                xfs_agblock_t agbno, xfs_extlen_t len),
+       TP_ARGS(mp, agno, agbno, len),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agblock_t, agbno)
+               __field(xfs_extlen_t, len)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agbno = agbno;
+               __entry->len = len;
+       ),
+       TP_printk("dev %d:%d agno %u agbno %u len %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->len)
+);
+#define DEFINE_BUSY_EVENT(name) \
+DEFINE_EVENT(xfs_busy_class, name, \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                xfs_agblock_t agbno, xfs_extlen_t len), \
+       TP_ARGS(mp, agno, agbno, len))
+DEFINE_BUSY_EVENT(xfs_alloc_busy);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_enomem);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_force);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_reuse);
+DEFINE_BUSY_EVENT(xfs_alloc_busy_clear);
+
+TRACE_EVENT(xfs_alloc_busy_trim,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                xfs_agblock_t agbno, xfs_extlen_t len,
+                xfs_agblock_t tbno, xfs_extlen_t tlen),
+       TP_ARGS(mp, agno, agbno, len, tbno, tlen),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agblock_t, agbno)
+               __field(xfs_extlen_t, len)
+               __field(xfs_agblock_t, tbno)
+               __field(xfs_extlen_t, tlen)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agbno = agbno;
+               __entry->len = len;
+               __entry->tbno = tbno;
+               __entry->tlen = tlen;
+       ),
+       TP_printk("dev %d:%d agno %u agbno %u len %u tbno %u tlen %u",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->len,
+                 __entry->tbno,
+                 __entry->tlen)
+);
+
+TRACE_EVENT(xfs_trans_commit_lsn,
+       TP_PROTO(struct xfs_trans *trans),
+       TP_ARGS(trans),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(struct xfs_trans *, tp)
+               __field(xfs_lsn_t, lsn)
+       ),
+       TP_fast_assign(
+               __entry->dev = trans->t_mountp->m_super->s_dev;
+               __entry->tp = trans;
+               __entry->lsn = trans->t_commit_lsn;
+       ),
+       TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->tp,
+                 __entry->lsn)
+);
+
+TRACE_EVENT(xfs_agf,
+       TP_PROTO(struct xfs_mount *mp, struct xfs_agf *agf, int flags,
+                unsigned long caller_ip),
+       TP_ARGS(mp, agf, flags, caller_ip),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(int, flags)
+               __field(__u32, length)
+               __field(__u32, bno_root)
+               __field(__u32, cnt_root)
+               __field(__u32, bno_level)
+               __field(__u32, cnt_level)
+               __field(__u32, flfirst)
+               __field(__u32, fllast)
+               __field(__u32, flcount)
+               __field(__u32, freeblks)
+               __field(__u32, longest)
+               __field(unsigned long, caller_ip)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = be32_to_cpu(agf->agf_seqno),
+               __entry->flags = flags;
+               __entry->length = be32_to_cpu(agf->agf_length),
+               __entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
+               __entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
+               __entry->bno_level =
+                               be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
+               __entry->cnt_level =
+                               be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
+               __entry->flfirst = be32_to_cpu(agf->agf_flfirst),
+               __entry->fllast = be32_to_cpu(agf->agf_fllast),
+               __entry->flcount = be32_to_cpu(agf->agf_flcount),
+               __entry->freeblks = be32_to_cpu(agf->agf_freeblks),
+               __entry->longest = be32_to_cpu(agf->agf_longest);
+               __entry->caller_ip = caller_ip;
+       ),
+       TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u "
+                 "levels b %u c %u flfirst %u fllast %u flcount %u "
+                 "freeblks %u longest %u caller %pf",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
+                 __entry->length,
+                 __entry->bno_root,
+                 __entry->cnt_root,
+                 __entry->bno_level,
+                 __entry->cnt_level,
+                 __entry->flfirst,
+                 __entry->fllast,
+                 __entry->flcount,
+                 __entry->freeblks,
+                 __entry->longest,
+                 (void *)__entry->caller_ip)
+);
+
+TRACE_EVENT(xfs_free_extent,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+                xfs_extlen_t len, bool isfl, int haveleft, int haveright),
+       TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agblock_t, agbno)
+               __field(xfs_extlen_t, len)
+               __field(int, isfl)
+               __field(int, haveleft)
+               __field(int, haveright)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agbno = agbno;
+               __entry->len = len;
+               __entry->isfl = isfl;
+               __entry->haveleft = haveleft;
+               __entry->haveright = haveright;
+       ),
+       TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->len,
+                 __entry->isfl,
+                 __entry->haveleft ?
+                       (__entry->haveright ? "both" : "left") :
+                       (__entry->haveright ? "right" : "none"))
+
+);
+
+DECLARE_EVENT_CLASS(xfs_alloc_class,
+       TP_PROTO(struct xfs_alloc_arg *args),
+       TP_ARGS(args),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agblock_t, agbno)
+               __field(xfs_extlen_t, minlen)
+               __field(xfs_extlen_t, maxlen)
+               __field(xfs_extlen_t, mod)
+               __field(xfs_extlen_t, prod)
+               __field(xfs_extlen_t, minleft)
+               __field(xfs_extlen_t, total)
+               __field(xfs_extlen_t, alignment)
+               __field(xfs_extlen_t, minalignslop)
+               __field(xfs_extlen_t, len)
+               __field(short, type)
+               __field(short, otype)
+               __field(char, wasdel)
+               __field(char, wasfromfl)
+               __field(char, isfl)
+               __field(char, userdata)
+               __field(xfs_fsblock_t, firstblock)
+       ),
+       TP_fast_assign(
+               __entry->dev = args->mp->m_super->s_dev;
+               __entry->agno = args->agno;
+               __entry->agbno = args->agbno;
+               __entry->minlen = args->minlen;
+               __entry->maxlen = args->maxlen;
+               __entry->mod = args->mod;
+               __entry->prod = args->prod;
+               __entry->minleft = args->minleft;
+               __entry->total = args->total;
+               __entry->alignment = args->alignment;
+               __entry->minalignslop = args->minalignslop;
+               __entry->len = args->len;
+               __entry->type = args->type;
+               __entry->otype = args->otype;
+               __entry->wasdel = args->wasdel;
+               __entry->wasfromfl = args->wasfromfl;
+               __entry->isfl = args->isfl;
+               __entry->userdata = args->userdata;
+               __entry->firstblock = args->firstblock;
+       ),
+       TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u "
+                 "prod %u minleft %u total %u alignment %u minalignslop %u "
+                 "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d "
+                 "userdata %d firstblock 0x%llx",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->minlen,
+                 __entry->maxlen,
+                 __entry->mod,
+                 __entry->prod,
+                 __entry->minleft,
+                 __entry->total,
+                 __entry->alignment,
+                 __entry->minalignslop,
+                 __entry->len,
+                 __print_symbolic(__entry->type, XFS_ALLOC_TYPES),
+                 __print_symbolic(__entry->otype, XFS_ALLOC_TYPES),
+                 __entry->wasdel,
+                 __entry->wasfromfl,
+                 __entry->isfl,
+                 __entry->userdata,
+                 (unsigned long long)__entry->firstblock)
+)
+
+#define DEFINE_ALLOC_EVENT(name) \
+DEFINE_EVENT(xfs_alloc_class, name, \
+       TP_PROTO(struct xfs_alloc_arg *args), \
+       TP_ARGS(args))
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_notfound);
+DEFINE_ALLOC_EVENT(xfs_alloc_exact_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_first);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_greater);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_lesser);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_near_busy);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_neither);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_noentry);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_nominleft);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_size_busy);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_freelist);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_notenough);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_done);
+DEFINE_ALLOC_EVENT(xfs_alloc_small_error);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_badargs);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_nofix);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed);
+DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed);
+
+DECLARE_EVENT_CLASS(xfs_dir2_class,
+       TP_PROTO(struct xfs_da_args *args),
+       TP_ARGS(args),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __dynamic_array(char, name, args->namelen)
+               __field(int, namelen)
+               __field(xfs_dahash_t, hashval)
+               __field(xfs_ino_t, inumber)
+               __field(int, op_flags)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+               __entry->ino = args->dp->i_ino;
+               if (args->namelen)
+                       memcpy(__get_str(name), args->name, args->namelen);
+               __entry->namelen = args->namelen;
+               __entry->hashval = args->hashval;
+               __entry->inumber = args->inumber;
+               __entry->op_flags = args->op_flags;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x "
+                 "inumber 0x%llx op_flags %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->namelen,
+                 __entry->namelen ? __get_str(name) : NULL,
+                 __entry->namelen,
+                 __entry->hashval,
+                 __entry->inumber,
+                 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
+#define DEFINE_DIR2_EVENT(name) \
+DEFINE_EVENT(xfs_dir2_class, name, \
+       TP_PROTO(struct xfs_da_args *args), \
+       TP_ARGS(args))
+DEFINE_DIR2_EVENT(xfs_dir2_sf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_create);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8);
+DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_block_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_block_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_block_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_block_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf);
+DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block);
+DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node);
+DEFINE_DIR2_EVENT(xfs_dir2_node_addname);
+DEFINE_DIR2_EVENT(xfs_dir2_node_lookup);
+DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
+DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
+DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
+
+DECLARE_EVENT_CLASS(xfs_dir2_space_class,
+       TP_PROTO(struct xfs_da_args *args, int idx),
+       TP_ARGS(args, idx),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(int, op_flags)
+               __field(int, idx)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+               __entry->ino = args->dp->i_ino;
+               __entry->op_flags = args->op_flags;
+               __entry->idx = idx;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+                 __entry->idx)
+)
+
+#define DEFINE_DIR2_SPACE_EVENT(name) \
+DEFINE_EVENT(xfs_dir2_space_class, name, \
+       TP_PROTO(struct xfs_da_args *args, int idx), \
+       TP_ARGS(args, idx))
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode);
+DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode);
+
+TRACE_EVENT(xfs_dir2_leafn_moveents,
+       TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count),
+       TP_ARGS(args, src_idx, dst_idx, count),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(int, op_flags)
+               __field(int, src_idx)
+               __field(int, dst_idx)
+               __field(int, count)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+               __entry->ino = args->dp->i_ino;
+               __entry->op_flags = args->op_flags;
+               __entry->src_idx = src_idx;
+               __entry->dst_idx = dst_idx;
+               __entry->count = count;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx op_flags %s "
+                 "src_idx %d dst_idx %d count %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS),
+                 __entry->src_idx,
+                 __entry->dst_idx,
+                 __entry->count)
+);
+
+#define XFS_SWAPEXT_INODES \
+       { 0,    "target" }, \
+       { 1,    "temp" }
+
+#define XFS_INODE_FORMAT_STR \
+       { 0,    "invalid" }, \
+       { 1,    "local" }, \
+       { 2,    "extent" }, \
+       { 3,    "btree" }
+
+DECLARE_EVENT_CLASS(xfs_swap_extent_class,
+       TP_PROTO(struct xfs_inode *ip, int which),
+       TP_ARGS(ip, which),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(int, which)
+               __field(xfs_ino_t, ino)
+               __field(int, format)
+               __field(int, nex)
+               __field(int, max_nex)
+               __field(int, broot_size)
+               __field(int, fork_off)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(ip)->i_sb->s_dev;
+               __entry->which = which;
+               __entry->ino = ip->i_ino;
+               __entry->format = ip->i_d.di_format;
+               __entry->nex = ip->i_d.di_nextents;
+               __entry->max_nex = ip->i_df.if_ext_max;
+               __entry->broot_size = ip->i_df.if_broot_bytes;
+               __entry->fork_off = XFS_IFORK_BOFF(ip);
+       ),
+       TP_printk("dev %d:%d ino 0x%llx (%s), %s format, num_extents %d, "
+                 "Max in-fork extents %d, broot size %d, fork offset %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __print_symbolic(__entry->which, XFS_SWAPEXT_INODES),
+                 __print_symbolic(__entry->format, XFS_INODE_FORMAT_STR),
+                 __entry->nex,
+                 __entry->max_nex,
+                 __entry->broot_size,
+                 __entry->fork_off)
+)
+
+#define DEFINE_SWAPEXT_EVENT(name) \
+DEFINE_EVENT(xfs_swap_extent_class, name, \
+       TP_PROTO(struct xfs_inode *ip, int which), \
+       TP_ARGS(ip, which))
+
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before);
+DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_item_class,
+       TP_PROTO(struct log *log, struct xlog_recover *trans,
+               struct xlog_recover_item *item, int pass),
+       TP_ARGS(log, trans, item, pass),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(unsigned long, item)
+               __field(xlog_tid_t, tid)
+               __field(int, type)
+               __field(int, pass)
+               __field(int, count)
+               __field(int, total)
+       ),
+       TP_fast_assign(
+               __entry->dev = log->l_mp->m_super->s_dev;
+               __entry->item = (unsigned long)item;
+               __entry->tid = trans->r_log_tid;
+               __entry->type = ITEM_TYPE(item);
+               __entry->pass = pass;
+               __entry->count = item->ri_cnt;
+               __entry->total = item->ri_total;
+       ),
+       TP_printk("dev %d:%d trans 0x%x, pass %d, item 0x%p, item type %s "
+                 "item region count/total %d/%d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->tid,
+                 __entry->pass,
+                 (void *)__entry->item,
+                 __print_symbolic(__entry->type, XFS_LI_TYPE_DESC),
+                 __entry->count,
+                 __entry->total)
+)
+
+#define DEFINE_LOG_RECOVER_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_item_class, name, \
+       TP_PROTO(struct log *log, struct xlog_recover *trans, \
+               struct xlog_recover_item *item, int pass), \
+       TP_ARGS(log, trans, item, pass))
+
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_add_cont);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_head);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_reorder_tail);
+DEFINE_LOG_RECOVER_ITEM(xfs_log_recover_item_recover);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_buf_item_class,
+       TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f),
+       TP_ARGS(log, buf_f),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(__int64_t, blkno)
+               __field(unsigned short, len)
+               __field(unsigned short, flags)
+               __field(unsigned short, size)
+               __field(unsigned int, map_size)
+       ),
+       TP_fast_assign(
+               __entry->dev = log->l_mp->m_super->s_dev;
+               __entry->blkno = buf_f->blf_blkno;
+               __entry->len = buf_f->blf_len;
+               __entry->flags = buf_f->blf_flags;
+               __entry->size = buf_f->blf_size;
+               __entry->map_size = buf_f->blf_map_size;
+       ),
+       TP_printk("dev %d:%d blkno 0x%llx, len %u, flags 0x%x, size %d, "
+                       "map_size %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->blkno,
+                 __entry->len,
+                 __entry->flags,
+                 __entry->size,
+                 __entry->map_size)
+)
+
+#define DEFINE_LOG_RECOVER_BUF_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_buf_item_class, name, \
+       TP_PROTO(struct log *log, struct xfs_buf_log_format *buf_f), \
+       TP_ARGS(log, buf_f))
+
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_not_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_add);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_cancel_ref_inc);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_recover);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_inode_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_reg_buf);
+DEFINE_LOG_RECOVER_BUF_ITEM(xfs_log_recover_buf_dquot_buf);
+
+DECLARE_EVENT_CLASS(xfs_log_recover_ino_item_class,
+       TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f),
+       TP_ARGS(log, in_f),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __field(unsigned short, size)
+               __field(int, fields)
+               __field(unsigned short, asize)
+               __field(unsigned short, dsize)
+               __field(__int64_t, blkno)
+               __field(int, len)
+               __field(int, boffset)
+       ),
+       TP_fast_assign(
+               __entry->dev = log->l_mp->m_super->s_dev;
+               __entry->ino = in_f->ilf_ino;
+               __entry->size = in_f->ilf_size;
+               __entry->fields = in_f->ilf_fields;
+               __entry->asize = in_f->ilf_asize;
+               __entry->dsize = in_f->ilf_dsize;
+               __entry->blkno = in_f->ilf_blkno;
+               __entry->len = in_f->ilf_len;
+               __entry->boffset = in_f->ilf_boffset;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx, size %u, fields 0x%x, asize %d, "
+                       "dsize %d, blkno 0x%llx, len %d, boffset %d",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->size,
+                 __entry->fields,
+                 __entry->asize,
+                 __entry->dsize,
+                 __entry->blkno,
+                 __entry->len,
+                 __entry->boffset)
+)
+#define DEFINE_LOG_RECOVER_INO_ITEM(name) \
+DEFINE_EVENT(xfs_log_recover_ino_item_class, name, \
+       TP_PROTO(struct log *log, struct xfs_inode_log_format *in_f), \
+       TP_ARGS(log, in_f))
+
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_recover);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_cancel);
+DEFINE_LOG_RECOVER_INO_ITEM(xfs_log_recover_inode_skip);
+
+DECLARE_EVENT_CLASS(xfs_discard_class,
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+                xfs_agblock_t agbno, xfs_extlen_t len),
+       TP_ARGS(mp, agno, agbno, len),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_agnumber_t, agno)
+               __field(xfs_agblock_t, agbno)
+               __field(xfs_extlen_t, len)
+       ),
+       TP_fast_assign(
+               __entry->dev = mp->m_super->s_dev;
+               __entry->agno = agno;
+               __entry->agbno = agbno;
+               __entry->len = len;
+       ),
+       TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->agno,
+                 __entry->agbno,
+                 __entry->len)
+)
+
+#define DEFINE_DISCARD_EVENT(name) \
+DEFINE_EVENT(xfs_discard_class, name, \
+       TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+                xfs_agblock_t agbno, xfs_extlen_t len), \
+       TP_ARGS(mp, agno, agbno, len))
+DEFINE_DISCARD_EVENT(xfs_discard_extent);
+DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
+DEFINE_DISCARD_EVENT(xfs_discard_exclude);
+DEFINE_DISCARD_EVENT(xfs_discard_busy);
+
+#endif /* _TRACE_XFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE xfs_trace
+#include <trace/define_trace.h>
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
new file mode 100644 (file)
index 0000000..4d00ee6
--- /dev/null
@@ -0,0 +1,890 @@
+/*
+ * Copyright (c) 2000-2002 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_alloc.h"
+#include "xfs_quota.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_itable.h"
+#include "xfs_bmap.h"
+#include "xfs_rtalloc.h"
+#include "xfs_error.h"
+#include "xfs_attr.h"
+#include "xfs_buf_item.h"
+#include "xfs_trans_priv.h"
+#include "xfs_qm.h"
+
+STATIC void    xfs_trans_alloc_dqinfo(xfs_trans_t *);
+
+/*
+ * Add the locked dquot to the transaction.
+ * The dquot must be locked, and it cannot be associated with any
+ * transaction.
+ */
+void
+xfs_trans_dqjoin(
+       xfs_trans_t     *tp,
+       xfs_dquot_t     *dqp)
+{
+       ASSERT(dqp->q_transp != tp);
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+       ASSERT(dqp->q_logitem.qli_dquot == dqp);
+
+       /*
+        * Get a log_item_desc to point at the new item.
+        */
+       xfs_trans_add_item(tp, &dqp->q_logitem.qli_item);
+
+       /*
+        * Initialize d_transp so we can later determine if this dquot is
+        * associated with this transaction.
+        */
+       dqp->q_transp = tp;
+}
+
+
+/*
+ * This is called to mark the dquot as needing
+ * to be logged when the transaction is committed.  The dquot must
+ * already be associated with the given transaction.
+ * Note that it marks the entire transaction as dirty. In the ordinary
+ * case, this gets called via xfs_trans_commit, after the transaction
+ * is already dirty. However, there's nothing stop this from getting
+ * called directly, as done by xfs_qm_scall_setqlim. Hence, the TRANS_DIRTY
+ * flag.
+ */
+void
+xfs_trans_log_dquot(
+       xfs_trans_t     *tp,
+       xfs_dquot_t     *dqp)
+{
+       ASSERT(dqp->q_transp == tp);
+       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
+
+/*
+ * Carry forward whatever is left of the quota blk reservation to
+ * the spanky new transaction
+ */
+void
+xfs_trans_dup_dqinfo(
+       xfs_trans_t     *otp,
+       xfs_trans_t     *ntp)
+{
+       xfs_dqtrx_t     *oq, *nq;
+       int             i,j;
+       xfs_dqtrx_t     *oqa, *nqa;
+
+       if (!otp->t_dqinfo)
+               return;
+
+       xfs_trans_alloc_dqinfo(ntp);
+       oqa = otp->t_dqinfo->dqa_usrdquots;
+       nqa = ntp->t_dqinfo->dqa_usrdquots;
+
+       /*
+        * Because the quota blk reservation is carried forward,
+        * it is also necessary to carry forward the DQ_DIRTY flag.
+        */
+       if(otp->t_flags & XFS_TRANS_DQ_DIRTY)
+               ntp->t_flags |= XFS_TRANS_DQ_DIRTY;
+
+       for (j = 0; j < 2; j++) {
+               for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+                       if (oqa[i].qt_dquot == NULL)
+                               break;
+                       oq = &oqa[i];
+                       nq = &nqa[i];
+
+                       nq->qt_dquot = oq->qt_dquot;
+                       nq->qt_bcount_delta = nq->qt_icount_delta = 0;
+                       nq->qt_rtbcount_delta = 0;
+
+                       /*
+                        * Transfer whatever is left of the reservations.
+                        */
+                       nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used;
+                       oq->qt_blk_res = oq->qt_blk_res_used;
+
+                       nq->qt_rtblk_res = oq->qt_rtblk_res -
+                               oq->qt_rtblk_res_used;
+                       oq->qt_rtblk_res = oq->qt_rtblk_res_used;
+
+                       nq->qt_ino_res = oq->qt_ino_res - oq->qt_ino_res_used;
+                       oq->qt_ino_res = oq->qt_ino_res_used;
+
+               }
+               oqa = otp->t_dqinfo->dqa_grpdquots;
+               nqa = ntp->t_dqinfo->dqa_grpdquots;
+       }
+}
+
+/*
+ * Wrap around mod_dquot to account for both user and group quotas.
+ */
+void
+xfs_trans_mod_dquot_byino(
+       xfs_trans_t     *tp,
+       xfs_inode_t     *ip,
+       uint            field,
+       long            delta)
+{
+       xfs_mount_t     *mp = tp->t_mountp;
+
+       if (!XFS_IS_QUOTA_RUNNING(mp) ||
+           !XFS_IS_QUOTA_ON(mp) ||
+           ip->i_ino == mp->m_sb.sb_uquotino ||
+           ip->i_ino == mp->m_sb.sb_gquotino)
+               return;
+
+       if (tp->t_dqinfo == NULL)
+               xfs_trans_alloc_dqinfo(tp);
+
+       if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
+               (void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+       if (XFS_IS_OQUOTA_ON(mp) && ip->i_gdquot)
+               (void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+}
+
+STATIC xfs_dqtrx_t *
+xfs_trans_get_dqtrx(
+       xfs_trans_t     *tp,
+       xfs_dquot_t     *dqp)
+{
+       int             i;
+       xfs_dqtrx_t     *qa;
+
+       qa = XFS_QM_ISUDQ(dqp) ?
+               tp->t_dqinfo->dqa_usrdquots : tp->t_dqinfo->dqa_grpdquots;
+
+       for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+               if (qa[i].qt_dquot == NULL ||
+                   qa[i].qt_dquot == dqp)
+                       return &qa[i];
+       }
+
+       return NULL;
+}
+
+/*
+ * Make the changes in the transaction structure.
+ * The moral equivalent to xfs_trans_mod_sb().
+ * We don't touch any fields in the dquot, so we don't care
+ * if it's locked or not (most of the time it won't be).
+ */
+void
+xfs_trans_mod_dquot(
+       xfs_trans_t     *tp,
+       xfs_dquot_t     *dqp,
+       uint            field,
+       long            delta)
+{
+       xfs_dqtrx_t     *qtrx;
+
+       ASSERT(tp);
+       ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp));
+       qtrx = NULL;
+
+       if (tp->t_dqinfo == NULL)
+               xfs_trans_alloc_dqinfo(tp);
+       /*
+        * Find either the first free slot or the slot that belongs
+        * to this dquot.
+        */
+       qtrx = xfs_trans_get_dqtrx(tp, dqp);
+       ASSERT(qtrx);
+       if (qtrx->qt_dquot == NULL)
+               qtrx->qt_dquot = dqp;
+
+       switch (field) {
+
+               /*
+                * regular disk blk reservation
+                */
+             case XFS_TRANS_DQ_RES_BLKS:
+               qtrx->qt_blk_res += (ulong)delta;
+               break;
+
+               /*
+                * inode reservation
+                */
+             case XFS_TRANS_DQ_RES_INOS:
+               qtrx->qt_ino_res += (ulong)delta;
+               break;
+
+               /*
+                * disk blocks used.
+                */
+             case XFS_TRANS_DQ_BCOUNT:
+               if (qtrx->qt_blk_res && delta > 0) {
+                       qtrx->qt_blk_res_used += (ulong)delta;
+                       ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used);
+               }
+               qtrx->qt_bcount_delta += delta;
+               break;
+
+             case XFS_TRANS_DQ_DELBCOUNT:
+               qtrx->qt_delbcnt_delta += delta;
+               break;
+
+               /*
+                * Inode Count
+                */
+             case XFS_TRANS_DQ_ICOUNT:
+               if (qtrx->qt_ino_res && delta > 0) {
+                       qtrx->qt_ino_res_used += (ulong)delta;
+                       ASSERT(qtrx->qt_ino_res >= qtrx->qt_ino_res_used);
+               }
+               qtrx->qt_icount_delta += delta;
+               break;
+
+               /*
+                * rtblk reservation
+                */
+             case XFS_TRANS_DQ_RES_RTBLKS:
+               qtrx->qt_rtblk_res += (ulong)delta;
+               break;
+
+               /*
+                * rtblk count
+                */
+             case XFS_TRANS_DQ_RTBCOUNT:
+               if (qtrx->qt_rtblk_res && delta > 0) {
+                       qtrx->qt_rtblk_res_used += (ulong)delta;
+                       ASSERT(qtrx->qt_rtblk_res >= qtrx->qt_rtblk_res_used);
+               }
+               qtrx->qt_rtbcount_delta += delta;
+               break;
+
+             case XFS_TRANS_DQ_DELRTBCOUNT:
+               qtrx->qt_delrtb_delta += delta;
+               break;
+
+             default:
+               ASSERT(0);
+       }
+       tp->t_flags |= XFS_TRANS_DQ_DIRTY;
+}
+
+
+/*
+ * Given an array of dqtrx structures, lock all the dquots associated
+ * and join them to the transaction, provided they have been modified.
+ * We know that the highest number of dquots (of one type - usr OR grp),
+ * involved in a transaction is 2 and that both usr and grp combined - 3.
+ * So, we don't attempt to make this very generic.
+ */
+STATIC void
+xfs_trans_dqlockedjoin(
+       xfs_trans_t     *tp,
+       xfs_dqtrx_t     *q)
+{
+       ASSERT(q[0].qt_dquot != NULL);
+       if (q[1].qt_dquot == NULL) {
+               xfs_dqlock(q[0].qt_dquot);
+               xfs_trans_dqjoin(tp, q[0].qt_dquot);
+       } else {
+               ASSERT(XFS_QM_TRANS_MAXDQS == 2);
+               xfs_dqlock2(q[0].qt_dquot, q[1].qt_dquot);
+               xfs_trans_dqjoin(tp, q[0].qt_dquot);
+               xfs_trans_dqjoin(tp, q[1].qt_dquot);
+       }
+}
+
+
+/*
+ * Called by xfs_trans_commit() and similar in spirit to
+ * xfs_trans_apply_sb_deltas().
+ * Go thru all the dquots belonging to this transaction and modify the
+ * INCORE dquot to reflect the actual usages.
+ * Unreserve just the reservations done by this transaction.
+ * dquot is still left locked at exit.
+ */
+void
+xfs_trans_apply_dquot_deltas(
+       xfs_trans_t             *tp)
+{
+       int                     i, j;
+       xfs_dquot_t             *dqp;
+       xfs_dqtrx_t             *qtrx, *qa;
+       xfs_disk_dquot_t        *d;
+       long                    totalbdelta;
+       long                    totalrtbdelta;
+
+       if (!(tp->t_flags & XFS_TRANS_DQ_DIRTY))
+               return;
+
+       ASSERT(tp->t_dqinfo);
+       qa = tp->t_dqinfo->dqa_usrdquots;
+       for (j = 0; j < 2; j++) {
+               if (qa[0].qt_dquot == NULL) {
+                       qa = tp->t_dqinfo->dqa_grpdquots;
+                       continue;
+               }
+
+               /*
+                * Lock all of the dquots and join them to the transaction.
+                */
+               xfs_trans_dqlockedjoin(tp, qa);
+
+               for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+                       qtrx = &qa[i];
+                       /*
+                        * The array of dquots is filled
+                        * sequentially, not sparsely.
+                        */
+                       if ((dqp = qtrx->qt_dquot) == NULL)
+                               break;
+
+                       ASSERT(XFS_DQ_IS_LOCKED(dqp));
+                       ASSERT(dqp->q_transp == tp);
+
+                       /*
+                        * adjust the actual number of blocks used
+                        */
+                       d = &dqp->q_core;
+
+                       /*
+                        * The issue here is - sometimes we don't make a blkquota
+                        * reservation intentionally to be fair to users
+                        * (when the amount is small). On the other hand,
+                        * delayed allocs do make reservations, but that's
+                        * outside of a transaction, so we have no
+                        * idea how much was really reserved.
+                        * So, here we've accumulated delayed allocation blks and
+                        * non-delay blks. The assumption is that the
+                        * delayed ones are always reserved (outside of a
+                        * transaction), and the others may or may not have
+                        * quota reservations.
+                        */
+                       totalbdelta = qtrx->qt_bcount_delta +
+                               qtrx->qt_delbcnt_delta;
+                       totalrtbdelta = qtrx->qt_rtbcount_delta +
+                               qtrx->qt_delrtb_delta;
+#ifdef DEBUG
+                       if (totalbdelta < 0)
+                               ASSERT(be64_to_cpu(d->d_bcount) >=
+                                      -totalbdelta);
+
+                       if (totalrtbdelta < 0)
+                               ASSERT(be64_to_cpu(d->d_rtbcount) >=
+                                      -totalrtbdelta);
+
+                       if (qtrx->qt_icount_delta < 0)
+                               ASSERT(be64_to_cpu(d->d_icount) >=
+                                      -qtrx->qt_icount_delta);
+#endif
+                       if (totalbdelta)
+                               be64_add_cpu(&d->d_bcount, (xfs_qcnt_t)totalbdelta);
+
+                       if (qtrx->qt_icount_delta)
+                               be64_add_cpu(&d->d_icount, (xfs_qcnt_t)qtrx->qt_icount_delta);
+
+                       if (totalrtbdelta)
+                               be64_add_cpu(&d->d_rtbcount, (xfs_qcnt_t)totalrtbdelta);
+
+                       /*
+                        * Get any default limits in use.
+                        * Start/reset the timer(s) if needed.
+                        */
+                       if (d->d_id) {
+                               xfs_qm_adjust_dqlimits(tp->t_mountp, d);
+                               xfs_qm_adjust_dqtimers(tp->t_mountp, d);
+                       }
+
+                       dqp->dq_flags |= XFS_DQ_DIRTY;
+                       /*
+                        * add this to the list of items to get logged
+                        */
+                       xfs_trans_log_dquot(tp, dqp);
+                       /*
+                        * Take off what's left of the original reservation.
+                        * In case of delayed allocations, there's no
+                        * reservation that a transaction structure knows of.
+                        */
+                       if (qtrx->qt_blk_res != 0) {
+                               if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) {
+                                       if (qtrx->qt_blk_res >
+                                           qtrx->qt_blk_res_used)
+                                               dqp->q_res_bcount -= (xfs_qcnt_t)
+                                                       (qtrx->qt_blk_res -
+                                                        qtrx->qt_blk_res_used);
+                                       else
+                                               dqp->q_res_bcount -= (xfs_qcnt_t)
+                                                       (qtrx->qt_blk_res_used -
+                                                        qtrx->qt_blk_res);
+                               }
+                       } else {
+                               /*
+                                * These blks were never reserved, either inside
+                                * a transaction or outside one (in a delayed
+                                * allocation). Also, this isn't always a
+                                * negative number since we sometimes
+                                * deliberately skip quota reservations.
+                                */
+                               if (qtrx->qt_bcount_delta) {
+                                       dqp->q_res_bcount +=
+                                             (xfs_qcnt_t)qtrx->qt_bcount_delta;
+                               }
+                       }
+                       /*
+                        * Adjust the RT reservation.
+                        */
+                       if (qtrx->qt_rtblk_res != 0) {
+                               if (qtrx->qt_rtblk_res != qtrx->qt_rtblk_res_used) {
+                                       if (qtrx->qt_rtblk_res >
+                                           qtrx->qt_rtblk_res_used)
+                                              dqp->q_res_rtbcount -= (xfs_qcnt_t)
+                                                      (qtrx->qt_rtblk_res -
+                                                       qtrx->qt_rtblk_res_used);
+                                       else
+                                              dqp->q_res_rtbcount -= (xfs_qcnt_t)
+                                                      (qtrx->qt_rtblk_res_used -
+                                                       qtrx->qt_rtblk_res);
+                               }
+                       } else {
+                               if (qtrx->qt_rtbcount_delta)
+                                       dqp->q_res_rtbcount +=
+                                           (xfs_qcnt_t)qtrx->qt_rtbcount_delta;
+                       }
+
+                       /*
+                        * Adjust the inode reservation.
+                        */
+                       if (qtrx->qt_ino_res != 0) {
+                               ASSERT(qtrx->qt_ino_res >=
+                                      qtrx->qt_ino_res_used);
+                               if (qtrx->qt_ino_res > qtrx->qt_ino_res_used)
+                                       dqp->q_res_icount -= (xfs_qcnt_t)
+                                               (qtrx->qt_ino_res -
+                                                qtrx->qt_ino_res_used);
+                       } else {
+                               if (qtrx->qt_icount_delta)
+                                       dqp->q_res_icount +=
+                                           (xfs_qcnt_t)qtrx->qt_icount_delta;
+                       }
+
+                       ASSERT(dqp->q_res_bcount >=
+                               be64_to_cpu(dqp->q_core.d_bcount));
+                       ASSERT(dqp->q_res_icount >=
+                               be64_to_cpu(dqp->q_core.d_icount));
+                       ASSERT(dqp->q_res_rtbcount >=
+                               be64_to_cpu(dqp->q_core.d_rtbcount));
+               }
+               /*
+                * Do the group quotas next
+                */
+               qa = tp->t_dqinfo->dqa_grpdquots;
+       }
+}
+
+/*
+ * Release the reservations, and adjust the dquots accordingly.
+ * This is called only when the transaction is being aborted. If by
+ * any chance we have done dquot modifications incore (ie. deltas) already,
+ * we simply throw those away, since that's the expected behavior
+ * when a transaction is curtailed without a commit.
+ */
+void
+xfs_trans_unreserve_and_mod_dquots(
+       xfs_trans_t             *tp)
+{
+       int                     i, j;
+       xfs_dquot_t             *dqp;
+       xfs_dqtrx_t             *qtrx, *qa;
+       boolean_t               locked;
+
+       if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
+               return;
+
+       qa = tp->t_dqinfo->dqa_usrdquots;
+
+       for (j = 0; j < 2; j++) {
+               for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) {
+                       qtrx = &qa[i];
+                       /*
+                        * We assume that the array of dquots is filled
+                        * sequentially, not sparsely.
+                        */
+                       if ((dqp = qtrx->qt_dquot) == NULL)
+                               break;
+                       /*
+                        * Unreserve the original reservation. We don't care
+                        * about the number of blocks used field, or deltas.
+                        * Also we don't bother to zero the fields.
+                        */
+                       locked = B_FALSE;
+                       if (qtrx->qt_blk_res) {
+                               xfs_dqlock(dqp);
+                               locked = B_TRUE;
+                               dqp->q_res_bcount -=
+                                       (xfs_qcnt_t)qtrx->qt_blk_res;
+                       }
+                       if (qtrx->qt_ino_res) {
+                               if (!locked) {
+                                       xfs_dqlock(dqp);
+                                       locked = B_TRUE;
+                               }
+                               dqp->q_res_icount -=
+                                       (xfs_qcnt_t)qtrx->qt_ino_res;
+                       }
+
+                       if (qtrx->qt_rtblk_res) {
+                               if (!locked) {
+                                       xfs_dqlock(dqp);
+                                       locked = B_TRUE;
+                               }
+                               dqp->q_res_rtbcount -=
+                                       (xfs_qcnt_t)qtrx->qt_rtblk_res;
+                       }
+                       if (locked)
+                               xfs_dqunlock(dqp);
+
+               }
+               qa = tp->t_dqinfo->dqa_grpdquots;
+       }
+}
+
+STATIC void
+xfs_quota_warn(
+       struct xfs_mount        *mp,
+       struct xfs_dquot        *dqp,
+       int                     type)
+{
+       /* no warnings for project quotas - we just return ENOSPC later */
+       if (dqp->dq_flags & XFS_DQ_PROJ)
+               return;
+       quota_send_warning((dqp->dq_flags & XFS_DQ_USER) ? USRQUOTA : GRPQUOTA,
+                          be32_to_cpu(dqp->q_core.d_id), mp->m_super->s_dev,
+                          type);
+}
+
+/*
+ * This reserves disk blocks and inodes against a dquot.
+ * Flags indicate if the dquot is to be locked here and also
+ * if the blk reservation is for RT or regular blocks.
+ * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check.
+ */
+STATIC int
+xfs_trans_dqresv(
+       xfs_trans_t     *tp,
+       xfs_mount_t     *mp,
+       xfs_dquot_t     *dqp,
+       long            nblks,
+       long            ninos,
+       uint            flags)
+{
+       xfs_qcnt_t      hardlimit;
+       xfs_qcnt_t      softlimit;
+       time_t          timer;
+       xfs_qwarncnt_t  warns;
+       xfs_qwarncnt_t  warnlimit;
+       xfs_qcnt_t      count;
+       xfs_qcnt_t      *resbcountp;
+       xfs_quotainfo_t *q = mp->m_quotainfo;
+
+
+       xfs_dqlock(dqp);
+
+       if (flags & XFS_TRANS_DQ_RES_BLKS) {
+               hardlimit = be64_to_cpu(dqp->q_core.d_blk_hardlimit);
+               if (!hardlimit)
+                       hardlimit = q->qi_bhardlimit;
+               softlimit = be64_to_cpu(dqp->q_core.d_blk_softlimit);
+               if (!softlimit)
+                       softlimit = q->qi_bsoftlimit;
+               timer = be32_to_cpu(dqp->q_core.d_btimer);
+               warns = be16_to_cpu(dqp->q_core.d_bwarns);
+               warnlimit = dqp->q_mount->m_quotainfo->qi_bwarnlimit;
+               resbcountp = &dqp->q_res_bcount;
+       } else {
+               ASSERT(flags & XFS_TRANS_DQ_RES_RTBLKS);
+               hardlimit = be64_to_cpu(dqp->q_core.d_rtb_hardlimit);
+               if (!hardlimit)
+                       hardlimit = q->qi_rtbhardlimit;
+               softlimit = be64_to_cpu(dqp->q_core.d_rtb_softlimit);
+               if (!softlimit)
+                       softlimit = q->qi_rtbsoftlimit;
+               timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
+               warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
+               warnlimit = dqp->q_mount->m_quotainfo->qi_rtbwarnlimit;
+               resbcountp = &dqp->q_res_rtbcount;
+       }
+
+       if ((flags & XFS_QMOPT_FORCE_RES) == 0 &&
+           dqp->q_core.d_id &&
+           ((XFS_IS_UQUOTA_ENFORCED(dqp->q_mount) && XFS_QM_ISUDQ(dqp)) ||
+            (XFS_IS_OQUOTA_ENFORCED(dqp->q_mount) &&
+             (XFS_QM_ISPDQ(dqp) || XFS_QM_ISGDQ(dqp))))) {
+               if (nblks > 0) {
+                       /*
+                        * dquot is locked already. See if we'd go over the
+                        * hardlimit or exceed the timelimit if we allocate
+                        * nblks.
+                        */
+                       if (hardlimit > 0ULL &&
+                           hardlimit <= nblks + *resbcountp) {
+                               xfs_quota_warn(mp, dqp, QUOTA_NL_BHARDWARN);
+                               goto error_return;
+                       }
+                       if (softlimit > 0ULL &&
+                           softlimit <= nblks + *resbcountp) {
+                               if ((timer != 0 && get_seconds() > timer) ||
+                                   (warns != 0 && warns >= warnlimit)) {
+                                       xfs_quota_warn(mp, dqp,
+                                                      QUOTA_NL_BSOFTLONGWARN);
+                                       goto error_return;
+                               }
+
+                               xfs_quota_warn(mp, dqp, QUOTA_NL_BSOFTWARN);
+                       }
+               }
+               if (ninos > 0) {
+                       count = be64_to_cpu(dqp->q_core.d_icount);
+                       timer = be32_to_cpu(dqp->q_core.d_itimer);
+                       warns = be16_to_cpu(dqp->q_core.d_iwarns);
+                       warnlimit = dqp->q_mount->m_quotainfo->qi_iwarnlimit;
+                       hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
+                       if (!hardlimit)
+                               hardlimit = q->qi_ihardlimit;
+                       softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
+                       if (!softlimit)
+                               softlimit = q->qi_isoftlimit;
+
+                       if (hardlimit > 0ULL && count >= hardlimit) {
+                               xfs_quota_warn(mp, dqp, QUOTA_NL_IHARDWARN);
+                               goto error_return;
+                       }
+                       if (softlimit > 0ULL && count >= softlimit) {
+                               if  ((timer != 0 && get_seconds() > timer) ||
+                                    (warns != 0 && warns >= warnlimit)) {
+                                       xfs_quota_warn(mp, dqp,
+                                                      QUOTA_NL_ISOFTLONGWARN);
+                                       goto error_return;
+                               }
+                               xfs_quota_warn(mp, dqp, QUOTA_NL_ISOFTWARN);
+                       }
+               }
+       }
+
+       /*
+        * Change the reservation, but not the actual usage.
+        * Note that q_res_bcount = q_core.d_bcount + resv
+        */
+       (*resbcountp) += (xfs_qcnt_t)nblks;
+       if (ninos != 0)
+               dqp->q_res_icount += (xfs_qcnt_t)ninos;
+
+       /*
+        * note the reservation amt in the trans struct too,
+        * so that the transaction knows how much was reserved by
+        * it against this particular dquot.
+        * We don't do this when we are reserving for a delayed allocation,
+        * because we don't have the luxury of a transaction envelope then.
+        */
+       if (tp) {
+               ASSERT(tp->t_dqinfo);
+               ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+               if (nblks != 0)
+                       xfs_trans_mod_dquot(tp, dqp,
+                                           flags & XFS_QMOPT_RESBLK_MASK,
+                                           nblks);
+               if (ninos != 0)
+                       xfs_trans_mod_dquot(tp, dqp,
+                                           XFS_TRANS_DQ_RES_INOS,
+                                           ninos);
+       }
+       ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
+       ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
+       ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+
+       xfs_dqunlock(dqp);
+       return 0;
+
+error_return:
+       xfs_dqunlock(dqp);
+       if (flags & XFS_QMOPT_ENOSPC)
+               return ENOSPC;
+       return EDQUOT;
+}
+
+
+/*
+ * Given dquot(s), make disk block and/or inode reservations against them.
+ * The fact that this does the reservation against both the usr and
+ * grp/prj quotas is important, because this follows a both-or-nothing
+ * approach.
+ *
+ * flags = XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown.
+ *        XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT.  Used by pquota.
+ *        XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks
+ *        XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks
+ * dquots are unlocked on return, if they were not locked by caller.
+ */
+int
+xfs_trans_reserve_quota_bydquots(
+       xfs_trans_t     *tp,
+       xfs_mount_t     *mp,
+       xfs_dquot_t     *udqp,
+       xfs_dquot_t     *gdqp,
+       long            nblks,
+       long            ninos,
+       uint            flags)
+{
+       int             resvd = 0, error;
+
+       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+               return 0;
+
+       if (tp && tp->t_dqinfo == NULL)
+               xfs_trans_alloc_dqinfo(tp);
+
+       ASSERT(flags & XFS_QMOPT_RESBLK_MASK);
+
+       if (udqp) {
+               error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos,
+                                       (flags & ~XFS_QMOPT_ENOSPC));
+               if (error)
+                       return error;
+               resvd = 1;
+       }
+
+       if (gdqp) {
+               error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
+               if (error) {
+                       /*
+                        * can't do it, so backout previous reservation
+                        */
+                       if (resvd) {
+                               flags |= XFS_QMOPT_FORCE_RES;
+                               xfs_trans_dqresv(tp, mp, udqp,
+                                                -nblks, -ninos, flags);
+                       }
+                       return error;
+               }
+       }
+
+       /*
+        * Didn't change anything critical, so, no need to log
+        */
+       return 0;
+}
+
+
+/*
+ * Lock the dquot and change the reservation if we can.
+ * This doesn't change the actual usage, just the reservation.
+ * The inode sent in is locked.
+ */
+int
+xfs_trans_reserve_quota_nblks(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *ip,
+       long                    nblks,
+       long                    ninos,
+       uint                    flags)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+
+       if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp))
+               return 0;
+       if (XFS_IS_PQUOTA_ON(mp))
+               flags |= XFS_QMOPT_ENOSPC;
+
+       ASSERT(ip->i_ino != mp->m_sb.sb_uquotino);
+       ASSERT(ip->i_ino != mp->m_sb.sb_gquotino);
+
+       ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+       ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+                               XFS_TRANS_DQ_RES_RTBLKS ||
+              (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) ==
+                               XFS_TRANS_DQ_RES_BLKS);
+
+       /*
+        * Reserve nblks against these dquots, with trans as the mediator.
+        */
+       return xfs_trans_reserve_quota_bydquots(tp, mp,
+                                               ip->i_udquot, ip->i_gdquot,
+                                               nblks, ninos, flags);
+}
+
+/*
+ * This routine is called to allocate a quotaoff log item.
+ */
+xfs_qoff_logitem_t *
+xfs_trans_get_qoff_item(
+       xfs_trans_t             *tp,
+       xfs_qoff_logitem_t      *startqoff,
+       uint                    flags)
+{
+       xfs_qoff_logitem_t      *q;
+
+       ASSERT(tp != NULL);
+
+       q = xfs_qm_qoff_logitem_init(tp->t_mountp, startqoff, flags);
+       ASSERT(q != NULL);
+
+       /*
+        * Get a log_item_desc to point at the new item.
+        */
+       xfs_trans_add_item(tp, &q->qql_item);
+       return q;
+}
+
+
+/*
+ * This is called to mark the quotaoff logitem as needing
+ * to be logged when the transaction is committed.  The logitem must
+ * already be associated with the given transaction.
+ */
+void
+xfs_trans_log_quotaoff_item(
+       xfs_trans_t             *tp,
+       xfs_qoff_logitem_t      *qlp)
+{
+       tp->t_flags |= XFS_TRANS_DIRTY;
+       qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+}
+
+STATIC void
+xfs_trans_alloc_dqinfo(
+       xfs_trans_t     *tp)
+{
+       tp->t_dqinfo = kmem_zone_zalloc(xfs_Gqm->qm_dqtrxzone, KM_SLEEP);
+}
+
+void
+xfs_trans_free_dqinfo(
+       xfs_trans_t     *tp)
+{
+       if (!tp->t_dqinfo)
+               return;
+       kmem_zone_free(xfs_Gqm->qm_dqtrxzone, tp->t_dqinfo);
+       tp->t_dqinfo = NULL;
+}
diff --git a/fs/xfs/xfs_vnode.h b/fs/xfs/xfs_vnode.h
new file mode 100644 (file)
index 0000000..7c220b4
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef __XFS_VNODE_H__
+#define __XFS_VNODE_H__
+
+#include "xfs_fs.h"
+
+struct file;
+struct xfs_inode;
+struct xfs_iomap;
+struct attrlist_cursor_kern;
+
+/*
+ * Return values for xfs_inactive.  A return value of
+ * VN_INACTIVE_NOCACHE implies that the file system behavior
+ * has disassociated its state and bhv_desc_t from the vnode.
+ */
+#define        VN_INACTIVE_CACHE       0
+#define        VN_INACTIVE_NOCACHE     1
+
+/*
+ * Flags for read/write calls - same values as IRIX
+ */
+#define IO_ISDIRECT    0x00004         /* bypass page cache */
+#define IO_INVIS       0x00020         /* don't update inode timestamps */
+
+#define XFS_IO_FLAGS \
+       { IO_ISDIRECT,  "DIRECT" }, \
+       { IO_INVIS,     "INVIS"}
+
+/*
+ * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
+ */
+#define FI_NONE                        0       /* none */
+#define FI_REMAPF              1       /* Do a remapf prior to the operation */
+#define FI_REMAPF_LOCKED       2       /* Do a remapf prior to the operation.
+                                          Prevent VM access to the pages until
+                                          the operation completes. */
+
+/*
+ * Some useful predicates.
+ */
+#define VN_MAPPED(vp)  mapping_mapped(vp->i_mapping)
+#define VN_CACHED(vp)  (vp->i_mapping->nrpages)
+#define VN_DIRTY(vp)   mapping_tagged(vp->i_mapping, \
+                                       PAGECACHE_TAG_DIRTY)
+
+
+#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
new file mode 100644 (file)
index 0000000..87d3e03
--- /dev/null
@@ -0,0 +1,241 @@
+/*
+ * Copyright (C) 2008 Christoph Hellwig.
+ * Portions Copyright (C) 2000-2008 Silicon Graphics, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "xfs.h"
+#include "xfs_da_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_attr.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_acl.h"
+#include "xfs_vnodeops.h"
+
+#include <linux/posix_acl_xattr.h>
+#include <linux/xattr.h>
+
+
+static int
+xfs_xattr_get(struct dentry *dentry, const char *name,
+               void *value, size_t size, int xflags)
+{
+       struct xfs_inode *ip = XFS_I(dentry->d_inode);
+       int error, asize = size;
+
+       if (strcmp(name, "") == 0)
+               return -EINVAL;
+
+       /* Convert Linux syscall to XFS internal ATTR flags */
+       if (!size) {
+               xflags |= ATTR_KERNOVAL;
+               value = NULL;
+       }
+
+       error = -xfs_attr_get(ip, (unsigned char *)name, value, &asize, xflags);
+       if (error)
+               return error;
+       return asize;
+}
+
+static int
+xfs_xattr_set(struct dentry *dentry, const char *name, const void *value,
+               size_t size, int flags, int xflags)
+{
+       struct xfs_inode *ip = XFS_I(dentry->d_inode);
+
+       if (strcmp(name, "") == 0)
+               return -EINVAL;
+
+       /* Convert Linux syscall to XFS internal ATTR flags */
+       if (flags & XATTR_CREATE)
+               xflags |= ATTR_CREATE;
+       if (flags & XATTR_REPLACE)
+               xflags |= ATTR_REPLACE;
+
+       if (!value)
+               return -xfs_attr_remove(ip, (unsigned char *)name, xflags);
+       return -xfs_attr_set(ip, (unsigned char *)name,
+                               (void *)value, size, xflags);
+}
+
+static const struct xattr_handler xfs_xattr_user_handler = {
+       .prefix = XATTR_USER_PREFIX,
+       .flags  = 0, /* no flags implies user namespace */
+       .get    = xfs_xattr_get,
+       .set    = xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_trusted_handler = {
+       .prefix = XATTR_TRUSTED_PREFIX,
+       .flags  = ATTR_ROOT,
+       .get    = xfs_xattr_get,
+       .set    = xfs_xattr_set,
+};
+
+static const struct xattr_handler xfs_xattr_security_handler = {
+       .prefix = XATTR_SECURITY_PREFIX,
+       .flags  = ATTR_SECURE,
+       .get    = xfs_xattr_get,
+       .set    = xfs_xattr_set,
+};
+
+const struct xattr_handler *xfs_xattr_handlers[] = {
+       &xfs_xattr_user_handler,
+       &xfs_xattr_trusted_handler,
+       &xfs_xattr_security_handler,
+#ifdef CONFIG_XFS_POSIX_ACL
+       &xfs_xattr_acl_access_handler,
+       &xfs_xattr_acl_default_handler,
+#endif
+       NULL
+};
+
+static unsigned int xfs_xattr_prefix_len(int flags)
+{
+       if (flags & XFS_ATTR_SECURE)
+               return sizeof("security");
+       else if (flags & XFS_ATTR_ROOT)
+               return sizeof("trusted");
+       else
+               return sizeof("user");
+}
+
+static const char *xfs_xattr_prefix(int flags)
+{
+       if (flags & XFS_ATTR_SECURE)
+               return xfs_xattr_security_handler.prefix;
+       else if (flags & XFS_ATTR_ROOT)
+               return xfs_xattr_trusted_handler.prefix;
+       else
+               return xfs_xattr_user_handler.prefix;
+}
+
+static int
+xfs_xattr_put_listent(
+       struct xfs_attr_list_context *context,
+       int             flags,
+       unsigned char   *name,
+       int             namelen,
+       int             valuelen,
+       unsigned char   *value)
+{
+       unsigned int prefix_len = xfs_xattr_prefix_len(flags);
+       char *offset;
+       int arraytop;
+
+       ASSERT(context->count >= 0);
+
+       /*
+        * Only show root namespace entries if we are actually allowed to
+        * see them.
+        */
+       if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
+               return 0;
+
+       arraytop = context->count + prefix_len + namelen + 1;
+       if (arraytop > context->firstu) {
+               context->count = -1;    /* insufficient space */
+               return 1;
+       }
+       offset = (char *)context->alist + context->count;
+       strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+       offset += prefix_len;
+       strncpy(offset, (char *)name, namelen);                 /* real name */
+       offset += namelen;
+       *offset = '\0';
+       context->count += prefix_len + namelen + 1;
+       return 0;
+}
+
+static int
+xfs_xattr_put_listent_sizes(
+       struct xfs_attr_list_context *context,
+       int             flags,
+       unsigned char   *name,
+       int             namelen,
+       int             valuelen,
+       unsigned char   *value)
+{
+       context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
+       return 0;
+}
+
+static int
+list_one_attr(const char *name, const size_t len, void *data,
+               size_t size, ssize_t *result)
+{
+       char *p = data + *result;
+
+       *result += len;
+       if (!size)
+               return 0;
+       if (*result > size)
+               return -ERANGE;
+
+       strcpy(p, name);
+       return 0;
+}
+
+ssize_t
+xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
+{
+       struct xfs_attr_list_context context;
+       struct attrlist_cursor_kern cursor = { 0 };
+       struct inode            *inode = dentry->d_inode;
+       int                     error;
+
+       /*
+        * First read the regular on-disk attributes.
+        */
+       memset(&context, 0, sizeof(context));
+       context.dp = XFS_I(inode);
+       context.cursor = &cursor;
+       context.resynch = 1;
+       context.alist = data;
+       context.bufsize = size;
+       context.firstu = context.bufsize;
+
+       if (size)
+               context.put_listent = xfs_xattr_put_listent;
+       else
+               context.put_listent = xfs_xattr_put_listent_sizes;
+
+       xfs_attr_list_int(&context);
+       if (context.count < 0)
+               return -ERANGE;
+
+       /*
+        * Then add the two synthetic ACL attributes.
+        */
+       if (posix_acl_access_exists(inode)) {
+               error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
+                               strlen(POSIX_ACL_XATTR_ACCESS) + 1,
+                               data, size, &context.count);
+               if (error)
+                       return error;
+       }
+
+       if (posix_acl_default_exists(inode)) {
+               error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
+                               strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
+                               data, size, &context.count);
+               if (error)
+                       return error;
+       }
+
+       return context.count;
+}
index fb2d63f..aea9e45 100644 (file)
@@ -39,7 +39,7 @@
 })
 
 #define __page_to_pfn(pg)                                              \
-({     struct page *__pg = (pg);                                       \
+({     const struct page *__pg = (pg);                                 \
        struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg));     \
        (unsigned long)(__pg - __pgdat->node_mem_map) +                 \
         __pgdat->node_start_pfn;                                       \
@@ -57,7 +57,7 @@
  * section[i].section_mem_map == mem_map's address - start_pfn;
  */
 #define __page_to_pfn(pg)                                      \
-({     struct page *__pg = (pg);                               \
+({     const struct page *__pg = (pg);                         \
        int __sec = page_to_section(__pg);                      \
        (unsigned long)(__pg - __section_mem_map_addr(__nr_to_section(__sec))); \
 })
index 6395692..32f0076 100644 (file)
@@ -125,7 +125,11 @@ enum rq_flag_bits {
        __REQ_SYNC,             /* request is sync (sync write or read) */
        __REQ_META,             /* metadata io request */
        __REQ_DISCARD,          /* request to discard sectors */
+       __REQ_SECURE,           /* secure discard (used with __REQ_DISCARD) */
+
        __REQ_NOIDLE,           /* don't anticipate more IO after this one */
+       __REQ_FUA,              /* forced unit access */
+       __REQ_FLUSH,            /* request for cache flush */
 
        /* bio only flags */
        __REQ_RAHEAD,           /* read ahead, can fail anytime */
@@ -135,7 +139,6 @@ enum rq_flag_bits {
        /* request only flags */
        __REQ_SORTED,           /* elevator knows about this request */
        __REQ_SOFTBARRIER,      /* may not be passed by ioscheduler */
-       __REQ_FUA,              /* forced unit access */
        __REQ_NOMERGE,          /* don't touch this for merging */
        __REQ_STARTED,          /* drive already may have started this one */
        __REQ_DONTPREP,         /* don't call prep for this one */
@@ -146,11 +149,9 @@ enum rq_flag_bits {
        __REQ_PREEMPT,          /* set for "ide_preempt" requests */
        __REQ_ALLOCED,          /* request came from our alloc pool */
        __REQ_COPY_USER,        /* contains copies of user pages */
-       __REQ_FLUSH,            /* request for cache flush */
        __REQ_FLUSH_SEQ,        /* request for flush sequence */
        __REQ_IO_STAT,          /* account I/O stat */
        __REQ_MIXED_MERGE,      /* merge of different types, fail separately */
-       __REQ_SECURE,           /* secure discard (used with __REQ_DISCARD) */
        __REQ_NR_BITS,          /* stops here */
 };
 
index 0e67c45..84b15d5 100644 (file)
@@ -30,6 +30,7 @@ struct request_pm_state;
 struct blk_trace;
 struct request;
 struct sg_io_hdr;
+struct bsg_job;
 
 #define BLKDEV_MIN_RQ  4
 #define BLKDEV_MAX_RQ  128     /* Default maximum */
@@ -117,6 +118,7 @@ struct request {
                struct {
                        unsigned int            seq;
                        struct list_head        list;
+                       rq_end_io_fn            *saved_end_io;
                } flush;
        };
 
@@ -209,6 +211,7 @@ typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *,
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 typedef int (lld_busy_fn) (struct request_queue *q);
+typedef int (bsg_job_fn) (struct bsg_job *);
 
 enum blk_eh_timer_return {
        BLK_EH_NOT_HANDLED,
@@ -375,6 +378,8 @@ struct request_queue {
        struct mutex            sysfs_lock;
 
 #if defined(CONFIG_BLK_DEV_BSG)
+       bsg_job_fn              *bsg_job_fn;
+       int                     bsg_job_size;
        struct bsg_class_device bsg_dev;
 #endif
 
index 8c7c2de..8e9e4bc 100644 (file)
@@ -14,7 +14,7 @@
 enum blktrace_cat {
        BLK_TC_READ     = 1 << 0,       /* reads */
        BLK_TC_WRITE    = 1 << 1,       /* writes */
-       BLK_TC_BARRIER  = 1 << 2,       /* barrier */
+       BLK_TC_FLUSH    = 1 << 2,       /* flush */
        BLK_TC_SYNC     = 1 << 3,       /* sync IO */
        BLK_TC_SYNCIO   = BLK_TC_SYNC,
        BLK_TC_QUEUE    = 1 << 4,       /* queueing/merging */
@@ -28,8 +28,9 @@ enum blktrace_cat {
        BLK_TC_META     = 1 << 12,      /* metadata */
        BLK_TC_DISCARD  = 1 << 13,      /* discard requests */
        BLK_TC_DRV_DATA = 1 << 14,      /* binary per-driver data */
+       BLK_TC_FUA      = 1 << 15,      /* fua requests */
 
-       BLK_TC_END      = 1 << 15,      /* only 16-bits, reminder */
+       BLK_TC_END      = 1 << 15,      /* we've run out of bits! */
 };
 
 #define BLK_TC_SHIFT           (16)
diff --git a/include/linux/bsg-lib.h b/include/linux/bsg-lib.h
new file mode 100644 (file)
index 0000000..f55ab8c
--- /dev/null
@@ -0,0 +1,73 @@
+/*
+ *  BSG helper library
+ *
+ *  Copyright (C) 2008   James Smart, Emulex Corporation
+ *  Copyright (C) 2011   Red Hat, Inc.  All rights reserved.
+ *  Copyright (C) 2011   Mike Christie
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+#ifndef _BLK_BSG_
+#define _BLK_BSG_
+
+#include <linux/blkdev.h>
+
+struct request;
+struct device;
+struct scatterlist;
+struct request_queue;
+
+struct bsg_buffer {
+       unsigned int payload_len;
+       int sg_cnt;
+       struct scatterlist *sg_list;
+};
+
+struct bsg_job {
+       struct device *dev;
+       struct request *req;
+
+       /* Transport/driver specific request/reply structs */
+       void *request;
+       void *reply;
+
+       unsigned int request_len;
+       unsigned int reply_len;
+       /*
+        * On entry : reply_len indicates the buffer size allocated for
+        * the reply.
+        *
+        * Upon completion : the message handler must set reply_len
+        *  to indicates the size of the reply to be returned to the
+        *  caller.
+        */
+
+       /* DMA payloads for the request/response */
+       struct bsg_buffer request_payload;
+       struct bsg_buffer reply_payload;
+
+       void *dd_data;          /* Used for driver-specific storage */
+};
+
+void bsg_job_done(struct bsg_job *job, int result,
+                 unsigned int reply_payload_rcv_len);
+int bsg_setup_queue(struct device *dev, struct request_queue *q, char *name,
+                   bsg_job_fn *job_fn, int dd_job_size);
+void bsg_request_fn(struct request_queue *q);
+void bsg_remove_queue(struct request_queue *q);
+void bsg_goose_queue(struct request_queue *q);
+
+#endif
index 06d25c1..b80506b 100644 (file)
@@ -63,7 +63,7 @@ static inline u32 hash_32(u32 val, unsigned int bits)
        return hash >> (32 - bits);
 }
 
-static inline unsigned long hash_ptr(void *ptr, unsigned int bits)
+static inline unsigned long hash_ptr(const void *ptr, unsigned int bits)
 {
        return hash_long((unsigned long)ptr, bits);
 }
index 87a06f3..5951730 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/errno.h>
 #include <linux/topology.h>
 #include <linux/wait.h>
+#include <linux/module.h>
 
 #include <asm/irq.h>
 #include <asm/ptrace.h>
@@ -547,7 +548,15 @@ static inline struct msi_desc *irq_data_get_msi(struct irq_data *d)
        return d->msi_desc;
 }
 
-int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node);
+int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
+               struct module *owner);
+
+static inline int irq_alloc_descs(int irq, unsigned int from, unsigned int cnt,
+               int node)
+{
+       return __irq_alloc_descs(irq, from, cnt, node, THIS_MODULE);
+}
+
 void irq_free_descs(unsigned int irq, unsigned int cnt);
 int irq_reserve_irqs(unsigned int from, unsigned int cnt);
 
index 2d921b3..150134a 100644 (file)
@@ -66,6 +66,7 @@ struct irq_desc {
 #ifdef CONFIG_PROC_FS
        struct proc_dir_entry   *dir;
 #endif
+       struct module           *owner;
        const char              *name;
 } ____cacheline_internodealigned_in_smp;
 
index 66c194e..683d698 100644 (file)
@@ -64,7 +64,6 @@ struct loop_device {
 
        struct request_queue    *lo_queue;
        struct gendisk          *lo_disk;
-       struct list_head        lo_list;
 };
 
 #endif /* __KERNEL__ */
@@ -161,4 +160,8 @@ int loop_unregister_transfer(int number);
 #define LOOP_CHANGE_FD         0x4C06
 #define LOOP_SET_CAPACITY      0x4C07
 
+/* /dev/loop-control interface */
+#define LOOP_CTL_ADD           0x4C80
+#define LOOP_CTL_REMOVE                0x4C81
+#define LOOP_CTL_GET_FREE      0x4C82
 #endif
index 18fd130..c309b1e 100644 (file)
@@ -40,6 +40,7 @@
 #define BTRFS_MINOR            234
 #define AUTOFS_MINOR           235
 #define MAPPER_CTRL_MINOR      236
+#define LOOP_CTRL_MINOR                237
 #define MISC_DYNAMIC_MINOR     255
 
 struct device;
index fd599f4..7438071 100644 (file)
@@ -685,7 +685,7 @@ static inline void set_page_section(struct page *page, unsigned long section)
        page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT;
 }
 
-static inline unsigned long page_to_section(struct page *page)
+static inline unsigned long page_to_section(const struct page *page)
 {
        return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
 }
@@ -720,7 +720,7 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
 
 static __always_inline void *lowmem_page_address(const struct page *page)
 {
-       return __va(PFN_PHYS(page_to_pfn((struct page *)page)));
+       return __va(PFN_PHYS(page_to_pfn(page)));
 }
 
 #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL)
@@ -737,7 +737,7 @@ static __always_inline void *lowmem_page_address(const struct page *page)
 #endif
 
 #if defined(HASHED_PAGE_VIRTUAL)
-void *page_address(struct page *page);
+void *page_address(const struct page *page);
 void set_page_address(struct page *page, void *virtual);
 void page_address_init(void);
 #endif
index 0f83858..1d09562 100644 (file)
@@ -56,8 +56,6 @@ struct mmc_ios {
 #define MMC_TIMING_UHS_SDR104  4
 #define MMC_TIMING_UHS_DDR50   5
 
-       unsigned char   ddr;                    /* dual data rate used */
-
 #define MMC_SDR_MODE           0
 #define MMC_1_2V_DDR_MODE      1
 #define MMC_1_8V_DDR_MODE      2
index f27893b..8c230cb 100644 (file)
@@ -251,7 +251,8 @@ struct pci_dev {
        u8              revision;       /* PCI revision, low byte of class word */
        u8              hdr_type;       /* PCI header type (`multi' flag masked out) */
        u8              pcie_cap;       /* PCI-E capability offset */
-       u8              pcie_type;      /* PCI-E device/port type */
+       u8              pcie_type:4;    /* PCI-E device/port type */
+       u8              pcie_mpss:3;    /* PCI-E Max Payload Size Supported */
        u8              rom_base_reg;   /* which config register controls the ROM */
        u8              pin;            /* which interrupt pin this device uses */
 
@@ -617,6 +618,16 @@ struct pci_driver {
 /* these external functions are only available when PCI support is enabled */
 #ifdef CONFIG_PCI
 
+extern void pcie_bus_configure_settings(struct pci_bus *bus, u8 smpss);
+
+enum pcie_bus_config_types {
+       PCIE_BUS_PERFORMANCE,
+       PCIE_BUS_SAFE,
+       PCIE_BUS_PEER2PEER,
+};
+
+extern enum pcie_bus_config_types pcie_bus_config;
+
 extern struct bus_type pci_bus_type;
 
 /* Do NOT directly access these two variables, unless you are arch specific pci
@@ -796,10 +807,13 @@ int pcix_get_mmrbc(struct pci_dev *dev);
 int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc);
 int pcie_get_readrq(struct pci_dev *dev);
 int pcie_set_readrq(struct pci_dev *dev, int rq);
+int pcie_get_mps(struct pci_dev *dev);
+int pcie_set_mps(struct pci_dev *dev, int mps);
 int __pci_reset_function(struct pci_dev *dev);
 int pci_reset_function(struct pci_dev *dev);
 void pci_update_resource(struct pci_dev *dev, int resno);
 int __must_check pci_assign_resource(struct pci_dev *dev, int i);
+int __must_check pci_reassign_resource(struct pci_dev *dev, int i, resource_size_t add_size, resource_size_t align);
 int pci_select_bars(struct pci_dev *dev, unsigned long flags);
 
 /* ROM control related routines */
index 21097cb..f9ec173 100644 (file)
@@ -72,8 +72,6 @@ extern int pm_genpd_remove_subdomain(struct generic_pm_domain *genpd,
 extern void pm_genpd_init(struct generic_pm_domain *genpd,
                          struct dev_power_governor *gov, bool is_off);
 extern int pm_genpd_poweron(struct generic_pm_domain *genpd);
-extern void pm_genpd_poweroff_unused(void);
-extern void genpd_queue_power_off_work(struct generic_pm_domain *genpd);
 #else
 static inline int pm_genpd_add_device(struct generic_pm_domain *genpd,
                                      struct device *dev)
@@ -101,8 +99,14 @@ static inline int pm_genpd_poweron(struct generic_pm_domain *genpd)
 {
        return -ENOSYS;
 }
-static inline void pm_genpd_poweroff_unused(void) {}
+#endif
+
+#ifdef CONFIG_PM_GENERIC_DOMAINS_RUNTIME
+extern void genpd_queue_power_off_work(struct generic_pm_domain *genpd);
+extern void pm_genpd_poweroff_unused(void);
+#else
 static inline void genpd_queue_power_off_work(struct generic_pm_domain *gpd) {}
+static inline void pm_genpd_poweroff_unused(void) {}
 #endif
 
 #endif /* _LINUX_PM_DOMAIN_H */
index b27ebea..93f4d03 100644 (file)
@@ -97,6 +97,9 @@ struct rtc_pll_info {
 #define RTC_AF 0x20    /* Alarm interrupt */
 #define RTC_UF 0x10    /* Update interrupt for 1Hz RTC */
 
+
+#define RTC_MAX_FREQ   8192
+
 #ifdef __KERNEL__
 
 #include <linux/types.h>
index 99e0308..ffd9bc7 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Platform data for Texas Instruments TLV320AIC3x codec
  *
- * Author: Jarkko Nikula <jhnikula@gmail.com>
+ * Author: Jarkko Nikula <jarkko.nikula@bitmer.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
index bf36654..05c5e61 100644 (file)
@@ -8,6 +8,8 @@
 #include <linux/blkdev.h>
 #include <linux/tracepoint.h>
 
+#define RWBS_LEN       8
+
 DECLARE_EVENT_CLASS(block_rq_with_error,
 
        TP_PROTO(struct request_queue *q, struct request *rq),
@@ -19,7 +21,7 @@ DECLARE_EVENT_CLASS(block_rq_with_error,
                __field(  sector_t,     sector                  )
                __field(  unsigned int, nr_sector               )
                __field(  int,          errors                  )
-               __array(  char,         rwbs,   6               )
+               __array(  char,         rwbs,   RWBS_LEN        )
                __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
        ),
 
@@ -104,7 +106,7 @@ DECLARE_EVENT_CLASS(block_rq,
                __field(  sector_t,     sector                  )
                __field(  unsigned int, nr_sector               )
                __field(  unsigned int, bytes                   )
-               __array(  char,         rwbs,   6               )
+               __array(  char,         rwbs,   RWBS_LEN        )
                __array(  char,         comm,   TASK_COMM_LEN   )
                __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
        ),
@@ -183,7 +185,7 @@ TRACE_EVENT(block_bio_bounce,
                __field( dev_t,         dev                     )
                __field( sector_t,      sector                  )
                __field( unsigned int,  nr_sector               )
-               __array( char,          rwbs,   6               )
+               __array( char,          rwbs,   RWBS_LEN        )
                __array( char,          comm,   TASK_COMM_LEN   )
        ),
 
@@ -222,7 +224,7 @@ TRACE_EVENT(block_bio_complete,
                __field( sector_t,      sector          )
                __field( unsigned,      nr_sector       )
                __field( int,           error           )
-               __array( char,          rwbs,   6       )
+               __array( char,          rwbs,   RWBS_LEN)
        ),
 
        TP_fast_assign(
@@ -249,7 +251,7 @@ DECLARE_EVENT_CLASS(block_bio,
                __field( dev_t,         dev                     )
                __field( sector_t,      sector                  )
                __field( unsigned int,  nr_sector               )
-               __array( char,          rwbs,   6               )
+               __array( char,          rwbs,   RWBS_LEN        )
                __array( char,          comm,   TASK_COMM_LEN   )
        ),
 
@@ -321,7 +323,7 @@ DECLARE_EVENT_CLASS(block_get_rq,
                __field( dev_t,         dev                     )
                __field( sector_t,      sector                  )
                __field( unsigned int,  nr_sector               )
-               __array( char,          rwbs,   6               )
+               __array( char,          rwbs,   RWBS_LEN        )
                __array( char,          comm,   TASK_COMM_LEN   )
         ),
 
@@ -456,7 +458,7 @@ TRACE_EVENT(block_split,
                __field( dev_t,         dev                             )
                __field( sector_t,      sector                          )
                __field( sector_t,      new_sector                      )
-               __array( char,          rwbs,           6               )
+               __array( char,          rwbs,           RWBS_LEN        )
                __array( char,          comm,           TASK_COMM_LEN   )
        ),
 
@@ -498,7 +500,7 @@ TRACE_EVENT(block_bio_remap,
                __field( unsigned int,  nr_sector       )
                __field( dev_t,         old_dev         )
                __field( sector_t,      old_sector      )
-               __array( char,          rwbs,   6       )
+               __array( char,          rwbs,   RWBS_LEN)
        ),
 
        TP_fast_assign(
@@ -542,7 +544,7 @@ TRACE_EVENT(block_rq_remap,
                __field( unsigned int,  nr_sector       )
                __field( dev_t,         old_dev         )
                __field( sector_t,      old_sector      )
-               __array( char,          rwbs,   6       )
+               __array( char,          rwbs,   RWBS_LEN)
        ),
 
        TP_fast_assign(
index 3a2cab4..e38544d 100644 (file)
@@ -246,7 +246,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
                gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
 
        for (i = gc->irq_base; msk; msk >>= 1, i++) {
-               if (!msk & 0x01)
+               if (!(msk & 0x01))
                        continue;
 
                if (flags & IRQ_GC_INIT_NESTED_LOCK)
@@ -301,7 +301,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
        raw_spin_unlock(&gc_lock);
 
        for (; msk; msk >>= 1, i++) {
-               if (!msk & 0x01)
+               if (!(msk & 0x01))
                        continue;
 
                /* Remove handler first. That will mask the irq line */
index 4c60a50..039b889 100644 (file)
@@ -70,7 +70,8 @@ static inline void desc_smp_init(struct irq_desc *desc, int node) { }
 static inline int desc_node(struct irq_desc *desc) { return 0; }
 #endif
 
-static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
+static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
+               struct module *owner)
 {
        int cpu;
 
@@ -86,6 +87,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
        desc->irq_count = 0;
        desc->irqs_unhandled = 0;
        desc->name = NULL;
+       desc->owner = owner;
        for_each_possible_cpu(cpu)
                *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
        desc_smp_init(desc, node);
@@ -128,7 +130,7 @@ static void free_masks(struct irq_desc *desc)
 static inline void free_masks(struct irq_desc *desc) { }
 #endif
 
-static struct irq_desc *alloc_desc(int irq, int node)
+static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
 {
        struct irq_desc *desc;
        gfp_t gfp = GFP_KERNEL;
@@ -147,7 +149,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
        raw_spin_lock_init(&desc->lock);
        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 
-       desc_set_defaults(irq, desc, node);
+       desc_set_defaults(irq, desc, node, owner);
 
        return desc;
 
@@ -173,13 +175,14 @@ static void free_desc(unsigned int irq)
        kfree(desc);
 }
 
-static int alloc_descs(unsigned int start, unsigned int cnt, int node)
+static int alloc_descs(unsigned int start, unsigned int cnt, int node,
+                      struct module *owner)
 {
        struct irq_desc *desc;
        int i;
 
        for (i = 0; i < cnt; i++) {
-               desc = alloc_desc(start + i, node);
+               desc = alloc_desc(start + i, node, owner);
                if (!desc)
                        goto err;
                mutex_lock(&sparse_irq_lock);
@@ -227,7 +230,7 @@ int __init early_irq_init(void)
                nr_irqs = initcnt;
 
        for (i = 0; i < initcnt; i++) {
-               desc = alloc_desc(i, node);
+               desc = alloc_desc(i, node, NULL);
                set_bit(i, allocated_irqs);
                irq_insert_desc(i, desc);
        }
@@ -261,7 +264,7 @@ int __init early_irq_init(void)
                alloc_masks(&desc[i], GFP_KERNEL, node);
                raw_spin_lock_init(&desc[i].lock);
                lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-               desc_set_defaults(i, &desc[i], node);
+               desc_set_defaults(i, &desc[i], node, NULL);
        }
        return arch_early_irq_init();
 }
@@ -276,8 +279,16 @@ static void free_desc(unsigned int irq)
        dynamic_irq_cleanup(irq);
 }
 
-static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
+static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
+                             struct module *owner)
 {
+       u32 i;
+
+       for (i = 0; i < cnt; i++) {
+               struct irq_desc *desc = irq_to_desc(start + i);
+
+               desc->owner = owner;
+       }
        return start;
 }
 
@@ -333,11 +344,13 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
  * @from:      Start the search from this irq number
  * @cnt:       Number of consecutive irqs to allocate.
  * @node:      Preferred node on which the irq descriptor should be allocated
+ * @owner:     Owning module (can be NULL)
  *
  * Returns the first irq number or error code
  */
 int __ref
-irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
+__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
+                 struct module *owner)
 {
        int start, ret;
 
@@ -366,13 +379,13 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
 
        bitmap_set(allocated_irqs, start, cnt);
        mutex_unlock(&sparse_irq_lock);
-       return alloc_descs(start, cnt, node);
+       return alloc_descs(start, cnt, node, owner);
 
 err:
        mutex_unlock(&sparse_irq_lock);
        return ret;
 }
-EXPORT_SYMBOL_GPL(irq_alloc_descs);
+EXPORT_SYMBOL_GPL(__irq_alloc_descs);
 
 /**
  * irq_reserve_irqs - mark irqs allocated
@@ -440,7 +453,7 @@ void dynamic_irq_cleanup(unsigned int irq)
        unsigned long flags;
 
        raw_spin_lock_irqsave(&desc->lock, flags);
-       desc_set_defaults(irq, desc, desc_node(desc));
+       desc_set_defaults(irq, desc, desc_node(desc), NULL);
        raw_spin_unlock_irqrestore(&desc->lock, flags);
 }
 
index 0a7840a..9b956fa 100644 (file)
@@ -883,6 +883,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
        if (desc->irq_data.chip == &no_irq_chip)
                return -ENOSYS;
+       if (!try_module_get(desc->owner))
+               return -ENODEV;
        /*
         * Some drivers like serial.c use request_irq() heavily,
         * so we have to be careful not to interfere with a
@@ -906,8 +908,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
         */
        nested = irq_settings_is_nested_thread(desc);
        if (nested) {
-               if (!new->thread_fn)
-                       return -EINVAL;
+               if (!new->thread_fn) {
+                       ret = -EINVAL;
+                       goto out_mput;
+               }
                /*
                 * Replace the primary handler which was provided from
                 * the driver for non nested interrupt handling by the
@@ -929,8 +933,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
                t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
                                   new->name);
-               if (IS_ERR(t))
-                       return PTR_ERR(t);
+               if (IS_ERR(t)) {
+                       ret = PTR_ERR(t);
+                       goto out_mput;
+               }
                /*
                 * We keep the reference to the task struct even if
                 * the thread dies to avoid that the interrupt code
@@ -1095,6 +1101,8 @@ out_thread:
                        kthread_stop(t);
                put_task_struct(t);
        }
+out_mput:
+       module_put(desc->owner);
        return ret;
 }
 
@@ -1203,6 +1211,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
                put_task_struct(action->thread);
        }
 
+       module_put(desc->owner);
        return action;
 }
 
index 8c24294..91d67ce 100644 (file)
@@ -3111,7 +3111,13 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
                if (!class)
                        class = look_up_lock_class(lock, 0);
 
-               if (DEBUG_LOCKS_WARN_ON(!class))
+               /*
+                * If look_up_lock_class() failed to find a class, we're trying
+                * to test if we hold a lock that has never yet been acquired.
+                * Clearly if the lock hasn't been acquired _ever_, we're not
+                * holding it either, so report failure.
+                */
+               if (!class)
                        return 0;
 
                if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
index b1914cb..3744c59 100644 (file)
@@ -231,3 +231,7 @@ config PM_CLK
 config PM_GENERIC_DOMAINS
        bool
        depends on PM
+
+config PM_GENERIC_DOMAINS_RUNTIME
+       def_bool y
+       depends on PM_RUNTIME && PM_GENERIC_DOMAINS
index 3b8e028..e8bffbe 100644 (file)
@@ -1,6 +1,6 @@
 #include <linux/stat.h>
 #include <linux/sysctl.h>
-#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
+#include "../fs/xfs/xfs_sysctl.h"
 #include <linux/sunrpc/debug.h>
 #include <linux/string.h>
 #include <net/ip_vs.h>
index 4e4932a..362da65 100644 (file)
@@ -1,6 +1,6 @@
 #include <linux/stat.h>
 #include <linux/sysctl.h>
-#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
+#include "../fs/xfs/xfs_sysctl.h"
 #include <linux/sunrpc/debug.h>
 #include <linux/string.h>
 #include <net/ip_vs.h>
index 6957aa2..7c910a5 100644 (file)
@@ -206,6 +206,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
        what |= MASK_TC_BIT(rw, RAHEAD);
        what |= MASK_TC_BIT(rw, META);
        what |= MASK_TC_BIT(rw, DISCARD);
+       what |= MASK_TC_BIT(rw, FLUSH);
+       what |= MASK_TC_BIT(rw, FUA);
 
        pid = tsk->pid;
        if (act_log_check(bt, what, sector, pid))
@@ -1054,6 +1056,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
                goto out;
        }
 
+       if (tc & BLK_TC_FLUSH)
+               rwbs[i++] = 'F';
+
        if (tc & BLK_TC_DISCARD)
                rwbs[i++] = 'D';
        else if (tc & BLK_TC_WRITE)
@@ -1063,10 +1068,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
        else
                rwbs[i++] = 'N';
 
+       if (tc & BLK_TC_FUA)
+               rwbs[i++] = 'F';
        if (tc & BLK_TC_AHEAD)
                rwbs[i++] = 'A';
-       if (tc & BLK_TC_BARRIER)
-               rwbs[i++] = 'B';
        if (tc & BLK_TC_SYNC)
                rwbs[i++] = 'S';
        if (tc & BLK_TC_META)
@@ -1132,7 +1137,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
 
 static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
 {
-       char rwbs[6];
+       char rwbs[RWBS_LEN];
        unsigned long long ts  = iter->ts;
        unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
        unsigned secs          = (unsigned long)ts;
@@ -1148,7 +1153,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
 
 static int blk_log_action(struct trace_iterator *iter, const char *act)
 {
-       char rwbs[6];
+       char rwbs[RWBS_LEN];
        const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
 
        fill_rwbs(rwbs, t);
@@ -1561,7 +1566,7 @@ static const struct {
 } mask_maps[] = {
        { BLK_TC_READ,          "read"          },
        { BLK_TC_WRITE,         "write"         },
-       { BLK_TC_BARRIER,       "barrier"       },
+       { BLK_TC_FLUSH,         "flush"         },
        { BLK_TC_SYNC,          "sync"          },
        { BLK_TC_QUEUE,         "queue"         },
        { BLK_TC_REQUEUE,       "requeue"       },
@@ -1573,6 +1578,7 @@ static const struct {
        { BLK_TC_META,          "meta"          },
        { BLK_TC_DISCARD,       "discard"       },
        { BLK_TC_DRV_DATA,      "drv_data"      },
+       { BLK_TC_FUA,           "fua"           },
 };
 
 static int blk_trace_str2mask(const char *str)
@@ -1788,6 +1794,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
 {
        int i = 0;
 
+       if (rw & REQ_FLUSH)
+               rwbs[i++] = 'F';
+
        if (rw & WRITE)
                rwbs[i++] = 'W';
        else if (rw & REQ_DISCARD)
@@ -1797,6 +1806,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
        else
                rwbs[i++] = 'N';
 
+       if (rw & REQ_FUA)
+               rwbs[i++] = 'F';
        if (rw & REQ_RAHEAD)
                rwbs[i++] = 'A';
        if (rw & REQ_SYNC)
index 693394d..5ef672c 100644 (file)
@@ -326,7 +326,7 @@ static struct page_address_slot {
        spinlock_t lock;                        /* Protect this bucket's list */
 } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER];
 
-static struct page_address_slot *page_slot(struct page *page)
+static struct page_address_slot *page_slot(const struct page *page)
 {
        return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)];
 }
@@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page)
  *
  * Returns the page's virtual address.
  */
-void *page_address(struct page *page)
+void *page_address(const struct page *page)
 {
        unsigned long flags;
        void *ret;
index 464621d..7ef0903 100644 (file)
@@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr)
 #define VMAP_BBMAP_BITS_MIN    (VMAP_MAX_ALLOC*2)
 #define VMAP_MIN(x, y)         ((x) < (y) ? (x) : (y)) /* can't use min() */
 #define VMAP_MAX(x, y)         ((x) > (y) ? (x) : (y)) /* can't use max() */
-#define VMAP_BBMAP_BITS                VMAP_MIN(VMAP_BBMAP_BITS_MAX,           \
-                                       VMAP_MAX(VMAP_BBMAP_BITS_MIN,   \
-                                               VMALLOC_PAGES / NR_CPUS / 16))
+#define VMAP_BBMAP_BITS                \
+               VMAP_MIN(VMAP_BBMAP_BITS_MAX,   \
+               VMAP_MAX(VMAP_BBMAP_BITS_MIN,   \
+                       VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))
 
 #define VMAP_BLOCK_SIZE                (VMAP_BBMAP_BITS * PAGE_SIZE)
 
index 3fd1a7e..552b97a 100644 (file)
@@ -1073,10 +1073,10 @@ static int aoa_fabric_layout_probe(struct soundbus_dev *sdev)
        sdev->pcmid = -1;
        list_del(&ldev->list);
        layouts_list_items--;
+       kfree(ldev);
  outnodev:
        of_node_put(sound);
        layout_device = NULL;
-       kfree(ldev);
        return -ENODEV;
 }
 
index 200c9a1..a872d0a 100644 (file)
@@ -1909,6 +1909,7 @@ static unsigned int ad1981_jacks_whitelist[] = {
        0x103c0944, /* HP nc6220 */
        0x103c0934, /* HP nc8220 */
        0x103c006d, /* HP nx9105 */
+       0x103c300d, /* HP Compaq dc5100 SFF(PT003AW) */
        0x17340088, /* FSC Scenic-W */
        0 /* end */
 };
index e4d76a2..579fc0d 100644 (file)
@@ -2625,16 +2625,19 @@ snd_azf3328_probe(struct pci_dev *pci, const struct pci_device_id *pci_id)
        int err;
 
        snd_azf3328_dbgcallenter();
-       if (dev >= SNDRV_CARDS)
-               return -ENODEV;
+       if (dev >= SNDRV_CARDS) {
+               err = -ENODEV;
+               goto out;
+       }
        if (!enable[dev]) {
                dev++;
-               return -ENOENT;
+               err = -ENOENT;
+               goto out;
        }
 
        err = snd_card_create(index[dev], id[dev], THIS_MODULE, 0, &card);
        if (err < 0)
-               return err;
+               goto out;
 
        strcpy(card->driver, "AZF3328");
        strcpy(card->shortname, "Aztech AZF3328 (PCI168)");
index be58bf2..2e5876c 100644 (file)
@@ -476,8 +476,8 @@ static const struct snd_pci_quirk alc268_ssid_cfg_tbl[] = {
 
 static const struct alc_config_preset alc268_presets[] = {
        [ALC267_QUANTA_IL1] = {
-               .mixers = { alc267_quanta_il1_mixer, alc268_beep_mixer,
-                           alc268_capture_nosrc_mixer },
+               .mixers = { alc267_quanta_il1_mixer, alc268_beep_mixer },
+               .cap_mixer = alc268_capture_nosrc_mixer,
                .init_verbs = { alc268_base_init_verbs, alc268_eapd_verbs,
                                alc267_quanta_il1_verbs },
                .num_dacs = ARRAY_SIZE(alc268_dac_nids),
@@ -492,8 +492,8 @@ static const struct alc_config_preset alc268_presets[] = {
                .init_hook = alc_inithook,
        },
        [ALC268_3ST] = {
-               .mixers = { alc268_base_mixer, alc268_capture_alt_mixer,
-                           alc268_beep_mixer },
+               .mixers = { alc268_base_mixer, alc268_beep_mixer },
+               .cap_mixer = alc268_capture_alt_mixer,
                .init_verbs = { alc268_base_init_verbs },
                .num_dacs = ARRAY_SIZE(alc268_dac_nids),
                .dac_nids = alc268_dac_nids,
@@ -507,8 +507,8 @@ static const struct alc_config_preset alc268_presets[] = {
                .input_mux = &alc268_capture_source,
        },
        [ALC268_TOSHIBA] = {
-               .mixers = { alc268_toshiba_mixer, alc268_capture_alt_mixer,
-                           alc268_beep_mixer },
+               .mixers = { alc268_toshiba_mixer, alc268_beep_mixer },
+               .cap_mixer = alc268_capture_alt_mixer,
                .init_verbs = { alc268_base_init_verbs, alc268_eapd_verbs,
                                alc268_toshiba_verbs },
                .num_dacs = ARRAY_SIZE(alc268_dac_nids),
@@ -525,8 +525,8 @@ static const struct alc_config_preset alc268_presets[] = {
                .init_hook = alc_inithook,
        },
        [ALC268_ACER] = {
-               .mixers = { alc268_acer_mixer, alc268_capture_alt_mixer,
-                           alc268_beep_mixer },
+               .mixers = { alc268_acer_mixer, alc268_beep_mixer },
+               .cap_mixer = alc268_capture_alt_mixer,
                .init_verbs = { alc268_base_init_verbs, alc268_eapd_verbs,
                                alc268_acer_verbs },
                .num_dacs = ARRAY_SIZE(alc268_dac_nids),
@@ -543,8 +543,8 @@ static const struct alc_config_preset alc268_presets[] = {
                .init_hook = alc_inithook,
        },
        [ALC268_ACER_DMIC] = {
-               .mixers = { alc268_acer_dmic_mixer, alc268_capture_alt_mixer,
-                           alc268_beep_mixer },
+               .mixers = { alc268_acer_dmic_mixer, alc268_beep_mixer },
+               .cap_mixer = alc268_capture_alt_mixer,
                .init_verbs = { alc268_base_init_verbs, alc268_eapd_verbs,
                                alc268_acer_verbs },
                .num_dacs = ARRAY_SIZE(alc268_dac_nids),
@@ -561,9 +561,8 @@ static const struct alc_config_preset alc268_presets[] = {
                .init_hook = alc_inithook,
        },
        [ALC268_ACER_ASPIRE_ONE] = {
-               .mixers = { alc268_acer_aspire_one_mixer,
-                           alc268_beep_mixer,
-                           alc268_capture_nosrc_mixer },
+               .mixers = { alc268_acer_aspire_one_mixer, alc268_beep_mixer},
+               .cap_mixer = alc268_capture_nosrc_mixer,
                .init_verbs = { alc268_base_init_verbs, alc268_eapd_verbs,
                                alc268_acer_aspire_one_verbs },
                .num_dacs = ARRAY_SIZE(alc268_dac_nids),
@@ -579,8 +578,8 @@ static const struct alc_config_preset alc268_presets[] = {
                .init_hook = alc_inithook,
        },
        [ALC268_DELL] = {
-               .mixers = { alc268_dell_mixer, alc268_beep_mixer,
-                           alc268_capture_nosrc_mixer },
+               .mixers = { alc268_dell_mixer, alc268_beep_mixer},
+               .cap_mixer = alc268_capture_nosrc_mixer,
                .init_verbs = { alc268_base_init_verbs, alc268_eapd_verbs,
                                alc268_dell_verbs },
                .num_dacs = ARRAY_SIZE(alc268_dac_nids),
@@ -596,8 +595,8 @@ static const struct alc_config_preset alc268_presets[] = {
                .init_hook = alc_inithook,
        },
        [ALC268_ZEPTO] = {
-               .mixers = { alc268_base_mixer, alc268_capture_alt_mixer,
-                           alc268_beep_mixer },
+               .mixers = { alc268_base_mixer, alc268_beep_mixer },
+               .cap_mixer = alc268_capture_alt_mixer,
                .init_verbs = { alc268_base_init_verbs, alc268_eapd_verbs,
                                alc268_toshiba_verbs },
                .num_dacs = ARRAY_SIZE(alc268_dac_nids),
@@ -616,7 +615,8 @@ static const struct alc_config_preset alc268_presets[] = {
        },
 #ifdef CONFIG_SND_DEBUG
        [ALC268_TEST] = {
-               .mixers = { alc268_test_mixer, alc268_capture_mixer },
+               .mixers = { alc268_test_mixer },
+               .cap_mixer = alc268_capture_mixer,
                .init_verbs = { alc268_base_init_verbs, alc268_eapd_verbs,
                                alc268_volume_init_verbs,
                                alc268_beep_init_verbs },
index 28ce17d..c34f730 100644 (file)
@@ -144,25 +144,17 @@ static int cea_sampling_frequencies[8] = {
        SNDRV_PCM_RATE_192000,  /* 7: 192000Hz */
 };
 
-static unsigned char hdmi_get_eld_byte(struct hda_codec *codec, hda_nid_t nid,
+static unsigned int hdmi_get_eld_data(struct hda_codec *codec, hda_nid_t nid,
                                        int byte_index)
 {
        unsigned int val;
 
        val = snd_hda_codec_read(codec, nid, 0,
                                        AC_VERB_GET_HDMI_ELDD, byte_index);
-
 #ifdef BE_PARANOID
        printk(KERN_INFO "HDMI: ELD data byte %d: 0x%x\n", byte_index, val);
 #endif
-
-       if ((val & AC_ELDD_ELD_VALID) == 0) {
-               snd_printd(KERN_INFO "HDMI: invalid ELD data byte %d\n",
-                                                               byte_index);
-               val = 0;
-       }
-
-       return val & AC_ELDD_ELD_DATA;
+       return val;
 }
 
 #define GRAB_BITS(buf, byte, lowbit, bits)             \
@@ -344,11 +336,26 @@ int snd_hdmi_get_eld(struct hdmi_eld *eld,
        if (!buf)
                return -ENOMEM;
 
-       for (i = 0; i < size; i++)
-               buf[i] = hdmi_get_eld_byte(codec, nid, i);
+       for (i = 0; i < size; i++) {
+               unsigned int val = hdmi_get_eld_data(codec, nid, i);
+               if (!(val & AC_ELDD_ELD_VALID)) {
+                       if (!i) {
+                               snd_printd(KERN_INFO
+                                          "HDMI: invalid ELD data\n");
+                               ret = -EINVAL;
+                               goto error;
+                       }
+                       snd_printd(KERN_INFO
+                                 "HDMI: invalid ELD data byte %d\n", i);
+                       val = 0;
+               } else
+                       val &= AC_ELDD_ELD_DATA;
+               buf[i] = val;
+       }
 
        ret = hdmi_update_eld(eld, buf, size);
 
+error:
        kfree(buf);
        return ret;
 }
index 47d6ffc..d6c93d9 100644 (file)
@@ -375,7 +375,7 @@ static int is_ext_mic(struct hda_codec *codec, unsigned int idx)
 static hda_nid_t get_adc(struct hda_codec *codec, hda_nid_t pin,
                         unsigned int *idxp)
 {
-       int i;
+       int i, idx;
        hda_nid_t nid;
 
        nid = codec->start_nid;
@@ -384,9 +384,11 @@ static hda_nid_t get_adc(struct hda_codec *codec, hda_nid_t pin,
                type = get_wcaps_type(get_wcaps(codec, nid));
                if (type != AC_WID_AUD_IN)
                        continue;
-               *idxp = snd_hda_get_conn_index(codec, nid, pin, false);
-               if (*idxp >= 0)
+               idx = snd_hda_get_conn_index(codec, nid, pin, false);
+               if (idx >= 0) {
+                       *idxp = idx;
                        return nid;
+               }
        }
        return 0;
 }
index 9a1aa09..fcb11af 100644 (file)
@@ -1784,6 +1784,7 @@ static const char * const alc_slave_vols[] = {
        "Speaker Playback Volume",
        "Mono Playback Volume",
        "Line-Out Playback Volume",
+       "PCM Playback Volume",
        NULL,
 };
 
@@ -1798,6 +1799,7 @@ static const char * const alc_slave_sws[] = {
        "Mono Playback Switch",
        "IEC958 Playback Switch",
        "Line-Out Playback Switch",
+       "PCM Playback Switch",
        NULL,
 };
 
index aa376b5..5145b66 100644 (file)
@@ -673,6 +673,7 @@ static int stac92xx_smux_enum_put(struct snd_kcontrol *kcontrol,
        return 0;
 }
 
+#ifdef CONFIG_SND_HDA_POWER_SAVE
 static int stac_vrefout_set(struct hda_codec *codec,
                                        hda_nid_t nid, unsigned int new_vref)
 {
@@ -696,6 +697,7 @@ static int stac_vrefout_set(struct hda_codec *codec,
 
        return 1;
 }
+#endif
 
 static unsigned int stac92xx_vref_set(struct hda_codec *codec,
                                        hda_nid_t nid, unsigned int new_vref)
index 38f38fd..d0003cc 100644 (file)
@@ -778,11 +778,19 @@ static int __devexit wm8750_spi_remove(struct spi_device *spi)
        return 0;
 }
 
+static const struct spi_device_id wm8750_spi_ids[] = {
+       { "wm8750", 0 },
+       { "wm8987", 0 },
+       { },
+};
+MODULE_DEVICE_TABLE(spi, wm8750_spi_ids);
+
 static struct spi_driver wm8750_spi_driver = {
        .driver = {
                .name   = "wm8750-codec",
                .owner  = THIS_MODULE,
        },
+       .id_table       = wm8750_spi_ids,
        .probe          = wm8750_spi_probe,
        .remove         = __devexit_p(wm8750_spi_remove),
 };
index 43e3d76..4ad8ebd 100644 (file)
@@ -2046,8 +2046,13 @@ static int wm8903_probe(struct snd_soc_codec *codec)
 /* power down chip */
 static int wm8903_remove(struct snd_soc_codec *codec)
 {
+       struct wm8903_priv *wm8903 = snd_soc_codec_get_drvdata(codec);
+
        wm8903_free_gpio(codec);
        wm8903_set_bias_level(codec, SND_SOC_BIAS_OFF);
+       if (wm8903->irq)
+               free_irq(wm8903->irq, codec);
+
        return 0;
 }
 
index 09e680a..b393f9f 100644 (file)
@@ -2981,6 +2981,7 @@ static int wm8994_codec_probe(struct snd_soc_codec *codec)
                        wm8994->hubs.dcs_readback_mode = 1;
                        break;
                }
+               break;
 
        case WM8958:
                wm8994->hubs.dcs_readback_mode = 1;
index 83d213b..62e292f 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2008 Nokia Corporation
  *
- * Contact: Jarkko Nikula <jhnikula@gmail.com>
+ * Contact: Jarkko Nikula <jarkko.nikula@bitmer.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
@@ -402,6 +402,6 @@ static void __exit n810_soc_exit(void)
 module_init(n810_soc_init);
 module_exit(n810_soc_exit);
 
-MODULE_AUTHOR("Jarkko Nikula <jhnikula@gmail.com>");
+MODULE_AUTHOR("Jarkko Nikula <jarkko.nikula@bitmer.com>");
 MODULE_DESCRIPTION("ALSA SoC Nokia N810");
 MODULE_LICENSE("GPL");
index 07b7723..ebcc2d4 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2008 Nokia Corporation
  *
- * Contact: Jarkko Nikula <jhnikula@gmail.com>
+ * Contact: Jarkko Nikula <jarkko.nikula@bitmer.com>
  *          Peter Ujfalusi <peter.ujfalusi@ti.com>
  *
  * This program is free software; you can redistribute it and/or
@@ -780,6 +780,6 @@ static void __exit snd_omap_mcbsp_exit(void)
 }
 module_exit(snd_omap_mcbsp_exit);
 
-MODULE_AUTHOR("Jarkko Nikula <jhnikula@gmail.com>");
+MODULE_AUTHOR("Jarkko Nikula <jarkko.nikula@bitmer.com>");
 MODULE_DESCRIPTION("OMAP I2S SoC Interface");
 MODULE_LICENSE("GPL");
index 9a7dedd..65cde9d 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2008 Nokia Corporation
  *
- * Contact: Jarkko Nikula <jhnikula@gmail.com>
+ * Contact: Jarkko Nikula <jarkko.nikula@bitmer.com>
  *          Peter Ujfalusi <peter.ujfalusi@ti.com>
  *
  * This program is free software; you can redistribute it and/or
index b2f5751..9b5c88a 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2008 Nokia Corporation
  *
- * Contact: Jarkko Nikula <jhnikula@gmail.com>
+ * Contact: Jarkko Nikula <jarkko.nikula@bitmer.com>
  *          Peter Ujfalusi <peter.ujfalusi@ti.com>
  *
  * This program is free software; you can redistribute it and/or
@@ -436,6 +436,6 @@ static void __exit snd_omap_pcm_exit(void)
 }
 module_exit(snd_omap_pcm_exit);
 
-MODULE_AUTHOR("Jarkko Nikula <jhnikula@gmail.com>");
+MODULE_AUTHOR("Jarkko Nikula <jarkko.nikula@bitmer.com>");
 MODULE_DESCRIPTION("OMAP PCM DMA module");
 MODULE_LICENSE("GPL");
index a0ed1db..f95fe30 100644 (file)
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2008 Nokia Corporation
  *
- * Contact: Jarkko Nikula <jhnikula@gmail.com>
+ * Contact: Jarkko Nikula <jarkko.nikula@bitmer.com>
  *          Peter Ujfalusi <peter.ujfalusi@ti.com>
  *
  * This program is free software; you can redistribute it and/or
index 0aae998..893300a 100644 (file)
@@ -5,7 +5,7 @@
  *
  * Contact: Peter Ujfalusi <peter.ujfalusi@ti.com>
  *          Eduardo Valentin <eduardo.valentin@nokia.com>
- *          Jarkko Nikula <jhnikula@gmail.com>
+ *          Jarkko Nikula <jarkko.nikula@bitmer.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License
index 9eb3b12..8509d3c 100644 (file)
@@ -1,5 +1,6 @@
 # S3c24XX Platform Support
 snd-soc-s3c24xx-objs := dma.o
+snd-soc-idma-objs := idma.o
 snd-soc-s3c24xx-i2s-objs := s3c24xx-i2s.o
 snd-soc-s3c2412-i2s-objs := s3c2412-i2s.o
 snd-soc-ac97-objs := ac97.o
@@ -16,6 +17,7 @@ obj-$(CONFIG_SND_S3C_I2SV2_SOC) += snd-soc-s3c-i2s-v2.o
 obj-$(CONFIG_SND_SAMSUNG_SPDIF) += snd-soc-samsung-spdif.o
 obj-$(CONFIG_SND_SAMSUNG_PCM) += snd-soc-pcm.o
 obj-$(CONFIG_SND_SAMSUNG_I2S) += snd-soc-i2s.o
+obj-$(CONFIG_SND_SAMSUNG_I2S) += snd-soc-idma.o
 
 # S3C24XX Machine Support
 snd-soc-jive-wm8750-objs := jive_wm8750.o
diff --git a/sound/soc/samsung/idma.c b/sound/soc/samsung/idma.c
new file mode 100644 (file)
index 0000000..ebde074
--- /dev/null
@@ -0,0 +1,453 @@
+/*
+ * sound/soc/samsung/idma.c
+ *
+ * Copyright (c) 2011 Samsung Electronics Co., Ltd.
+ *             http://www.samsung.com
+ *
+ * I2S0's Internal DMA driver
+ *
+ * This program is free software; you can redistribute  it and/or modify it
+ * under  the terms of  the GNU General  Public License as published by the
+ * Free Software Foundation;  either version 2 of the  License, or (at your
+ * option) any later version.
+ */
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <sound/pcm.h>
+#include <sound/pcm_params.h>
+#include <sound/soc.h>
+
+#include "i2s.h"
+#include "idma.h"
+#include "dma.h"
+#include "i2s-regs.h"
+
+#define ST_RUNNING             (1<<0)
+#define ST_OPENED              (1<<1)
+
+static const struct snd_pcm_hardware idma_hardware = {
+       .info = SNDRV_PCM_INFO_INTERLEAVED |
+                   SNDRV_PCM_INFO_BLOCK_TRANSFER |
+                   SNDRV_PCM_INFO_MMAP |
+                   SNDRV_PCM_INFO_MMAP_VALID |
+                   SNDRV_PCM_INFO_PAUSE |
+                   SNDRV_PCM_INFO_RESUME,
+       .formats = SNDRV_PCM_FMTBIT_S16_LE |
+                   SNDRV_PCM_FMTBIT_U16_LE |
+                   SNDRV_PCM_FMTBIT_S24_LE |
+                   SNDRV_PCM_FMTBIT_U24_LE |
+                   SNDRV_PCM_FMTBIT_U8 |
+                   SNDRV_PCM_FMTBIT_S8,
+       .channels_min = 2,
+       .channels_max = 2,
+       .buffer_bytes_max = MAX_IDMA_BUFFER,
+       .period_bytes_min = 128,
+       .period_bytes_max = MAX_IDMA_PERIOD,
+       .periods_min = 1,
+       .periods_max = 2,
+};
+
+struct idma_ctrl {
+       spinlock_t      lock;
+       int             state;
+       dma_addr_t      start;
+       dma_addr_t      pos;
+       dma_addr_t      end;
+       dma_addr_t      period;
+       dma_addr_t      periodsz;
+       void            *token;
+       void            (*cb)(void *dt, int bytes_xfer);
+};
+
+static struct idma_info {
+       spinlock_t      lock;
+       void             __iomem  *regs;
+       dma_addr_t      lp_tx_addr;
+} idma;
+
+static void idma_getpos(dma_addr_t *src)
+{
+       *src = idma.lp_tx_addr +
+               (readl(idma.regs + I2STRNCNT) & 0xffffff) * 4;
+}
+
+static int idma_enqueue(struct snd_pcm_substream *substream)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       struct idma_ctrl *prtd = substream->runtime->private_data;
+       u32 val;
+
+       spin_lock(&prtd->lock);
+       prtd->token = (void *) substream;
+       spin_unlock(&prtd->lock);
+
+       /* Internal DMA Level0 Interrupt Address */
+       val = idma.lp_tx_addr + prtd->periodsz;
+       writel(val, idma.regs + I2SLVL0ADDR);
+
+       /* Start address0 of I2S internal DMA operation. */
+       val = idma.lp_tx_addr;
+       writel(val, idma.regs + I2SSTR0);
+
+       /*
+        * Transfer block size for I2S internal DMA.
+        * Should decide transfer size before start dma operation
+        */
+       val = readl(idma.regs + I2SSIZE);
+       val &= ~(I2SSIZE_TRNMSK << I2SSIZE_SHIFT);
+       val |= (((runtime->dma_bytes >> 2) &
+                       I2SSIZE_TRNMSK) << I2SSIZE_SHIFT);
+       writel(val, idma.regs + I2SSIZE);
+
+       val = readl(idma.regs + I2SAHB);
+       val |= AHB_INTENLVL0;
+       writel(val, idma.regs + I2SAHB);
+
+       return 0;
+}
+
+static void idma_setcallbk(struct snd_pcm_substream *substream,
+                               void (*cb)(void *, int))
+{
+       struct idma_ctrl *prtd = substream->runtime->private_data;
+
+       spin_lock(&prtd->lock);
+       prtd->cb = cb;
+       spin_unlock(&prtd->lock);
+}
+
+static void idma_control(int op)
+{
+       u32 val = readl(idma.regs + I2SAHB);
+
+       spin_lock(&idma.lock);
+
+       switch (op) {
+       case LPAM_DMA_START:
+               val |= (AHB_INTENLVL0 | AHB_DMAEN);
+               break;
+       case LPAM_DMA_STOP:
+               val &= ~(AHB_INTENLVL0 | AHB_DMAEN);
+               break;
+       default:
+               spin_unlock(&idma.lock);
+               return;
+       }
+
+       writel(val, idma.regs + I2SAHB);
+       spin_unlock(&idma.lock);
+}
+
+static void idma_done(void *id, int bytes_xfer)
+{
+       struct snd_pcm_substream *substream = id;
+       struct idma_ctrl *prtd = substream->runtime->private_data;
+
+       if (prtd && (prtd->state & ST_RUNNING))
+               snd_pcm_period_elapsed(substream);
+}
+
+static int idma_hw_params(struct snd_pcm_substream *substream,
+                               struct snd_pcm_hw_params *params)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       struct idma_ctrl *prtd = substream->runtime->private_data;
+       u32 mod = readl(idma.regs + I2SMOD);
+       u32 ahb = readl(idma.regs + I2SAHB);
+
+       ahb |= (AHB_DMARLD | AHB_INTMASK);
+       mod |= MOD_TXS_IDMA;
+       writel(ahb, idma.regs + I2SAHB);
+       writel(mod, idma.regs + I2SMOD);
+
+       snd_pcm_set_runtime_buffer(substream, &substream->dma_buffer);
+       runtime->dma_bytes = params_buffer_bytes(params);
+
+       prtd->start = prtd->pos = runtime->dma_addr;
+       prtd->period = params_periods(params);
+       prtd->periodsz = params_period_bytes(params);
+       prtd->end = runtime->dma_addr + runtime->dma_bytes;
+
+       idma_setcallbk(substream, idma_done);
+
+       return 0;
+}
+
+static int idma_hw_free(struct snd_pcm_substream *substream)
+{
+       snd_pcm_set_runtime_buffer(substream, NULL);
+
+       return 0;
+}
+
+static int idma_prepare(struct snd_pcm_substream *substream)
+{
+       struct idma_ctrl *prtd = substream->runtime->private_data;
+
+       prtd->pos = prtd->start;
+
+       /* flush the DMA channel */
+       idma_control(LPAM_DMA_STOP);
+       idma_enqueue(substream);
+
+       return 0;
+}
+
+static int idma_trigger(struct snd_pcm_substream *substream, int cmd)
+{
+       struct idma_ctrl *prtd = substream->runtime->private_data;
+       int ret = 0;
+
+       spin_lock(&prtd->lock);
+
+       switch (cmd) {
+       case SNDRV_PCM_TRIGGER_RESUME:
+       case SNDRV_PCM_TRIGGER_START:
+       case SNDRV_PCM_TRIGGER_PAUSE_RELEASE:
+               prtd->state |= ST_RUNNING;
+               idma_control(LPAM_DMA_START);
+               break;
+
+       case SNDRV_PCM_TRIGGER_SUSPEND:
+       case SNDRV_PCM_TRIGGER_STOP:
+       case SNDRV_PCM_TRIGGER_PAUSE_PUSH:
+               prtd->state &= ~ST_RUNNING;
+               idma_control(LPAM_DMA_STOP);
+               break;
+
+       default:
+               ret = -EINVAL;
+               break;
+       }
+
+       spin_unlock(&prtd->lock);
+
+       return ret;
+}
+
+static snd_pcm_uframes_t
+       idma_pointer(struct snd_pcm_substream *substream)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       struct idma_ctrl *prtd = runtime->private_data;
+       dma_addr_t src;
+       unsigned long res;
+
+       spin_lock(&prtd->lock);
+
+       idma_getpos(&src);
+       res = src - prtd->start;
+
+       spin_unlock(&prtd->lock);
+
+       return bytes_to_frames(substream->runtime, res);
+}
+
+static int idma_mmap(struct snd_pcm_substream *substream,
+       struct vm_area_struct *vma)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       unsigned long size, offset;
+       int ret;
+
+       /* From snd_pcm_lib_mmap_iomem */
+       vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+       vma->vm_flags |= VM_IO;
+       size = vma->vm_end - vma->vm_start;
+       offset = vma->vm_pgoff << PAGE_SHIFT;
+       ret = io_remap_pfn_range(vma, vma->vm_start,
+                       (runtime->dma_addr + offset) >> PAGE_SHIFT,
+                       size, vma->vm_page_prot);
+
+       return ret;
+}
+
+static irqreturn_t iis_irq(int irqno, void *dev_id)
+{
+       struct idma_ctrl *prtd = (struct idma_ctrl *)dev_id;
+       u32 iiscon, iisahb, val, addr;
+
+       iisahb  = readl(idma.regs + I2SAHB);
+       iiscon  = readl(idma.regs + I2SCON);
+
+       val = (iisahb & AHB_LVL0INT) ? AHB_CLRLVL0INT : 0;
+
+       if (val) {
+               iisahb |= val;
+               writel(iisahb, idma.regs + I2SAHB);
+
+               addr = readl(idma.regs + I2SLVL0ADDR) - idma.lp_tx_addr;
+               addr += prtd->periodsz;
+               addr %= (prtd->end - prtd->start);
+               addr += idma.lp_tx_addr;
+
+               writel(addr, idma.regs + I2SLVL0ADDR);
+
+               if (prtd->cb)
+                       prtd->cb(prtd->token, prtd->period);
+       }
+
+       return IRQ_HANDLED;
+}
+
+static int idma_open(struct snd_pcm_substream *substream)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       struct idma_ctrl *prtd;
+       int ret;
+
+       snd_soc_set_runtime_hwparams(substream, &idma_hardware);
+
+       prtd = kzalloc(sizeof(struct idma_ctrl), GFP_KERNEL);
+       if (prtd == NULL)
+               return -ENOMEM;
+
+       ret = request_irq(IRQ_I2S0, iis_irq, 0, "i2s", prtd);
+       if (ret < 0) {
+               pr_err("fail to claim i2s irq , ret = %d\n", ret);
+               kfree(prtd);
+               return ret;
+       }
+
+       spin_lock_init(&prtd->lock);
+
+       runtime->private_data = prtd;
+
+       return 0;
+}
+
+static int idma_close(struct snd_pcm_substream *substream)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       struct idma_ctrl *prtd = runtime->private_data;
+
+       free_irq(IRQ_I2S0, prtd);
+
+       if (!prtd)
+               pr_err("idma_close called with prtd == NULL\n");
+
+       kfree(prtd);
+
+       return 0;
+}
+
+static struct snd_pcm_ops idma_ops = {
+       .open           = idma_open,
+       .close          = idma_close,
+       .ioctl          = snd_pcm_lib_ioctl,
+       .trigger        = idma_trigger,
+       .pointer        = idma_pointer,
+       .mmap           = idma_mmap,
+       .hw_params      = idma_hw_params,
+       .hw_free        = idma_hw_free,
+       .prepare        = idma_prepare,
+};
+
+static void idma_free(struct snd_pcm *pcm)
+{
+       struct snd_pcm_substream *substream;
+       struct snd_dma_buffer *buf;
+
+       substream = pcm->streams[SNDRV_PCM_STREAM_PLAYBACK].substream;
+       if (!substream)
+               return;
+
+       buf = &substream->dma_buffer;
+       if (!buf->area)
+               return;
+
+       iounmap(buf->area);
+
+       buf->area = NULL;
+       buf->addr = 0;
+}
+
+static int preallocate_idma_buffer(struct snd_pcm *pcm, int stream)
+{
+       struct snd_pcm_substream *substream = pcm->streams[stream].substream;
+       struct snd_dma_buffer *buf = &substream->dma_buffer;
+
+       buf->dev.dev = pcm->card->dev;
+       buf->private_data = NULL;
+
+       /* Assign PCM buffer pointers */
+       buf->dev.type = SNDRV_DMA_TYPE_CONTINUOUS;
+       buf->addr = idma.lp_tx_addr;
+       buf->bytes = idma_hardware.buffer_bytes_max;
+       buf->area = (unsigned char *)ioremap(buf->addr, buf->bytes);
+
+       return 0;
+}
+
+static u64 idma_mask = DMA_BIT_MASK(32);
+
+static int idma_new(struct snd_soc_pcm_runtime *rtd)
+{
+       struct snd_card *card = rtd->card->snd_card;
+       struct snd_soc_dai *dai = rtd->cpu_dai;
+       struct snd_pcm *pcm = rtd->pcm;
+       int ret = 0;
+
+       if (!card->dev->dma_mask)
+               card->dev->dma_mask = &idma_mask;
+       if (!card->dev->coherent_dma_mask)
+               card->dev->coherent_dma_mask = DMA_BIT_MASK(32);
+
+       if (dai->driver->playback.channels_min)
+               ret = preallocate_idma_buffer(pcm,
+                               SNDRV_PCM_STREAM_PLAYBACK);
+
+       return ret;
+}
+
+void idma_reg_addr_init(void *regs, dma_addr_t addr)
+{
+       spin_lock_init(&idma.lock);
+       idma.regs = regs;
+       idma.lp_tx_addr = addr;
+}
+
+struct snd_soc_platform_driver asoc_idma_platform = {
+       .ops = &idma_ops,
+       .pcm_new = idma_new,
+       .pcm_free = idma_free,
+};
+
+static int __devinit asoc_idma_platform_probe(struct platform_device *pdev)
+{
+       return snd_soc_register_platform(&pdev->dev, &asoc_idma_platform);
+}
+
+static int __devexit asoc_idma_platform_remove(struct platform_device *pdev)
+{
+       snd_soc_unregister_platform(&pdev->dev);
+       return 0;
+}
+
+static struct platform_driver asoc_idma_driver = {
+       .driver = {
+               .name = "samsung-idma",
+               .owner = THIS_MODULE,
+       },
+
+       .probe = asoc_idma_platform_probe,
+       .remove = __devexit_p(asoc_idma_platform_remove),
+};
+
+static int __init asoc_idma_init(void)
+{
+       return platform_driver_register(&asoc_idma_driver);
+}
+module_init(asoc_idma_init);
+
+static void __exit asoc_idma_exit(void)
+{
+       platform_driver_unregister(&asoc_idma_driver);
+}
+module_exit(asoc_idma_exit);
+
+MODULE_AUTHOR("Jaswinder Singh, <jassisinghbrar@gmail.com>");
+MODULE_DESCRIPTION("Samsung ASoC IDMA Driver");
+MODULE_LICENSE("GPL");
diff --git a/sound/soc/samsung/idma.h b/sound/soc/samsung/idma.h
new file mode 100644 (file)
index 0000000..4827321
--- /dev/null
@@ -0,0 +1,26 @@
+/*
+ * sound/soc/samsung/idma.h
+ *
+ * Copyright (c) 2011 Samsung Electronics Co., Ltd
+ *             http://www.samsung.com
+ *
+ *  This program is free software; you can redistribute  it and/or modify it
+ *  under  the terms of  the GNU General  Public License as published by the
+ *  Free Software Foundation;  either version 2 of the  License, or (at your
+ *  option) any later version.
+ *
+ */
+
+#ifndef __SND_SOC_SAMSUNG_IDMA_H_
+#define __SND_SOC_SAMSUNG_IDMA_H_
+
+extern void idma_reg_addr_init(void *regs, dma_addr_t addr);
+
+/* dma_state */
+#define LPAM_DMA_STOP  0
+#define LPAM_DMA_START 1
+
+#define MAX_IDMA_PERIOD (128 * 1024)
+#define MAX_IDMA_BUFFER (160 * 1024)
+
+#endif /* __SND_SOC_SAMSUNG_IDMA_H_ */
index 3b53ad5..14eb6ea 100644 (file)
@@ -131,7 +131,7 @@ static struct snd_soc_dai_link jive_dai = {
        .cpu_dai_name   = "s3c2412-i2s",
        .codec_dai_name = "wm8750-hifi",
        .platform_name  = "samsung-audio",
-       .codec_name     = "wm8750-codec.0-0x1a",
+       .codec_name     = "wm8750-codec.0-001a",
        .init           = jive_wm8750_init,
        .ops            = &jive_ops,
 };
index 8ac42bf..0b9eb5f 100644 (file)
@@ -37,7 +37,7 @@ static int speyside_wm8962_set_bias_level(struct snd_soc_card *card,
                                                     44100 * 256,
                                                     SND_SOC_CLOCK_IN);
                        if (ret < 0) {
-                               pr_err("Failed to set SYSCLK: %d\n");
+                               pr_err("Failed to set SYSCLK: %d\n", ret);
                                return ret;
                        }
                }
index ff86e5e..c7cfd96 100644 (file)
@@ -309,9 +309,14 @@ static int tegra_pcm_preallocate_dma_buffer(struct snd_pcm *pcm, int stream)
 
 static void tegra_pcm_deallocate_dma_buffer(struct snd_pcm *pcm, int stream)
 {
-       struct snd_pcm_substream *substream = pcm->streams[stream].substream;
-       struct snd_dma_buffer *buf = &substream->dma_buffer;
+       struct snd_pcm_substream *substream;
+       struct snd_dma_buffer *buf;
+
+       substream = pcm->streams[stream].substream;
+       if (!substream)
+               return;
 
+       buf = &substream->dma_buffer;
        if (!buf->area)
                return;
 
index a42e9ac..661373c 100644 (file)
@@ -56,6 +56,7 @@
 #define GPIO_HP_MUTE    BIT(1)
 #define GPIO_INT_MIC_EN BIT(2)
 #define GPIO_EXT_MIC_EN BIT(3)
+#define GPIO_HP_DET     BIT(4)
 
 struct tegra_wm8903 {
        struct tegra_asoc_utils_data util_data;
@@ -304,6 +305,7 @@ static int tegra_wm8903_init(struct snd_soc_pcm_runtime *rtd)
                snd_soc_jack_add_gpios(&tegra_wm8903_hp_jack,
                                        1,
                                        &tegra_wm8903_hp_jack_gpio);
+               machine->gpio_requested |= GPIO_HP_DET;
        }
 
        snd_soc_jack_new(codec, "Mic Jack", SND_JACK_MICROPHONE,
@@ -429,10 +431,10 @@ static int __devexit tegra_wm8903_driver_remove(struct platform_device *pdev)
        struct tegra_wm8903 *machine = snd_soc_card_get_drvdata(card);
        struct tegra_wm8903_platform_data *pdata = machine->pdata;
 
-       snd_soc_unregister_card(card);
-
-       tegra_asoc_utils_fini(&machine->util_data);
-
+       if (machine->gpio_requested & GPIO_HP_DET)
+               snd_soc_jack_free_gpios(&tegra_wm8903_hp_jack,
+                                       1,
+                                       &tegra_wm8903_hp_jack_gpio);
        if (machine->gpio_requested & GPIO_EXT_MIC_EN)
                gpio_free(pdata->gpio_ext_mic_en);
        if (machine->gpio_requested & GPIO_INT_MIC_EN)
@@ -441,6 +443,11 @@ static int __devexit tegra_wm8903_driver_remove(struct platform_device *pdev)
                gpio_free(pdata->gpio_hp_mute);
        if (machine->gpio_requested & GPIO_SPKR_EN)
                gpio_free(pdata->gpio_spkr_en);
+       machine->gpio_requested = 0;
+
+       snd_soc_unregister_card(card);
+
+       tegra_asoc_utils_fini(&machine->util_data);
 
        kfree(machine);
 
index d0d493c..2cf87f5 100644 (file)
@@ -139,8 +139,12 @@ static void stream_stop(struct snd_usb_caiaqdev *dev)
 
        for (i = 0; i < N_URBS; i++) {
                usb_kill_urb(dev->data_urbs_in[i]);
-               usb_kill_urb(dev->data_urbs_out[i]);
+
+               if (test_bit(i, &dev->outurb_active_mask))
+                       usb_kill_urb(dev->data_urbs_out[i]);
        }
+
+       dev->outurb_active_mask = 0;
 }
 
 static int snd_usb_caiaq_substream_open(struct snd_pcm_substream *substream)
@@ -612,8 +616,9 @@ static void read_completed(struct urb *urb)
 {
        struct snd_usb_caiaq_cb_info *info = urb->context;
        struct snd_usb_caiaqdev *dev;
-       struct urb *out;
-       int frame, len, send_it = 0, outframe = 0;
+       struct urb *out = NULL;
+       int i, frame, len, send_it = 0, outframe = 0;
+       size_t offset = 0;
 
        if (urb->status || !info)
                return;
@@ -623,7 +628,17 @@ static void read_completed(struct urb *urb)
        if (!dev->streaming)
                return;
 
-       out = dev->data_urbs_out[info->index];
+       /* find an unused output urb that is unused */
+       for (i = 0; i < N_URBS; i++)
+               if (test_and_set_bit(i, &dev->outurb_active_mask) == 0) {
+                       out = dev->data_urbs_out[i];
+                       break;
+               }
+
+       if (!out) {
+               log("Unable to find an output urb to use\n");
+               goto requeue;
+       }
 
        /* read the recently received packet and send back one which has
         * the same layout */
@@ -634,7 +649,8 @@ static void read_completed(struct urb *urb)
                len = urb->iso_frame_desc[outframe].actual_length;
                out->iso_frame_desc[outframe].length = len;
                out->iso_frame_desc[outframe].actual_length = 0;
-               out->iso_frame_desc[outframe].offset = BYTES_PER_FRAME * frame;
+               out->iso_frame_desc[outframe].offset = offset;
+               offset += len;
 
                if (len > 0) {
                        spin_lock(&dev->spinlock);
@@ -650,11 +666,15 @@ static void read_completed(struct urb *urb)
        }
 
        if (send_it) {
-               out->number_of_packets = FRAMES_PER_URB;
+               out->number_of_packets = outframe;
                out->transfer_flags = URB_ISO_ASAP;
                usb_submit_urb(out, GFP_ATOMIC);
+       } else {
+               struct snd_usb_caiaq_cb_info *oinfo = out->context;
+               clear_bit(oinfo->index, &dev->outurb_active_mask);
        }
 
+requeue:
        /* re-submit inbound urb */
        for (frame = 0; frame < FRAMES_PER_URB; frame++) {
                urb->iso_frame_desc[frame].offset = BYTES_PER_FRAME * frame;
@@ -676,6 +696,8 @@ static void write_completed(struct urb *urb)
                dev->output_running = 1;
                wake_up(&dev->prepare_wait_queue);
        }
+
+       clear_bit(info->index, &dev->outurb_active_mask);
 }
 
 static struct urb **alloc_urbs(struct snd_usb_caiaqdev *dev, int dir, int *ret)
@@ -827,6 +849,9 @@ int snd_usb_caiaq_audio_init(struct snd_usb_caiaqdev *dev)
        if (!dev->data_cb_info)
                return -ENOMEM;
 
+       dev->outurb_active_mask = 0;
+       BUILD_BUG_ON(N_URBS > (sizeof(dev->outurb_active_mask) * 8));
+
        for (i = 0; i < N_URBS; i++) {
                dev->data_cb_info[i].dev = dev;
                dev->data_cb_info[i].index = i;
index b2b3101..3f9c633 100644 (file)
@@ -96,6 +96,7 @@ struct snd_usb_caiaqdev {
        int input_panic, output_panic, warned;
        char *audio_in_buf, *audio_out_buf;
        unsigned int samplerates, bpp;
+       unsigned long outurb_active_mask;
 
        struct snd_pcm_substream *sub_playback[MAX_STREAMS];
        struct snd_pcm_substream *sub_capture[MAX_STREAMS];
index c04d7c7..cdd19d7 100644 (file)
@@ -152,6 +152,7 @@ static inline void check_mapped_dB(const struct usbmix_name_map *p,
        if (p && p->dB) {
                cval->dBmin = p->dB->min;
                cval->dBmax = p->dB->max;
+               cval->initialized = 1;
        }
 }
 
@@ -1092,7 +1093,7 @@ static void build_feature_ctl(struct mixer_build *state, void *raw_desc,
                                " Switch" : " Volume");
                if (control == UAC_FU_VOLUME) {
                        check_mapped_dB(map, cval);
-                       if (cval->dBmin < cval->dBmax) {
+                       if (cval->dBmin < cval->dBmax || !cval->initialized) {
                                kctl->tlv.c = mixer_vol_tlv;
                                kctl->vd[0].access |= 
                                        SNDRV_CTL_ELEM_ACCESS_TLV_READ |
index 4d4f865..a42e3ef 100644 (file)
@@ -1707,6 +1707,40 @@ YAMAHA_DEVICE(0x7010, "UB99"),
                }
        }
 },
+{
+       USB_DEVICE(0x0582, 0x0130),
+       .driver_info = (unsigned long) & (const struct snd_usb_audio_quirk) {
+               /* .vendor_name = "BOSS", */
+               /* .product_name = "MICRO BR-80", */
+               .ifnum = QUIRK_ANY_INTERFACE,
+               .type = QUIRK_COMPOSITE,
+               .data = (const struct snd_usb_audio_quirk[]) {
+                       {
+                               .ifnum = 0,
+                               .type = QUIRK_IGNORE_INTERFACE
+                       },
+                       {
+                               .ifnum = 1,
+                               .type = QUIRK_AUDIO_STANDARD_INTERFACE
+                       },
+                       {
+                               .ifnum = 2,
+                               .type = QUIRK_AUDIO_STANDARD_INTERFACE
+                       },
+                       {
+                               .ifnum = 3,
+                               .type = QUIRK_MIDI_FIXED_ENDPOINT,
+                               .data = & (const struct snd_usb_midi_endpoint_info) {
+                                       .out_cables = 0x0001,
+                                       .in_cables  = 0x0001
+                               }
+                       },
+                       {
+                               .ifnum = -1
+                       }
+               }
+       }
+},
 
 /* Guillemot devices */
 {
index 5f2a5c7..710ae3d 100644 (file)
@@ -134,10 +134,18 @@ static int opt_show_lines(const struct option *opt __used,
 {
        int ret = 0;
 
-       if (str)
-               ret = parse_line_range_desc(str, &params.line_range);
-       INIT_LIST_HEAD(&params.line_range.line_list);
+       if (!str)
+               return 0;
+
+       if (params.show_lines) {
+               pr_warning("Warning: more than one --line options are"
+                          " detected. Only the first one is valid.\n");
+               return 0;
+       }
+
        params.show_lines = true;
+       ret = parse_line_range_desc(str, &params.line_range);
+       INIT_LIST_HEAD(&params.line_range.line_list);
 
        return ret;
 }
index f6426b4..6b0519f 100644 (file)
@@ -45,7 +45,7 @@ static int                    freq                            =   1000;
 static int                     output;
 static int                     pipe_output                     =      0;
 static const char              *output_name                    = NULL;
-static int                     group                           =      0;
+static bool                    group                           =  false;
 static int                     realtime_prio                   =      0;
 static bool                    nodelay                         =  false;
 static bool                    raw_samples                     =  false;
@@ -753,6 +753,8 @@ const struct option record_options[] = {
                    "child tasks do not inherit counters"),
        OPT_UINTEGER('F', "freq", &user_freq, "profile at this frequency"),
        OPT_UINTEGER('m', "mmap-pages", &mmap_pages, "number of mmap data pages"),
+       OPT_BOOLEAN(0, "group", &group,
+                   "put the counters into a counter group"),
        OPT_BOOLEAN('g', "call-graph", &call_graph,
                    "do call-graph (stack chain/backtrace) recording"),
        OPT_INCR('v', "verbose", &verbose,
index 1ad04ce..5deb17d 100644 (file)
@@ -193,6 +193,7 @@ static int                  big_num_opt                     =  -1;
 static const char              *cpu_list;
 static const char              *csv_sep                        = NULL;
 static bool                    csv_output                      = false;
+static bool                    group                           = false;
 
 static volatile int done = 0;
 
@@ -280,14 +281,14 @@ static int create_perf_stat_counter(struct perf_evsel *evsel)
        attr->inherit = !no_inherit;
 
        if (system_wide)
-               return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, false);
+               return perf_evsel__open_per_cpu(evsel, evsel_list->cpus, group);
 
        if (target_pid == -1 && target_tid == -1) {
                attr->disabled = 1;
                attr->enable_on_exec = 1;
        }
 
-       return perf_evsel__open_per_thread(evsel, evsel_list->threads, false);
+       return perf_evsel__open_per_thread(evsel, evsel_list->threads, group);
 }
 
 /*
@@ -1043,6 +1044,8 @@ static const struct option options[] = {
                    "stat events on existing thread id"),
        OPT_BOOLEAN('a', "all-cpus", &system_wide,
                    "system-wide collection from all CPUs"),
+       OPT_BOOLEAN('g', "group", &group,
+                   "put the counters into a counter group"),
        OPT_BOOLEAN('c', "scale", &scale,
                    "scale/normalize counters"),
        OPT_INCR('v', "verbose", &verbose,
index fddf40f..ee51e9b 100644 (file)
@@ -96,6 +96,39 @@ int cu_find_lineinfo(Dwarf_Die *cu_die, unsigned long addr,
        return *lineno ?: -ENOENT;
 }
 
+static int __die_find_inline_cb(Dwarf_Die *die_mem, void *data);
+
+/**
+ * cu_walk_functions_at - Walk on function DIEs at given address
+ * @cu_die: A CU DIE
+ * @addr: An address
+ * @callback: A callback which called with found DIEs
+ * @data: A user data
+ *
+ * Walk on function DIEs at given @addr in @cu_die. Passed DIEs
+ * should be subprogram or inlined-subroutines.
+ */
+int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
+                   int (*callback)(Dwarf_Die *, void *), void *data)
+{
+       Dwarf_Die die_mem;
+       Dwarf_Die *sc_die;
+       int ret = -ENOENT;
+
+       /* Inlined function could be recursive. Trace it until fail */
+       for (sc_die = die_find_realfunc(cu_die, addr, &die_mem);
+            sc_die != NULL;
+            sc_die = die_find_child(sc_die, __die_find_inline_cb, &addr,
+                                    &die_mem)) {
+               ret = callback(sc_die, data);
+               if (ret)
+                       break;
+       }
+
+       return ret;
+
+}
+
 /**
  * die_compare_name - Compare diename and tname
  * @dw_die: a DIE
@@ -198,6 +231,19 @@ static int die_get_attr_udata(Dwarf_Die *tp_die, unsigned int attr_name,
        return 0;
 }
 
+/* Get attribute and translate it as a sdata */
+static int die_get_attr_sdata(Dwarf_Die *tp_die, unsigned int attr_name,
+                             Dwarf_Sword *result)
+{
+       Dwarf_Attribute attr;
+
+       if (dwarf_attr(tp_die, attr_name, &attr) == NULL ||
+           dwarf_formsdata(&attr, result) != 0)
+               return -ENOENT;
+
+       return 0;
+}
+
 /**
  * die_is_signed_type - Check whether a type DIE is signed or not
  * @tp_die: a DIE of a type
@@ -250,6 +296,50 @@ int die_get_data_member_location(Dwarf_Die *mb_die, Dwarf_Word *offs)
        return 0;
 }
 
+/* Get the call file index number in CU DIE */
+static int die_get_call_fileno(Dwarf_Die *in_die)
+{
+       Dwarf_Sword idx;
+
+       if (die_get_attr_sdata(in_die, DW_AT_call_file, &idx) == 0)
+               return (int)idx;
+       else
+               return -ENOENT;
+}
+
+/* Get the declared file index number in CU DIE */
+static int die_get_decl_fileno(Dwarf_Die *pdie)
+{
+       Dwarf_Sword idx;
+
+       if (die_get_attr_sdata(pdie, DW_AT_decl_file, &idx) == 0)
+               return (int)idx;
+       else
+               return -ENOENT;
+}
+
+/**
+ * die_get_call_file - Get callsite file name of inlined function instance
+ * @in_die: a DIE of an inlined function instance
+ *
+ * Get call-site file name of @in_die. This means from which file the inline
+ * function is called.
+ */
+const char *die_get_call_file(Dwarf_Die *in_die)
+{
+       Dwarf_Die cu_die;
+       Dwarf_Files *files;
+       int idx;
+
+       idx = die_get_call_fileno(in_die);
+       if (idx < 0 || !dwarf_diecu(in_die, &cu_die, NULL, NULL) ||
+           dwarf_getsrcfiles(&cu_die, &files, NULL) != 0)
+               return NULL;
+
+       return dwarf_filesrc(files, idx, NULL, NULL);
+}
+
+
 /**
  * die_find_child - Generic DIE search function in DIE tree
  * @rt_die: a root DIE
@@ -374,9 +464,78 @@ Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
        return die_mem;
 }
 
+struct __instance_walk_param {
+       void    *addr;
+       int     (*callback)(Dwarf_Die *, void *);
+       void    *data;
+       int     retval;
+};
+
+static int __die_walk_instances_cb(Dwarf_Die *inst, void *data)
+{
+       struct __instance_walk_param *iwp = data;
+       Dwarf_Attribute attr_mem;
+       Dwarf_Die origin_mem;
+       Dwarf_Attribute *attr;
+       Dwarf_Die *origin;
+       int tmp;
+
+       attr = dwarf_attr(inst, DW_AT_abstract_origin, &attr_mem);
+       if (attr == NULL)
+               return DIE_FIND_CB_CONTINUE;
+
+       origin = dwarf_formref_die(attr, &origin_mem);
+       if (origin == NULL || origin->addr != iwp->addr)
+               return DIE_FIND_CB_CONTINUE;
+
+       /* Ignore redundant instances */
+       if (dwarf_tag(inst) == DW_TAG_inlined_subroutine) {
+               dwarf_decl_line(origin, &tmp);
+               if (die_get_call_lineno(inst) == tmp) {
+                       tmp = die_get_decl_fileno(origin);
+                       if (die_get_call_fileno(inst) == tmp)
+                               return DIE_FIND_CB_CONTINUE;
+               }
+       }
+
+       iwp->retval = iwp->callback(inst, iwp->data);
+
+       return (iwp->retval) ? DIE_FIND_CB_END : DIE_FIND_CB_CONTINUE;
+}
+
+/**
+ * die_walk_instances - Walk on instances of given DIE
+ * @or_die: an abstract original DIE
+ * @callback: a callback function which is called with instance DIE
+ * @data: user data
+ *
+ * Walk on the instances of give @in_die. @in_die must be an inlined function
+ * declartion. This returns the return value of @callback if it returns
+ * non-zero value, or -ENOENT if there is no instance.
+ */
+int die_walk_instances(Dwarf_Die *or_die, int (*callback)(Dwarf_Die *, void *),
+                      void *data)
+{
+       Dwarf_Die cu_die;
+       Dwarf_Die die_mem;
+       struct __instance_walk_param iwp = {
+               .addr = or_die->addr,
+               .callback = callback,
+               .data = data,
+               .retval = -ENOENT,
+       };
+
+       if (dwarf_diecu(or_die, &cu_die, NULL, NULL) == NULL)
+               return -ENOENT;
+
+       die_find_child(&cu_die, __die_walk_instances_cb, &iwp, &die_mem);
+
+       return iwp.retval;
+}
+
 /* Line walker internal parameters */
 struct __line_walk_param {
-       const char *fname;
+       bool recursive;
        line_walk_callback_t callback;
        void *data;
        int retval;
@@ -385,39 +544,56 @@ struct __line_walk_param {
 static int __die_walk_funclines_cb(Dwarf_Die *in_die, void *data)
 {
        struct __line_walk_param *lw = data;
-       Dwarf_Addr addr;
+       Dwarf_Addr addr = 0;
+       const char *fname;
        int lineno;
 
        if (dwarf_tag(in_die) == DW_TAG_inlined_subroutine) {
+               fname = die_get_call_file(in_die);
                lineno = die_get_call_lineno(in_die);
-               if (lineno > 0 && dwarf_entrypc(in_die, &addr) == 0) {
-                       lw->retval = lw->callback(lw->fname, lineno, addr,
-                                                 lw->data);
+               if (fname && lineno > 0 && dwarf_entrypc(in_die, &addr) == 0) {
+                       lw->retval = lw->callback(fname, lineno, addr, lw->data);
                        if (lw->retval != 0)
                                return DIE_FIND_CB_END;
                }
        }
-       return DIE_FIND_CB_SIBLING;
+       if (!lw->recursive)
+               /* Don't need to search recursively */
+               return DIE_FIND_CB_SIBLING;
+
+       if (addr) {
+               fname = dwarf_decl_file(in_die);
+               if (fname && dwarf_decl_line(in_die, &lineno) == 0) {
+                       lw->retval = lw->callback(fname, lineno, addr, lw->data);
+                       if (lw->retval != 0)
+                               return DIE_FIND_CB_END;
+               }
+       }
+
+       /* Continue to search nested inlined function call-sites */
+       return DIE_FIND_CB_CONTINUE;
 }
 
 /* Walk on lines of blocks included in given DIE */
-static int __die_walk_funclines(Dwarf_Die *sp_die,
+static int __die_walk_funclines(Dwarf_Die *sp_die, bool recursive,
                                line_walk_callback_t callback, void *data)
 {
        struct __line_walk_param lw = {
+               .recursive = recursive,
                .callback = callback,
                .data = data,
                .retval = 0,
        };
        Dwarf_Die die_mem;
        Dwarf_Addr addr;
+       const char *fname;
        int lineno;
 
        /* Handle function declaration line */
-       lw.fname = dwarf_decl_file(sp_die);
-       if (lw.fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
+       fname = dwarf_decl_file(sp_die);
+       if (fname && dwarf_decl_line(sp_die, &lineno) == 0 &&
            dwarf_entrypc(sp_die, &addr) == 0) {
-               lw.retval = callback(lw.fname, lineno, addr, data);
+               lw.retval = callback(fname, lineno, addr, data);
                if (lw.retval != 0)
                        goto done;
        }
@@ -430,7 +606,7 @@ static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data)
 {
        struct __line_walk_param *lw = data;
 
-       lw->retval = __die_walk_funclines(sp_die, lw->callback, lw->data);
+       lw->retval = __die_walk_funclines(sp_die, true, lw->callback, lw->data);
        if (lw->retval != 0)
                return DWARF_CB_ABORT;
 
@@ -439,7 +615,7 @@ static int __die_walk_culines_cb(Dwarf_Die *sp_die, void *data)
 
 /**
  * die_walk_lines - Walk on lines inside given DIE
- * @rt_die: a root DIE (CU or subprogram)
+ * @rt_die: a root DIE (CU, subprogram or inlined_subroutine)
  * @callback: callback routine
  * @data: user data
  *
@@ -460,12 +636,12 @@ int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, void *data)
        size_t nlines, i;
 
        /* Get the CU die */
-       if (dwarf_tag(rt_die) == DW_TAG_subprogram)
+       if (dwarf_tag(rt_die) != DW_TAG_compile_unit)
                cu_die = dwarf_diecu(rt_die, &die_mem, NULL, NULL);
        else
                cu_die = rt_die;
        if (!cu_die) {
-               pr_debug2("Failed to get CU from subprogram\n");
+               pr_debug2("Failed to get CU from given DIE.\n");
                return -EINVAL;
        }
 
@@ -509,7 +685,11 @@ int die_walk_lines(Dwarf_Die *rt_die, line_walk_callback_t callback, void *data)
         * subroutines. We have to check functions list or given function.
         */
        if (rt_die != cu_die)
-               ret = __die_walk_funclines(rt_die, callback, data);
+               /*
+                * Don't need walk functions recursively, because nested
+                * inlined functions don't have lines of the specified DIE.
+                */
+               ret = __die_walk_funclines(rt_die, false, callback, data);
        else {
                struct __line_walk_param param = {
                        .callback = callback,
index bc3b211..6ce1717 100644 (file)
@@ -34,12 +34,19 @@ extern const char *cu_get_comp_dir(Dwarf_Die *cu_die);
 extern int cu_find_lineinfo(Dwarf_Die *cudie, unsigned long addr,
                            const char **fname, int *lineno);
 
+/* Walk on funcitons at given address */
+extern int cu_walk_functions_at(Dwarf_Die *cu_die, Dwarf_Addr addr,
+                       int (*callback)(Dwarf_Die *, void *), void *data);
+
 /* Compare diename and tname */
 extern bool die_compare_name(Dwarf_Die *dw_die, const char *tname);
 
 /* Get callsite line number of inline-function instance */
 extern int die_get_call_lineno(Dwarf_Die *in_die);
 
+/* Get callsite file name of inlined function instance */
+extern const char *die_get_call_file(Dwarf_Die *in_die);
+
 /* Get type die */
 extern Dwarf_Die *die_get_type(Dwarf_Die *vr_die, Dwarf_Die *die_mem);
 
@@ -73,6 +80,10 @@ extern Dwarf_Die *die_find_realfunc(Dwarf_Die *cu_die, Dwarf_Addr addr,
 extern Dwarf_Die *die_find_inlinefunc(Dwarf_Die *sp_die, Dwarf_Addr addr,
                                      Dwarf_Die *die_mem);
 
+/* Walk on the instances of given DIE */
+extern int die_walk_instances(Dwarf_Die *in_die,
+                             int (*callback)(Dwarf_Die *, void *), void *data);
+
 /* Walker on lines (Note: line number will not be sorted) */
 typedef int (* line_walk_callback_t) (const char *fname, int lineno,
                                      Dwarf_Addr addr, void *data);
index e03e7bc..c12bd47 100644 (file)
@@ -85,10 +85,19 @@ int perf_evlist__add_default(struct perf_evlist *evlist)
        struct perf_evsel *evsel = perf_evsel__new(&attr, 0);
 
        if (evsel == NULL)
-               return -ENOMEM;
+               goto error;
+
+       /* use strdup() because free(evsel) assumes name is allocated */
+       evsel->name = strdup("cycles");
+       if (!evsel->name)
+               goto error_free;
 
        perf_evlist__add(evlist, evsel);
        return 0;
+error_free:
+       perf_evsel__delete(evsel);
+error:
+       return -ENOMEM;
 }
 
 void perf_evlist__disable(struct perf_evlist *evlist)
index d4f3101..b6c1ad1 100644 (file)
@@ -726,7 +726,16 @@ static int perf_header__read_build_ids_abi_quirk(struct perf_header *header,
                        return -1;
 
                bev.header = old_bev.header;
-               bev.pid    = 0;
+
+               /*
+                * As the pid is the missing value, we need to fill
+                * it properly. The header.misc value give us nice hint.
+                */
+               bev.pid = HOST_KERNEL_ID;
+               if (bev.header.misc == PERF_RECORD_MISC_GUEST_USER ||
+                   bev.header.misc == PERF_RECORD_MISC_GUEST_KERNEL)
+                       bev.pid = DEFAULT_GUEST_KERNEL_ID;
+
                memcpy(bev.build_id, old_bev.build_id, sizeof(bev.build_id));
                __event_process_build_id(&bev, filename, session);
 
index 791f9dd..547628e 100644 (file)
@@ -5,7 +5,9 @@
 #define __always_inline        inline
 #endif
 #define __user
+#ifndef __attribute_const__
 #define __attribute_const__
+#endif
 
 #define __used         __attribute__((__unused__))
 
index 4ea7e19..928918b 100644 (file)
@@ -697,7 +697,11 @@ parse_raw_event(const char **strp, struct perf_event_attr *attr)
                return EVT_FAILED;
        n = hex2u64(str + 1, &config);
        if (n > 0) {
-               *strp = str + n + 1;
+               const char *end = str + n + 1;
+               if (*end != '\0' && *end != ',' && *end != ':')
+                       return EVT_FAILED;
+
+               *strp = end;
                attr->type = PERF_TYPE_RAW;
                attr->config = config;
                return EVT_HANDLED;
@@ -1097,6 +1101,4 @@ void print_events(const char *event_glob)
        printf("\n");
 
        print_tracepoint_events(NULL, NULL);
-
-       exit(129);
 }
index 3e44a3e..555fc38 100644 (file)
@@ -612,12 +612,12 @@ static int convert_variable(Dwarf_Die *vr_die, struct probe_finder *pf)
        return ret;
 }
 
-/* Find a variable in a subprogram die */
-static int find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
+/* Find a variable in a scope DIE */
+static int find_variable(Dwarf_Die *sc_die, struct probe_finder *pf)
 {
-       Dwarf_Die vr_die, *scopes;
+       Dwarf_Die vr_die;
        char buf[32], *ptr;
-       int ret, nscopes;
+       int ret = 0;
 
        if (!is_c_varname(pf->pvar->var)) {
                /* Copy raw parameters */
@@ -652,30 +652,16 @@ static int find_variable(Dwarf_Die *sp_die, struct probe_finder *pf)
        if (pf->tvar->name == NULL)
                return -ENOMEM;
 
-       pr_debug("Searching '%s' variable in context.\n",
-                pf->pvar->var);
+       pr_debug("Searching '%s' variable in context.\n", pf->pvar->var);
        /* Search child die for local variables and parameters. */
-       if (die_find_variable_at(sp_die, pf->pvar->var, pf->addr, &vr_die))
-               ret = convert_variable(&vr_die, pf);
-       else {
-               /* Search upper class */
-               nscopes = dwarf_getscopes_die(sp_die, &scopes);
-               while (nscopes-- > 1) {
-                       pr_debug("Searching variables in %s\n",
-                                dwarf_diename(&scopes[nscopes]));
-                       /* We should check this scope, so give dummy address */
-                       if (die_find_variable_at(&scopes[nscopes],
-                                                pf->pvar->var, 0,
-                                                &vr_die)) {
-                               ret = convert_variable(&vr_die, pf);
-                               goto found;
-                       }
-               }
-               if (scopes)
-                       free(scopes);
-               ret = -ENOENT;
+       if (!die_find_variable_at(sc_die, pf->pvar->var, pf->addr, &vr_die)) {
+               /* Search again in global variables */
+               if (!die_find_variable_at(&pf->cu_die, pf->pvar->var, 0, &vr_die))
+                       ret = -ENOENT;
        }
-found:
+       if (ret == 0)
+               ret = convert_variable(&vr_die, pf);
+
        if (ret < 0)
                pr_warning("Failed to find '%s' in this function.\n",
                           pf->pvar->var);
@@ -718,26 +704,30 @@ static int convert_to_trace_point(Dwarf_Die *sp_die, Dwarf_Addr paddr,
        return 0;
 }
 
-/* Call probe_finder callback with real subprogram DIE */
-static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf)
+/* Call probe_finder callback with scope DIE */
+static int call_probe_finder(Dwarf_Die *sc_die, struct probe_finder *pf)
 {
-       Dwarf_Die die_mem;
        Dwarf_Attribute fb_attr;
        size_t nops;
        int ret;
 
-       /* If no real subprogram, find a real one */
-       if (!sp_die || dwarf_tag(sp_die) != DW_TAG_subprogram) {
-               sp_die = die_find_realfunc(&pf->cu_die, pf->addr, &die_mem);
-               if (!sp_die) {
+       if (!sc_die) {
+               pr_err("Caller must pass a scope DIE. Program error.\n");
+               return -EINVAL;
+       }
+
+       /* If not a real subprogram, find a real one */
+       if (dwarf_tag(sc_die) != DW_TAG_subprogram) {
+               if (!die_find_realfunc(&pf->cu_die, pf->addr, &pf->sp_die)) {
                        pr_warning("Failed to find probe point in any "
                                   "functions.\n");
                        return -ENOENT;
                }
-       }
+       } else
+               memcpy(&pf->sp_die, sc_die, sizeof(Dwarf_Die));
 
-       /* Get the frame base attribute/ops */
-       dwarf_attr(sp_die, DW_AT_frame_base, &fb_attr);
+       /* Get the frame base attribute/ops from subprogram */
+       dwarf_attr(&pf->sp_die, DW_AT_frame_base, &fb_attr);
        ret = dwarf_getlocation_addr(&fb_attr, pf->addr, &pf->fb_ops, &nops, 1);
        if (ret <= 0 || nops == 0) {
                pf->fb_ops = NULL;
@@ -755,7 +745,7 @@ static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf)
        }
 
        /* Call finder's callback handler */
-       ret = pf->callback(sp_die, pf);
+       ret = pf->callback(sc_die, pf);
 
        /* *pf->fb_ops will be cached in libdw. Don't free it. */
        pf->fb_ops = NULL;
@@ -763,17 +753,82 @@ static int call_probe_finder(Dwarf_Die *sp_die, struct probe_finder *pf)
        return ret;
 }
 
+struct find_scope_param {
+       const char *function;
+       const char *file;
+       int line;
+       int diff;
+       Dwarf_Die *die_mem;
+       bool found;
+};
+
+static int find_best_scope_cb(Dwarf_Die *fn_die, void *data)
+{
+       struct find_scope_param *fsp = data;
+       const char *file;
+       int lno;
+
+       /* Skip if declared file name does not match */
+       if (fsp->file) {
+               file = dwarf_decl_file(fn_die);
+               if (!file || strcmp(fsp->file, file) != 0)
+                       return 0;
+       }
+       /* If the function name is given, that's what user expects */
+       if (fsp->function) {
+               if (die_compare_name(fn_die, fsp->function)) {
+                       memcpy(fsp->die_mem, fn_die, sizeof(Dwarf_Die));
+                       fsp->found = true;
+                       return 1;
+               }
+       } else {
+               /* With the line number, find the nearest declared DIE */
+               dwarf_decl_line(fn_die, &lno);
+               if (lno < fsp->line && fsp->diff > fsp->line - lno) {
+                       /* Keep a candidate and continue */
+                       fsp->diff = fsp->line - lno;
+                       memcpy(fsp->die_mem, fn_die, sizeof(Dwarf_Die));
+                       fsp->found = true;
+               }
+       }
+       return 0;
+}
+
+/* Find an appropriate scope fits to given conditions */
+static Dwarf_Die *find_best_scope(struct probe_finder *pf, Dwarf_Die *die_mem)
+{
+       struct find_scope_param fsp = {
+               .function = pf->pev->point.function,
+               .file = pf->fname,
+               .line = pf->lno,
+               .diff = INT_MAX,
+               .die_mem = die_mem,
+               .found = false,
+       };
+
+       cu_walk_functions_at(&pf->cu_die, pf->addr, find_best_scope_cb, &fsp);
+
+       return fsp.found ? die_mem : NULL;
+}
+
 static int probe_point_line_walker(const char *fname, int lineno,
                                   Dwarf_Addr addr, void *data)
 {
        struct probe_finder *pf = data;
+       Dwarf_Die *sc_die, die_mem;
        int ret;
 
        if (lineno != pf->lno || strtailcmp(fname, pf->fname) != 0)
                return 0;
 
        pf->addr = addr;
-       ret = call_probe_finder(NULL, pf);
+       sc_die = find_best_scope(pf, &die_mem);
+       if (!sc_die) {
+               pr_warning("Failed to find scope of probe point.\n");
+               return -ENOENT;
+       }
+
+       ret = call_probe_finder(sc_die, pf);
 
        /* Continue if no error, because the line will be in inline function */
        return ret < 0 ? ret : 0;
@@ -827,6 +882,7 @@ static int probe_point_lazy_walker(const char *fname, int lineno,
                                   Dwarf_Addr addr, void *data)
 {
        struct probe_finder *pf = data;
+       Dwarf_Die *sc_die, die_mem;
        int ret;
 
        if (!line_list__has_line(&pf->lcache, lineno) ||
@@ -836,7 +892,14 @@ static int probe_point_lazy_walker(const char *fname, int lineno,
        pr_debug("Probe line found: line:%d addr:0x%llx\n",
                 lineno, (unsigned long long)addr);
        pf->addr = addr;
-       ret = call_probe_finder(NULL, pf);
+       pf->lno = lineno;
+       sc_die = find_best_scope(pf, &die_mem);
+       if (!sc_die) {
+               pr_warning("Failed to find scope of probe point.\n");
+               return -ENOENT;
+       }
+
+       ret = call_probe_finder(sc_die, pf);
 
        /*
         * Continue if no error, because the lazy pattern will match
@@ -861,42 +924,39 @@ static int find_probe_point_lazy(Dwarf_Die *sp_die, struct probe_finder *pf)
        return die_walk_lines(sp_die, probe_point_lazy_walker, pf);
 }
 
-/* Callback parameter with return value */
-struct dwarf_callback_param {
-       void *data;
-       int retval;
-};
-
 static int probe_point_inline_cb(Dwarf_Die *in_die, void *data)
 {
-       struct dwarf_callback_param *param = data;
-       struct probe_finder *pf = param->data;
+       struct probe_finder *pf = data;
        struct perf_probe_point *pp = &pf->pev->point;
        Dwarf_Addr addr;
+       int ret;
 
        if (pp->lazy_line)
-               param->retval = find_probe_point_lazy(in_die, pf);
+               ret = find_probe_point_lazy(in_die, pf);
        else {
                /* Get probe address */
                if (dwarf_entrypc(in_die, &addr) != 0) {
                        pr_warning("Failed to get entry address of %s.\n",
                                   dwarf_diename(in_die));
-                       param->retval = -ENOENT;
-                       return DWARF_CB_ABORT;
+                       return -ENOENT;
                }
                pf->addr = addr;
                pf->addr += pp->offset;
                pr_debug("found inline addr: 0x%jx\n",
                         (uintmax_t)pf->addr);
 
-               param->retval = call_probe_finder(in_die, pf);
-               if (param->retval < 0)
-                       return DWARF_CB_ABORT;
+               ret = call_probe_finder(in_die, pf);
        }
 
-       return DWARF_CB_OK;
+       return ret;
 }
 
+/* Callback parameter with return value for libdw */
+struct dwarf_callback_param {
+       void *data;
+       int retval;
+};
+
 /* Search function from function name */
 static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
 {
@@ -933,14 +993,10 @@ static int probe_point_search_cb(Dwarf_Die *sp_die, void *data)
                        /* TODO: Check the address in this function */
                        param->retval = call_probe_finder(sp_die, pf);
                }
-       } else {
-               struct dwarf_callback_param _param = {.data = (void *)pf,
-                                                     .retval = 0};
+       } else
                /* Inlined function: search instances */
-               dwarf_func_inline_instances(sp_die, probe_point_inline_cb,
-                                           &_param);
-               param->retval = _param.retval;
-       }
+               param->retval = die_walk_instances(sp_die,
+                                       probe_point_inline_cb, (void *)pf);
 
        return DWARF_CB_ABORT; /* Exit; no same symbol in this CU. */
 }
@@ -1060,7 +1116,7 @@ found:
 }
 
 /* Add a found probe point into trace event list */
-static int add_probe_trace_event(Dwarf_Die *sp_die, struct probe_finder *pf)
+static int add_probe_trace_event(Dwarf_Die *sc_die, struct probe_finder *pf)
 {
        struct trace_event_finder *tf =
                        container_of(pf, struct trace_event_finder, pf);
@@ -1075,8 +1131,9 @@ static int add_probe_trace_event(Dwarf_Die *sp_die, struct probe_finder *pf)
        }
        tev = &tf->tevs[tf->ntevs++];
 
-       ret = convert_to_trace_point(sp_die, pf->addr, pf->pev->point.retprobe,
-                                    &tev->point);
+       /* Trace point should be converted from subprogram DIE */
+       ret = convert_to_trace_point(&pf->sp_die, pf->addr,
+                                    pf->pev->point.retprobe, &tev->point);
        if (ret < 0)
                return ret;
 
@@ -1091,7 +1148,8 @@ static int add_probe_trace_event(Dwarf_Die *sp_die, struct probe_finder *pf)
        for (i = 0; i < pf->pev->nargs; i++) {
                pf->pvar = &pf->pev->args[i];
                pf->tvar = &tev->args[i];
-               ret = find_variable(sp_die, pf);
+               /* Variable should be found from scope DIE */
+               ret = find_variable(sc_die, pf);
                if (ret != 0)
                        return ret;
        }
@@ -1159,13 +1217,13 @@ static int collect_variables_cb(Dwarf_Die *die_mem, void *data)
 }
 
 /* Add a found vars into available variables list */
-static int add_available_vars(Dwarf_Die *sp_die, struct probe_finder *pf)
+static int add_available_vars(Dwarf_Die *sc_die, struct probe_finder *pf)
 {
        struct available_var_finder *af =
                        container_of(pf, struct available_var_finder, pf);
        struct variable_list *vl;
-       Dwarf_Die die_mem, *scopes = NULL;
-       int ret, nscopes;
+       Dwarf_Die die_mem;
+       int ret;
 
        /* Check number of tevs */
        if (af->nvls == af->max_vls) {
@@ -1174,8 +1232,9 @@ static int add_available_vars(Dwarf_Die *sp_die, struct probe_finder *pf)
        }
        vl = &af->vls[af->nvls++];
 
-       ret = convert_to_trace_point(sp_die, pf->addr, pf->pev->point.retprobe,
-                                    &vl->point);
+       /* Trace point should be converted from subprogram DIE */
+       ret = convert_to_trace_point(&pf->sp_die, pf->addr,
+                                    pf->pev->point.retprobe, &vl->point);
        if (ret < 0)
                return ret;
 
@@ -1187,19 +1246,14 @@ static int add_available_vars(Dwarf_Die *sp_die, struct probe_finder *pf)
        if (vl->vars == NULL)
                return -ENOMEM;
        af->child = true;
-       die_find_child(sp_die, collect_variables_cb, (void *)af, &die_mem);
+       die_find_child(sc_die, collect_variables_cb, (void *)af, &die_mem);
 
        /* Find external variables */
        if (!af->externs)
                goto out;
        /* Don't need to search child DIE for externs. */
        af->child = false;
-       nscopes = dwarf_getscopes_die(sp_die, &scopes);
-       while (nscopes-- > 1)
-               die_find_child(&scopes[nscopes], collect_variables_cb,
-                              (void *)af, &die_mem);
-       if (scopes)
-               free(scopes);
+       die_find_child(&pf->cu_die, collect_variables_cb, (void *)af, &die_mem);
 
 out:
        if (strlist__empty(vl->vars)) {
@@ -1391,10 +1445,14 @@ static int find_line_range_by_line(Dwarf_Die *sp_die, struct line_finder *lf)
 
 static int line_range_inline_cb(Dwarf_Die *in_die, void *data)
 {
-       struct dwarf_callback_param *param = data;
+       find_line_range_by_line(in_die, data);
 
-       param->retval = find_line_range_by_line(in_die, param->data);
-       return DWARF_CB_ABORT;  /* No need to find other instances */
+       /*
+        * We have to check all instances of inlined function, because
+        * some execution paths can be optimized out depends on the
+        * function argument of instances
+        */
+       return 0;
 }
 
 /* Search function from function name */
@@ -1422,15 +1480,10 @@ static int line_range_search_cb(Dwarf_Die *sp_die, void *data)
                pr_debug("New line range: %d to %d\n", lf->lno_s, lf->lno_e);
                lr->start = lf->lno_s;
                lr->end = lf->lno_e;
-               if (dwarf_func_inline(sp_die)) {
-                       struct dwarf_callback_param _param;
-                       _param.data = (void *)lf;
-                       _param.retval = 0;
-                       dwarf_func_inline_instances(sp_die,
-                                                   line_range_inline_cb,
-                                                   &_param);
-                       param->retval = _param.retval;
-               } else
+               if (dwarf_func_inline(sp_die))
+                       param->retval = die_walk_instances(sp_die,
+                                               line_range_inline_cb, lf);
+               else
                        param->retval = find_line_range_by_line(sp_die, lf);
                return DWARF_CB_ABORT;
        }
index c478b42..1132c8f 100644 (file)
@@ -57,7 +57,7 @@ struct probe_finder {
        struct perf_probe_event *pev;           /* Target probe event */
 
        /* Callback when a probe point is found */
-       int (*callback)(Dwarf_Die *sp_die, struct probe_finder *pf);
+       int (*callback)(Dwarf_Die *sc_die, struct probe_finder *pf);
 
        /* For function searching */
        int                     lno;            /* Line number */
index a8b5371..469c026 100644 (file)
@@ -1506,7 +1506,7 @@ int dso__load(struct dso *dso, struct map *map, symbol_filter_t filter)
        if (strncmp(dso->name, "/tmp/perf-", 10) == 0) {
                struct stat st;
 
-               if (stat(dso->name, &st) < 0)
+               if (lstat(dso->name, &st) < 0)
                        return -1;
 
                if (st.st_uid && (st.st_uid != geteuid())) {
@@ -2181,27 +2181,22 @@ size_t machines__fprintf_dsos_buildid(struct rb_root *machines,
        return ret;
 }
 
-struct dso *dso__new_kernel(const char *name)
+static struct dso*
+dso__kernel_findnew(struct machine *machine, const char *name,
+                   const char *short_name, int dso_type)
 {
-       struct dso *dso = dso__new(name ?: "[kernel.kallsyms]");
-
-       if (dso != NULL) {
-               dso__set_short_name(dso, "[kernel]");
-               dso->kernel = DSO_TYPE_KERNEL;
-       }
-
-       return dso;
-}
+       /*
+        * The kernel dso could be created by build_id processing.
+        */
+       struct dso *dso = __dsos__findnew(&machine->kernel_dsos, name);
 
-static struct dso *dso__new_guest_kernel(struct machine *machine,
-                                       const char *name)
-{
-       char bf[PATH_MAX];
-       struct dso *dso = dso__new(name ?: machine__mmap_name(machine, bf,
-                                                             sizeof(bf)));
+       /*
+        * We need to run this in all cases, since during the build_id
+        * processing we had no idea this was the kernel dso.
+        */
        if (dso != NULL) {
-               dso__set_short_name(dso, "[guest.kernel]");
-               dso->kernel = DSO_TYPE_GUEST_KERNEL;
+               dso__set_short_name(dso, short_name);
+               dso->kernel = dso_type;
        }
 
        return dso;
@@ -2219,24 +2214,36 @@ void dso__read_running_kernel_build_id(struct dso *dso, struct machine *machine)
                dso->has_build_id = true;
 }
 
-static struct dso *machine__create_kernel(struct machine *machine)
+static struct dso *machine__get_kernel(struct machine *machine)
 {
        const char *vmlinux_name = NULL;
        struct dso *kernel;
 
        if (machine__is_host(machine)) {
                vmlinux_name = symbol_conf.vmlinux_name;
-               kernel = dso__new_kernel(vmlinux_name);
+               if (!vmlinux_name)
+                       vmlinux_name = "[kernel.kallsyms]";
+
+               kernel = dso__kernel_findnew(machine, vmlinux_name,
+                                            "[kernel]",
+                                            DSO_TYPE_KERNEL);
        } else {
+               char bf[PATH_MAX];
+
                if (machine__is_default_guest(machine))
                        vmlinux_name = symbol_conf.default_guest_vmlinux_name;
-               kernel = dso__new_guest_kernel(machine, vmlinux_name);
+               if (!vmlinux_name)
+                       vmlinux_name = machine__mmap_name(machine, bf,
+                                                         sizeof(bf));
+
+               kernel = dso__kernel_findnew(machine, vmlinux_name,
+                                            "[guest.kernel]",
+                                            DSO_TYPE_GUEST_KERNEL);
        }
 
-       if (kernel != NULL) {
+       if (kernel != NULL && (!kernel->has_build_id))
                dso__read_running_kernel_build_id(kernel, machine);
-               dsos__add(&machine->kernel_dsos, kernel);
-       }
+
        return kernel;
 }
 
@@ -2340,7 +2347,7 @@ void machine__destroy_kernel_maps(struct machine *machine)
 
 int machine__create_kernel_maps(struct machine *machine)
 {
-       struct dso *kernel = machine__create_kernel(machine);
+       struct dso *kernel = machine__get_kernel(machine);
 
        if (kernel == NULL ||
            __machine__create_kernel_maps(machine, kernel) < 0)
index 325ee36..4f377d9 100644 (file)
@@ -155,7 +155,6 @@ struct dso {
 };
 
 struct dso *dso__new(const char *name);
-struct dso *dso__new_kernel(const char *name);
 void dso__delete(struct dso *dso);
 
 int dso__name_len(const struct dso *dso);
index 5a06538..88403cf 100644 (file)
@@ -208,6 +208,5 @@ int perf_top__tui_browser(struct perf_top *top)
                },
        };
 
-       ui_helpline__push("Press <- or ESC to exit");
        return perf_top_browser__run(&browser);
 }