Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Mar 2011 02:01:30 +0000 (19:01 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 25 Mar 2011 02:01:30 +0000 (19:01 -0700)
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs-2.6:
  fs: simplify iget & friends
  fs: pull inode->i_lock up out of writeback_single_inode
  fs: rename inode_lock to inode_hash_lock
  fs: move i_wb_list out from under inode_lock
  fs: move i_sb_list out from under inode_lock
  fs: remove inode_lock from iput_final and prune_icache
  fs: Lock the inode LRU list separately
  fs: factor inode disposal
  fs: protect inode->i_state with inode->i_lock
  autofs4: Do not potentially dereference NULL pointer returned by fget() in autofs_dev_ioctl_setpipefd()
  autofs4 - remove autofs4_lock
  autofs4 - fix d_manage() return on rcu-walk
  autofs4 - fix autofs4_expire_indirect() traversal
  autofs4 - fix dentry leak in autofs4_expire_direct()
  autofs4 - reinstate last used update on access
  vfs - check non-mountpoint dentry might block in __follow_mount_rcu()
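
The series above splits the global inode_lock into finer-grained locks: inode->i_state becomes protected by the per-inode inode->i_lock, while the hash, sb, writeback and LRU lists each get a lock of their own (inode_hash_lock, inode_wb_list_lock, ...). A minimal sketch of the resulting pattern, mirroring the bdev_inode_switch_bdi() hunk below — the helper name and the bdi_writeback argument here are purely illustrative:

	/* lock order per this series: list lock first, then the per-inode lock */
	static void move_if_dirty(struct inode *inode, struct bdi_writeback *wb)
	{
		spin_lock(&inode_wb_list_lock);
		spin_lock(&inode->i_lock);
		if (inode->i_state & I_DIRTY)
			list_move(&inode->i_wb_list, &wb->b_dirty);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_wb_list_lock);
	}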

fs/block_dev.c
fs/buffer.c
include/linux/fs.h
mm/backing-dev.c
mm/filemap.c
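
The combined hunks below also reflect the switch from implicit request-queue unplugging to explicit on-stack plugging: sync_page()/block_sync_page() and the *_PLUG request flags disappear, WRITE_SYNC drops REQ_UNPLUG, and submitters batch I/O themselves. A rough sketch of the replacement pattern used in the fs/buffer.c and mm/filemap.c hunks — the submission step in the middle is illustrative:

	struct blk_plug plug;

	blk_start_plug(&plug);
	/* queue a batch of I/O here, e.g. write_dirty_buffer(bh, WRITE_SYNC) */
	blk_finish_plug(&plug);	/* flush the requests plugged on this task to the driver */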

diff --combined fs/block_dev.c
@@@ -55,11 -55,13 +55,13 @@@ EXPORT_SYMBOL(I_BDEV)
  static void bdev_inode_switch_bdi(struct inode *inode,
                        struct backing_dev_info *dst)
  {
-       spin_lock(&inode_lock);
+       spin_lock(&inode_wb_list_lock);
+       spin_lock(&inode->i_lock);
        inode->i_data.backing_dev_info = dst;
        if (inode->i_state & I_DIRTY)
                list_move(&inode->i_wb_list, &dst->wb.b_dirty);
-       spin_unlock(&inode_lock);
+       spin_unlock(&inode->i_lock);
+       spin_unlock(&inode_wb_list_lock);
  }
  
  static sector_t max_block(struct block_device *bdev)
@@@ -1087,7 -1089,6 +1089,7 @@@ static int __blkdev_get(struct block_de
        if (!disk)
                goto out;
  
 +      disk_block_events(disk);
        mutex_lock_nested(&bdev->bd_mutex, for_part);
        if (!bdev->bd_openers) {
                bdev->bd_disk = disk;
                                         */
                                        disk_put_part(bdev->bd_part);
                                        bdev->bd_part = NULL;
 -                                      module_put(disk->fops->owner);
 -                                      put_disk(disk);
                                        bdev->bd_disk = NULL;
                                        mutex_unlock(&bdev->bd_mutex);
 +                                      disk_unblock_events(disk);
 +                                      module_put(disk->fops->owner);
 +                                      put_disk(disk);
                                        goto restart;
                                }
                                if (ret)
                        bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
                }
        } else {
 -              module_put(disk->fops->owner);
 -              put_disk(disk);
 -              disk = NULL;
                if (bdev->bd_contains == bdev) {
                        if (bdev->bd_disk->fops->open) {
                                ret = bdev->bd_disk->fops->open(bdev, mode);
                        if (bdev->bd_invalidated)
                                rescan_partitions(bdev->bd_disk, bdev);
                }
 +              /* only one opener holds refs to the module and disk */
 +              module_put(disk->fops->owner);
 +              put_disk(disk);
        }
        bdev->bd_openers++;
        if (for_part)
                bdev->bd_part_count++;
        mutex_unlock(&bdev->bd_mutex);
 +      disk_unblock_events(disk);
        return 0;
  
   out_clear:
        bdev->bd_contains = NULL;
   out_unlock_bdev:
        mutex_unlock(&bdev->bd_mutex);
 - out:
 -      if (disk)
 -              module_put(disk->fops->owner);
 +      disk_unblock_events(disk);
 +      module_put(disk->fops->owner);
        put_disk(disk);
 + out:
        bdput(bdev);
  
        return ret;
@@@ -1449,13 -1448,14 +1451,13 @@@ int blkdev_put(struct block_device *bde
                if (bdev_free) {
                        if (bdev->bd_write_holder) {
                                disk_unblock_events(bdev->bd_disk);
 -                              bdev->bd_write_holder = false;
 -                      } else
                                disk_check_events(bdev->bd_disk);
 +                              bdev->bd_write_holder = false;
 +                      }
                }
  
                mutex_unlock(&bdev->bd_mutex);
 -      } else
 -              disk_check_events(bdev->bd_disk);
 +      }
  
        return __blkdev_put(bdev, mode, 0);
  }
@@@ -1529,6 -1529,7 +1531,6 @@@ static int blkdev_releasepage(struct pa
  static const struct address_space_operations def_blk_aops = {
        .readpage       = blkdev_readpage,
        .writepage      = blkdev_writepage,
 -      .sync_page      = block_sync_page,
        .write_begin    = blkdev_write_begin,
        .write_end      = blkdev_write_end,
        .writepages     = generic_writepages,
diff --combined fs/buffer.c
@@@ -54,15 -54,23 +54,15 @@@ init_buffer(struct buffer_head *bh, bh_
  }
  EXPORT_SYMBOL(init_buffer);
  
 -static int sync_buffer(void *word)
 +static int sleep_on_buffer(void *word)
  {
 -      struct block_device *bd;
 -      struct buffer_head *bh
 -              = container_of(word, struct buffer_head, b_state);
 -
 -      smp_mb();
 -      bd = bh->b_bdev;
 -      if (bd)
 -              blk_run_address_space(bd->bd_inode->i_mapping);
        io_schedule();
        return 0;
  }
  
  void __lock_buffer(struct buffer_head *bh)
  {
 -      wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
 +      wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
                                                        TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(__lock_buffer);
@@@ -82,7 -90,7 +82,7 @@@ EXPORT_SYMBOL(unlock_buffer)
   */
  void __wait_on_buffer(struct buffer_head * bh)
  {
 -      wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
 +      wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(__wait_on_buffer);
  
@@@ -741,12 -749,10 +741,12 @@@ static int fsync_buffers_list(spinlock_
  {
        struct buffer_head *bh;
        struct list_head tmp;
 -      struct address_space *mapping, *prev_mapping = NULL;
 +      struct address_space *mapping;
        int err = 0, err2;
 +      struct blk_plug plug;
  
        INIT_LIST_HEAD(&tmp);
 +      blk_start_plug(&plug);
  
        spin_lock(lock);
        while (!list_empty(list)) {
                                 * still in flight on potentially older
                                 * contents.
                                 */
 -                              write_dirty_buffer(bh, WRITE_SYNC_PLUG);
 +                              write_dirty_buffer(bh, WRITE_SYNC);
  
                                /*
                                 * Kick off IO for the previous mapping. Note
                                 * wait_on_buffer() will do that for us
                                 * through sync_buffer().
                                 */
 -                              if (prev_mapping && prev_mapping != mapping)
 -                                      blk_run_address_space(prev_mapping);
 -                              prev_mapping = mapping;
 -
                                brelse(bh);
                                spin_lock(lock);
                        }
                }
        }
  
 +      spin_unlock(lock);
 +      blk_finish_plug(&plug);
 +      spin_lock(lock);
 +
        while (!list_empty(&tmp)) {
                bh = BH_ENTRY(tmp.prev);
                get_bh(bh);
@@@ -1138,7 -1144,7 +1138,7 @@@ __getblk_slow(struct block_device *bdev
   * inode list.
   *
   * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
-  * mapping->tree_lock and the global inode_lock.
+  * mapping->tree_lock and mapping->host->i_lock.
   */
  void mark_buffer_dirty(struct buffer_head *bh)
  {
@@@ -1608,8 -1614,14 +1608,8 @@@ EXPORT_SYMBOL(unmap_underlying_metadata
   * prevents this contention from occurring.
   *
   * If block_write_full_page() is called with wbc->sync_mode ==
 - * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
 - * causes the writes to be flagged as synchronous writes, but the
 - * block device queue will NOT be unplugged, since usually many pages
 - * will be pushed to the out before the higher-level caller actually
 - * waits for the writes to be completed.  The various wait functions,
 - * such as wait_on_writeback_range() will ultimately call sync_page()
 - * which will ultimately call blk_run_backing_dev(), which will end up
 - * unplugging the device queue.
 + * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
 + * causes the writes to be flagged as synchronous writes.
   */
  static int __block_write_full_page(struct inode *inode, struct page *page,
                        get_block_t *get_block, struct writeback_control *wbc,
        const unsigned blocksize = 1 << inode->i_blkbits;
        int nr_underway = 0;
        int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
 -                      WRITE_SYNC_PLUG : WRITE);
 +                      WRITE_SYNC : WRITE);
  
        BUG_ON(!PageLocked(page));
  
@@@ -3126,6 -3138,17 +3126,6 @@@ out
  }
  EXPORT_SYMBOL(try_to_free_buffers);
  
 -void block_sync_page(struct page *page)
 -{
 -      struct address_space *mapping;
 -
 -      smp_mb();
 -      mapping = page_mapping(page);
 -      if (mapping)
 -              blk_run_backing_dev(mapping->backing_dev_info, page);
 -}
 -EXPORT_SYMBOL(block_sync_page);
 -
  /*
   * There are no bdflush tunables left.  But distributions are
   * still running obsolete flush daemons, so we terminate them here.
diff --combined include/linux/fs.h
@@@ -138,10 -138,16 +138,10 @@@ struct inodes_stat_t 
   *                    block layer could (in theory) choose to ignore this
   *                    request if it runs into resource problems.
   * WRITE              A normal async write. Device will be plugged.
 - * WRITE_SYNC_PLUG    Synchronous write. Identical to WRITE, but passes down
 + * WRITE_SYNC         Synchronous write. Identical to WRITE, but passes down
   *                    the hint that someone will be waiting on this IO
 - *                    shortly. The device must still be unplugged explicitly,
 - *                    WRITE_SYNC_PLUG does not do this as we could be
 - *                    submitting more writes before we actually wait on any
 - *                    of them.
 - * WRITE_SYNC         Like WRITE_SYNC_PLUG, but also unplugs the device
 - *                    immediately after submission. The write equivalent
 - *                    of READ_SYNC.
 - * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only.
 + *                    shortly. The write equivalent of READ_SYNC.
 + * WRITE_ODIRECT      Special case write for O_DIRECT only.
   * WRITE_FLUSH                Like WRITE_SYNC but with preceding cache flush.
   * WRITE_FUA          Like WRITE_SYNC but data is guaranteed to be on
   *                    non-volatile media on completion.
  #define WRITE                 RW_MASK
  #define READA                 RWA_MASK
  
 -#define READ_SYNC             (READ | REQ_SYNC | REQ_UNPLUG)
 +#define READ_SYNC             (READ | REQ_SYNC)
  #define READ_META             (READ | REQ_META)
 -#define WRITE_SYNC_PLUG               (WRITE | REQ_SYNC | REQ_NOIDLE)
 -#define WRITE_SYNC            (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
 -#define WRITE_ODIRECT_PLUG    (WRITE | REQ_SYNC)
 +#define WRITE_SYNC            (WRITE | REQ_SYNC | REQ_NOIDLE)
 +#define WRITE_ODIRECT         (WRITE | REQ_SYNC)
  #define WRITE_META            (WRITE | REQ_META)
 -#define WRITE_FLUSH           (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 -                               REQ_FLUSH)
 -#define WRITE_FUA             (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 -                               REQ_FUA)
 -#define WRITE_FLUSH_FUA               (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 -                               REQ_FLUSH | REQ_FUA)
 +#define WRITE_FLUSH           (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH)
 +#define WRITE_FUA             (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
 +#define WRITE_FLUSH_FUA               (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
  
  #define SEL_IN                1
  #define SEL_OUT               2
@@@ -576,6 -586,7 +576,6 @@@ typedef int (*read_actor_t)(read_descri
  struct address_space_operations {
        int (*writepage)(struct page *page, struct writeback_control *wbc);
        int (*readpage)(struct file *, struct page *);
 -      void (*sync_page)(struct page *);
  
        /* Write back some dirty pages from this mapping. */
        int (*writepages)(struct address_space *, struct writeback_control *);
@@@ -1636,7 -1647,7 +1636,7 @@@ struct super_operations 
  };
  
  /*
-  * Inode state bits.  Protected by inode_lock.
+  * Inode state bits.  Protected by inode->i_lock
   *
   * Three bits determine the dirty state of the inode, I_DIRTY_SYNC,
   * I_DIRTY_DATASYNC and I_DIRTY_PAGES.
diff --combined mm/backing-dev.c
  
  static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
  
 -void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
 -{
 -}
 -EXPORT_SYMBOL(default_unplug_io_fn);
 -
  struct backing_dev_info default_backing_dev_info = {
        .name           = "default",
        .ra_pages       = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
        .state          = 0,
        .capabilities   = BDI_CAP_MAP_COPY,
 -      .unplug_io_fn   = default_unplug_io_fn,
  };
  EXPORT_SYMBOL_GPL(default_backing_dev_info);
  
@@@ -67,14 -73,14 +67,14 @@@ static int bdi_debug_stats_show(struct 
        struct inode *inode;
  
        nr_wb = nr_dirty = nr_io = nr_more_io = 0;
-       spin_lock(&inode_lock);
+       spin_lock(&inode_wb_list_lock);
        list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
                nr_dirty++;
        list_for_each_entry(inode, &wb->b_io, i_wb_list)
                nr_io++;
        list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
                nr_more_io++;
-       spin_unlock(&inode_lock);
+       spin_unlock(&inode_wb_list_lock);
  
        global_dirty_limits(&background_thresh, &dirty_thresh);
        bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@@ -598,7 -604,7 +598,7 @@@ static void bdi_prune_sb(struct backing
        spin_lock(&sb_lock);
        list_for_each_entry(sb, &super_blocks, s_list) {
                if (sb->s_bdi == bdi)
 -                      sb->s_bdi = NULL;
 +                      sb->s_bdi = &default_backing_dev_info;
        }
        spin_unlock(&sb_lock);
  }
@@@ -676,11 -682,11 +676,11 @@@ void bdi_destroy(struct backing_dev_inf
        if (bdi_has_dirty_io(bdi)) {
                struct bdi_writeback *dst = &default_backing_dev_info.wb;
  
-               spin_lock(&inode_lock);
+               spin_lock(&inode_wb_list_lock);
                list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
                list_splice(&bdi->wb.b_io, &dst->b_io);
                list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-               spin_unlock(&inode_lock);
+               spin_unlock(&inode_wb_list_lock);
        }
  
        bdi_unregister(bdi);
diff --combined mm/filemap.c
@@@ -80,8 -80,8 +80,8 @@@
   *  ->i_mutex
   *    ->i_alloc_sem             (various)
   *
-  *  ->inode_lock
-  *    ->sb_lock                       (fs/fs-writeback.c)
+  *  inode_wb_list_lock
+  *    sb_lock                 (fs/fs-writeback.c)
   *    ->mapping->tree_lock    (__sync_single_inode)
   *
   *  ->i_mmap_lock
   *    ->zone.lru_lock         (check_pte_range->isolate_lru_page)
   *    ->private_lock          (page_remove_rmap->set_page_dirty)
   *    ->tree_lock             (page_remove_rmap->set_page_dirty)
-  *    ->inode_lock            (page_remove_rmap->set_page_dirty)
-  *    ->inode_lock            (zap_pte_range->set_page_dirty)
+  *    inode_wb_list_lock      (page_remove_rmap->set_page_dirty)
+  *    ->inode->i_lock         (page_remove_rmap->set_page_dirty)
+  *    inode_wb_list_lock      (zap_pte_range->set_page_dirty)
+  *    ->inode->i_lock         (zap_pte_range->set_page_dirty)
   *    ->private_lock          (zap_pte_range->__set_page_dirty_buffers)
   *
   *  (code doesn't rely on that order, so you could switch it around)
@@@ -164,15 -166,45 +166,15 @@@ void delete_from_page_cache(struct pag
  }
  EXPORT_SYMBOL(delete_from_page_cache);
  
 -static int sync_page(void *word)
 +static int sleep_on_page(void *word)
  {
 -      struct address_space *mapping;
 -      struct page *page;
 -
 -      page = container_of((unsigned long *)word, struct page, flags);
 -
 -      /*
 -       * page_mapping() is being called without PG_locked held.
 -       * Some knowledge of the state and use of the page is used to
 -       * reduce the requirements down to a memory barrier.
 -       * The danger here is of a stale page_mapping() return value
 -       * indicating a struct address_space different from the one it's
 -       * associated with when it is associated with one.
 -       * After smp_mb(), it's either the correct page_mapping() for
 -       * the page, or an old page_mapping() and the page's own
 -       * page_mapping() has gone NULL.
 -       * The ->sync_page() address_space operation must tolerate
 -       * page_mapping() going NULL. By an amazing coincidence,
 -       * this comes about because none of the users of the page
 -       * in the ->sync_page() methods make essential use of the
 -       * page_mapping(), merely passing the page down to the backing
 -       * device's unplug functions when it's non-NULL, which in turn
 -       * ignore it for all cases but swap, where only page_private(page) is
 -       * of interest. When page_mapping() does go NULL, the entire
 -       * call stack gracefully ignores the page and returns.
 -       * -- wli
 -       */
 -      smp_mb();
 -      mapping = page_mapping(page);
 -      if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 -              mapping->a_ops->sync_page(page);
        io_schedule();
        return 0;
  }
  
 -static int sync_page_killable(void *word)
 +static int sleep_on_page_killable(void *word)
  {
 -      sync_page(word);
 +      sleep_on_page(word);
        return fatal_signal_pending(current) ? -EINTR : 0;
  }
  
@@@ -528,6 -560,12 +530,6 @@@ struct page *__page_cache_alloc(gfp_t g
  EXPORT_SYMBOL(__page_cache_alloc);
  #endif
  
 -static int __sleep_on_page_lock(void *word)
 -{
 -      io_schedule();
 -      return 0;
 -}
 -
  /*
   * In order to wait for pages to become available there must be
   * waitqueues associated with pages. By using a hash table of
@@@ -555,7 -593,7 +557,7 @@@ void wait_on_page_bit(struct page *page
        DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);
  
        if (test_bit(bit_nr, &page->flags))
 -              __wait_on_bit(page_waitqueue(page), &wait, sync_page,
 +              __wait_on_bit(page_waitqueue(page), &wait, sleep_on_page,
                                                        TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(wait_on_page_bit);
@@@ -619,12 -657,17 +621,12 @@@ EXPORT_SYMBOL(end_page_writeback)
  /**
   * __lock_page - get a lock on the page, assuming we need to sleep to get it
   * @page: the page to lock
 - *
 - * Ugly. Running sync_page() in state TASK_UNINTERRUPTIBLE is scary.  If some
 - * random driver's requestfn sets TASK_RUNNING, we could busywait.  However
 - * chances are that on the second loop, the block layer's plug list is empty,
 - * so sync_page() will then return in state TASK_UNINTERRUPTIBLE.
   */
  void __lock_page(struct page *page)
  {
        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
  
 -      __wait_on_bit_lock(page_waitqueue(page), &wait, sync_page,
 +      __wait_on_bit_lock(page_waitqueue(page), &wait, sleep_on_page,
                                                        TASK_UNINTERRUPTIBLE);
  }
  EXPORT_SYMBOL(__lock_page);
@@@ -634,10 -677,24 +636,10 @@@ int __lock_page_killable(struct page *p
        DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
  
        return __wait_on_bit_lock(page_waitqueue(page), &wait,
 -                                      sync_page_killable, TASK_KILLABLE);
 +                                      sleep_on_page_killable, TASK_KILLABLE);
  }
  EXPORT_SYMBOL_GPL(__lock_page_killable);
  
 -/**
 - * __lock_page_nosync - get a lock on the page, without calling sync_page()
 - * @page: the page to lock
 - *
 - * Variant of lock_page that does not require the caller to hold a reference
 - * on the page's mapping.
 - */
 -void __lock_page_nosync(struct page *page)
 -{
 -      DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
 -      __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
 -                                                      TASK_UNINTERRUPTIBLE);
 -}
 -
  int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
                         unsigned int flags)
  {
@@@ -1352,15 -1409,12 +1354,15 @@@ generic_file_aio_read(struct kiocb *ioc
        unsigned long seg = 0;
        size_t count;
        loff_t *ppos = &iocb->ki_pos;
 +      struct blk_plug plug;
  
        count = 0;
        retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
        if (retval)
                return retval;
  
 +      blk_start_plug(&plug);
 +
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (filp->f_flags & O_DIRECT) {
                loff_t size;
                        break;
        }
  out:
 +      blk_finish_plug(&plug);
        return retval;
  }
  EXPORT_SYMBOL(generic_file_aio_read);
@@@ -2545,13 -2598,11 +2547,13 @@@ ssize_t generic_file_aio_write(struct k
  {
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
 +      struct blk_plug plug;
        ssize_t ret;
  
        BUG_ON(iocb->ki_pos != pos);
  
        mutex_lock(&inode->i_mutex);
 +      blk_start_plug(&plug);
        ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
        mutex_unlock(&inode->i_mutex);
  
                if (err < 0 && ret > 0)
                        ret = err;
        }
 +      blk_finish_plug(&plug);
        return ret;
  }
  EXPORT_SYMBOL(generic_file_aio_write);