From: Nathan Scott The following patch to the generic direct IO code fixes a problem we've come across that affects the way XFS interacts with it. Between 2.6.5 and 2.6.6 several changes to direct IO were made, in particular the fallback-to-buffered path was introduced in that timeframe. Those changes split the locking done within direct-io.c into two cases - generic filesystems and blkdev/XFS. There is no locking done for the second case, and we also never set the create parameter to the get_blocks call (via direct_io_worker ->do_direct_IO->get_more_blocks) for that case. This meant that XFS was accidentally no longer being told when a direct IO write was being performed, which in turn meant that XFS would (often - depending on the inode's size and bmap) tell get_more_blocks that it was mapping to a hole. It also means that currently all direct writes into XFS are falling back to buffered writes. Further, skipping the i_alloc_sem locking entirely is not correct for us, we are relying on that aspect of the generic locking. The fallback behaviour from direct to buffered is "silent", so we didn't actually pick up on these problems until just recently, unfortunately. The approach I've taken here is to split the blkdev/XFS case into two, and corrected the new third case behaviour for the functionality XFS provides. The generic behavior used by other filesystems remains unchanged, as does direct IO to the block device, and XFS now becomes fully functional. Signed-off-by: Andrew Morton --- 25-akpm/fs/direct-io.c | 48 +++++++++++++++++++++--------------- 25-akpm/fs/xfs/linux-2.6/xfs_aops.c | 2 - 25-akpm/include/linux/fs.h | 24 +++++++++++++----- 3 files changed, 48 insertions(+), 26 deletions(-) diff -puN fs/direct-io.c~fix-generic-direct-io-code-for-xfs fs/direct-io.c --- 25/fs/direct-io.c~fix-generic-direct-io-code-for-xfs 2004-09-21 23:01:44.305269920 -0700 +++ 25-akpm/fs/direct-io.c 2004-09-21 23:01:44.314268552 -0700 @@ -53,9 +53,12 @@ * If blkfactor is zero then the user's request was aligned to the filesystem's * blocksize. * - * needs_locking is set for regular files on direct-IO-naive filesystems. It - * determines whether we need to do the fancy locking which prevents direct-IO - * from being able to read uninitialised disk blocks. + * lock_type is DIO_LOCKING for regular files on direct-IO-naive filesystems. + * This determines whether we need to do the fancy locking which prevents + * direct-IO from being able to read uninitialised disk blocks. If its zero + * (blockdev) this locking is not done, and if it is DIO_OWN_LOCKING i_sem is + * not held for the entire direct write (taken briefly, initially, during a + * direct read though, but its never held for the duration of a direct-IO). */ struct dio { @@ -63,7 +66,7 @@ struct dio { struct bio *bio; /* bio under assembly */ struct inode *inode; int rw; - int needs_locking; /* doesn't change */ + int lock_type; /* doesn't change */ unsigned blkbits; /* doesn't change */ unsigned blkfactor; /* When we're using an alignment which is finer than the filesystem's soft @@ -212,7 +215,7 @@ static void dio_complete(struct dio *dio { if (dio->end_io && dio->result) dio->end_io(dio->inode, offset, bytes, dio->map_bh.b_private); - if (dio->needs_locking) + if (dio->lock_type != DIO_NO_LOCKING) up_read(&dio->inode->i_alloc_sem); } @@ -493,7 +496,7 @@ static int get_more_blocks(struct dio *d unsigned long fs_count; /* Number of filesystem-sized blocks */ unsigned long dio_count;/* Number of dio_block-sized blocks */ unsigned long blkmask; - int beyond_eof = 0; + int create; /* * If there was a memory error and we've overwritten all the @@ -511,10 +514,13 @@ static int get_more_blocks(struct dio *d if (dio_count & blkmask) fs_count++; - if (dio->needs_locking) { - if (dio->block_in_file >= (i_size_read(dio->inode) >> + create = dio->rw == WRITE; + if (dio->lock_type == DIO_LOCKING) { + if (dio->block_in_file < (i_size_read(dio->inode) >> dio->blkbits)) - beyond_eof = 1; + create = 0; + } else if (dio->lock_type == DIO_NO_LOCKING) { + create = 0; } /* * For writes inside i_size we forbid block creations: only @@ -523,7 +529,7 @@ static int get_more_blocks(struct dio *d * writes. */ ret = (*dio->get_blocks)(dio->inode, fs_startblk, fs_count, - map_bh, (dio->rw == WRITE) && beyond_eof); + map_bh, create); } return ret; } @@ -1033,7 +1039,7 @@ direct_io_worker(int rw, struct kiocb *i * we can let i_sem go now that its achieved its purpose * of protecting us from looking up uninitialized blocks. */ - if ((rw == READ) && dio->needs_locking) + if ((rw == READ) && (dio->lock_type == DIO_LOCKING)) up(&dio->inode->i_sem); /* @@ -1120,7 +1126,7 @@ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, - int needs_special_locking) + int dio_lock_type) { int seg; size_t size; @@ -1131,7 +1137,6 @@ __blockdev_direct_IO(int rw, struct kioc ssize_t retval = -EINVAL; loff_t end = offset; struct dio *dio; - int needs_locking; if (bdev) bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); @@ -1164,13 +1169,15 @@ __blockdev_direct_IO(int rw, struct kioc goto out; /* - * For regular files, + * For regular files using DIO_LOCKING, * readers need to grab i_sem and i_alloc_sem * writers need to grab i_alloc_sem only (i_sem is already held) + * For regular files using DIO_OWN_LOCKING, + * both readers and writers need to grab i_alloc_sem + * neither readers nor writers hold i_sem on entry (nor exit) */ - needs_locking = 0; - if (S_ISREG(inode->i_mode) && needs_special_locking) { - needs_locking = 1; + dio->lock_type = dio_lock_type; + if (dio_lock_type != DIO_NO_LOCKING) { if (rw == READ) { struct address_space *mapping; @@ -1182,10 +1189,13 @@ __blockdev_direct_IO(int rw, struct kioc kfree(dio); goto out; } + down_read(&inode->i_alloc_sem); + if (dio_lock_type == DIO_OWN_LOCKING) + up(&inode->i_sem); + } else { + down_read(&inode->i_alloc_sem); } - down_read(&inode->i_alloc_sem); } - dio->needs_locking = needs_locking; /* * For file extending writes updating i_size before data * writeouts complete can expose uninitialized blocks. So diff -puN fs/xfs/linux-2.6/xfs_aops.c~fix-generic-direct-io-code-for-xfs fs/xfs/linux-2.6/xfs_aops.c --- 25/fs/xfs/linux-2.6/xfs_aops.c~fix-generic-direct-io-code-for-xfs 2004-09-21 23:01:44.307269616 -0700 +++ 25-akpm/fs/xfs/linux-2.6/xfs_aops.c 2004-09-21 23:01:44.315268400 -0700 @@ -1024,7 +1024,7 @@ linvfs_direct_IO( if (error) return -error; - return blockdev_direct_IO_no_locking(rw, iocb, inode, + return blockdev_direct_IO_own_locking(rw, iocb, inode, iomap.iomap_target->pbr_bdev, iov, offset, nr_segs, linvfs_get_blocks_direct, diff -puN include/linux/fs.h~fix-generic-direct-io-code-for-xfs include/linux/fs.h --- 25/include/linux/fs.h~fix-generic-direct-io-code-for-xfs 2004-09-21 23:01:44.309269312 -0700 +++ 25-akpm/include/linux/fs.h 2004-09-21 23:01:44.317268096 -0700 @@ -1552,18 +1552,21 @@ static inline void do_generic_file_read( ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io, - int needs_special_locking); + int lock_type); + +enum { + DIO_LOCKING = 1, /* need locking between buffered and direct access */ + DIO_NO_LOCKING, /* bdev; no locking at all between buffered/direct */ + DIO_OWN_LOCKING, /* filesystem locks buffered and direct internally */ +}; -/* - * For filesystems which need locking between buffered and direct access - */ static inline ssize_t blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct block_device *bdev, const struct iovec *iov, loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_blocks, end_io, 1); + nr_segs, get_blocks, end_io, DIO_LOCKING); } static inline ssize_t blockdev_direct_IO_no_locking(int rw, struct kiocb *iocb, @@ -1572,7 +1575,16 @@ static inline ssize_t blockdev_direct_IO dio_iodone_t end_io) { return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, - nr_segs, get_blocks, end_io, 0); + nr_segs, get_blocks, end_io, DIO_NO_LOCKING); +} + +static inline ssize_t blockdev_direct_IO_own_locking(int rw, struct kiocb *iocb, + struct inode *inode, struct block_device *bdev, const struct iovec *iov, + loff_t offset, unsigned long nr_segs, get_blocks_t get_blocks, + dio_iodone_t end_io) +{ + return __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, + nr_segs, get_blocks, end_io, DIO_OWN_LOCKING); } extern struct file_operations generic_ro_fops; _