69c3126a05b4074a9718c672c36d69ee302374b7
[cascardo/linux.git] / fs / dax.c
1 /*
2  * fs/dax.c - Direct Access filesystem code
3  * Copyright (c) 2013-2014 Intel Corporation
4  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6  *
7  * This program is free software; you can redistribute it and/or modify it
8  * under the terms and conditions of the GNU General Public License,
9  * version 2, as published by the Free Software Foundation.
10  *
11  * This program is distributed in the hope it will be useful, but WITHOUT
12  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
14  * more details.
15  */
16
17 #include <linux/atomic.h>
18 #include <linux/blkdev.h>
19 #include <linux/buffer_head.h>
20 #include <linux/fs.h>
21 #include <linux/genhd.h>
22 #include <linux/mutex.h>
23 #include <linux/sched.h>
24 #include <linux/uio.h>
25
26 int dax_clear_blocks(struct inode *inode, sector_t block, long size)
27 {
28         struct block_device *bdev = inode->i_sb->s_bdev;
29         sector_t sector = block << (inode->i_blkbits - 9);
30
31         might_sleep();
32         do {
33                 void *addr;
34                 unsigned long pfn;
35                 long count;
36
37                 count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
38                 if (count < 0)
39                         return count;
40                 BUG_ON(size < count);
41                 while (count > 0) {
42                         unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
43                         if (pgsz > count)
44                                 pgsz = count;
45                         if (pgsz < PAGE_SIZE)
46                                 memset(addr, 0, pgsz);
47                         else
48                                 clear_page(addr);
49                         addr += pgsz;
50                         size -= pgsz;
51                         count -= pgsz;
52                         BUG_ON(pgsz & 511);
53                         sector += pgsz / 512;
54                         cond_resched();
55                 }
56         } while (size);
57
58         return 0;
59 }
60 EXPORT_SYMBOL_GPL(dax_clear_blocks);
61
62 static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
63 {
64         unsigned long pfn;
65         sector_t sector = bh->b_blocknr << (blkbits - 9);
66         return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
67 }
68
69 static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
70                         loff_t end)
71 {
72         loff_t final = end - pos + first; /* The final byte of the buffer */
73
74         if (first > 0)
75                 memset(addr, 0, first);
76         if (final < size)
77                 memset(addr + final, 0, size - final);
78 }
79
80 static bool buffer_written(struct buffer_head *bh)
81 {
82         return buffer_mapped(bh) && !buffer_unwritten(bh);
83 }
84
85 /*
86  * When ext4 encounters a hole, it returns without modifying the buffer_head
87  * which means that we can't trust b_size.  To cope with this, we set b_state
88  * to 0 before calling get_block and, if any bit is set, we know we can trust
89  * b_size.  Unfortunate, really, since ext4 knows precisely how long a hole is
90  * and would save us time calling get_block repeatedly.
91  */
92 static bool buffer_size_valid(struct buffer_head *bh)
93 {
94         return bh->b_state != 0;
95 }
96
/*
 * Core DAX I/O loop: copy between the iov_iter and the device's memory
 * for the byte range [start, end).  *bh caches the most recent extent
 * returned by get_block so consecutive blocks of the same extent do not
 * trigger repeated get_block calls.  Returns bytes transferred, or a
 * negative error if nothing was transferred.
 */
static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
			loff_t start, loff_t end, get_block_t get_block,
			struct buffer_head *bh)
{
	ssize_t retval = 0;
	loff_t pos = start;
	loff_t max = start;	/* end of the currently mapped region */
	loff_t bh_max = start;	/* end of the extent described by *bh */
	void *addr;
	bool hole = false;

	/* Reads must not go past EOF; writes may extend the file */
	if (rw != WRITE)
		end = min(end, i_size_read(inode));

	while (pos < end) {
		unsigned len;
		if (pos == max) {
			/* Ran off the mapped region: map the next chunk */
			unsigned blkbits = inode->i_blkbits;
			sector_t block = pos >> blkbits;
			unsigned first = pos - (block << blkbits);
			long size;

			if (pos == bh_max) {
				/* Cached extent exhausted too: ask the fs */
				bh->b_size = PAGE_ALIGN(end - pos);
				/* Cleared so buffer_size_valid() works */
				bh->b_state = 0;
				retval = get_block(inode, block, bh,
								rw == WRITE);
				if (retval)
					break;
				if (!buffer_size_valid(bh))
					bh->b_size = 1 << blkbits;
				bh_max = pos - first + bh->b_size;
			} else {
				/*
				 * Still inside the cached extent: advance
				 * bh past the portion already consumed.
				 */
				unsigned done = bh->b_size -
						(bh_max - (pos - first));
				bh->b_blocknr += done >> blkbits;
				bh->b_size -= done;
			}

			/* Reads from a hole are satisfied with zeroes */
			hole = (rw != WRITE) && !buffer_written(bh);
			if (hole) {
				addr = NULL;
				size = bh->b_size - first;
			} else {
				retval = dax_get_addr(bh, &addr, blkbits);
				if (retval < 0)
					break;
				/*
				 * Fresh or unwritten blocks may contain
				 * stale data outside the range being
				 * written: zero those parts first.
				 */
				if (buffer_unwritten(bh) || buffer_new(bh))
					dax_new_buf(addr, retval, first, pos,
									end);
				addr += first;
				size = retval - first;
			}
			max = min(pos + size, end);
		}

		if (rw == WRITE)
			len = copy_from_iter(addr, max - pos, iter);
		else if (!hole)
			len = copy_to_iter(addr, max - pos, iter);
		else
			len = iov_iter_zero(max - pos, iter);

		/* A zero-length copy means the iter is exhausted/faulted */
		if (!len)
			break;

		pos += len;
		addr += len;
	}

	/* Partial progress wins over a trailing error, like direct-io */
	return (pos == start) ? retval : pos - start;
}
169
170 /**
171  * dax_do_io - Perform I/O to a DAX file
172  * @rw: READ to read or WRITE to write
173  * @iocb: The control block for this I/O
174  * @inode: The file which the I/O is directed at
175  * @iter: The addresses to do I/O from or to
176  * @pos: The file offset where the I/O starts
177  * @get_block: The filesystem method used to translate file offsets to blocks
178  * @end_io: A filesystem callback for I/O completion
179  * @flags: See below
180  *
181  * This function uses the same locking scheme as do_blockdev_direct_IO:
182  * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
183  * caller for writes.  For reads, we take and release the i_mutex ourselves.
184  * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
185  * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
186  * is in progress.
187  */
188 ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
189                         struct iov_iter *iter, loff_t pos,
190                         get_block_t get_block, dio_iodone_t end_io, int flags)
191 {
192         struct buffer_head bh;
193         ssize_t retval = -EINVAL;
194         loff_t end = pos + iov_iter_count(iter);
195
196         memset(&bh, 0, sizeof(bh));
197
198         if ((flags & DIO_LOCKING) && (rw == READ)) {
199                 struct address_space *mapping = inode->i_mapping;
200                 mutex_lock(&inode->i_mutex);
201                 retval = filemap_write_and_wait_range(mapping, pos, end - 1);
202                 if (retval) {
203                         mutex_unlock(&inode->i_mutex);
204                         goto out;
205                 }
206         }
207
208         /* Protects against truncate */
209         atomic_inc(&inode->i_dio_count);
210
211         retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
212
213         if ((flags & DIO_LOCKING) && (rw == READ))
214                 mutex_unlock(&inode->i_mutex);
215
216         if ((retval > 0) && end_io)
217                 end_io(iocb, pos, retval, bh.b_private);
218
219         inode_dio_done(inode);
220  out:
221         return retval;
222 }
223 EXPORT_SYMBOL_GPL(dax_do_io);