ea70883a174a53a102d336ff1ccf24d023765360
[cascardo/linux.git] / fs / nfs / blocklayout / dev.c
1 /*
2  * Copyright (c) 2014-2016 Christoph Hellwig.
3  */
4 #include <linux/sunrpc/svc.h>
5 #include <linux/blkdev.h>
6 #include <linux/nfs4.h>
7 #include <linux/nfs_fs.h>
8 #include <linux/nfs_xdr.h>
9 #include <linux/pr.h>
10
11 #include "blocklayout.h"
12
13 #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
14
15 static void
16 bl_free_device(struct pnfs_block_dev *dev)
17 {
18         if (dev->nr_children) {
19                 int i;
20
21                 for (i = 0; i < dev->nr_children; i++)
22                         bl_free_device(&dev->children[i]);
23                 kfree(dev->children);
24         } else {
25                 if (dev->pr_registered) {
26                         const struct pr_ops *ops =
27                                 dev->bdev->bd_disk->fops->pr_ops;
28                         int error;
29
30                         error = ops->pr_register(dev->bdev, dev->pr_key, 0,
31                                 false);
32                         if (error)
33                                 pr_err("failed to unregister PR key.\n");
34                 }
35
36                 if (dev->bdev)
37                         blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
38         }
39 }
40
41 void
42 bl_free_deviceid_node(struct nfs4_deviceid_node *d)
43 {
44         struct pnfs_block_dev *dev =
45                 container_of(d, struct pnfs_block_dev, node);
46
47         bl_free_device(dev);
48         kfree_rcu(dev, node.rcu);
49 }
50
51 static int
52 nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
53 {
54         __be32 *p;
55         int i;
56
57         p = xdr_inline_decode(xdr, 4);
58         if (!p)
59                 return -EIO;
60         b->type = be32_to_cpup(p++);
61
62         switch (b->type) {
63         case PNFS_BLOCK_VOLUME_SIMPLE:
64                 p = xdr_inline_decode(xdr, 4);
65                 if (!p)
66                         return -EIO;
67                 b->simple.nr_sigs = be32_to_cpup(p++);
68                 if (!b->simple.nr_sigs) {
69                         dprintk("no signature\n");
70                         return -EIO;
71                 }
72
73                 b->simple.len = 4 + 4;
74                 for (i = 0; i < b->simple.nr_sigs; i++) {
75                         p = xdr_inline_decode(xdr, 8 + 4);
76                         if (!p)
77                                 return -EIO;
78                         p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
79                         b->simple.sigs[i].sig_len = be32_to_cpup(p++);
80                         if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
81                                 pr_info("signature too long: %d\n",
82                                         b->simple.sigs[i].sig_len);
83                                 return -EIO;
84                         }
85
86                         p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
87                         if (!p)
88                                 return -EIO;
89                         memcpy(&b->simple.sigs[i].sig, p,
90                                 b->simple.sigs[i].sig_len);
91
92                         b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
93                 }
94                 break;
95         case PNFS_BLOCK_VOLUME_SLICE:
96                 p = xdr_inline_decode(xdr, 8 + 8 + 4);
97                 if (!p)
98                         return -EIO;
99                 p = xdr_decode_hyper(p, &b->slice.start);
100                 p = xdr_decode_hyper(p, &b->slice.len);
101                 b->slice.volume = be32_to_cpup(p++);
102                 break;
103         case PNFS_BLOCK_VOLUME_CONCAT:
104                 p = xdr_inline_decode(xdr, 4);
105                 if (!p)
106                         return -EIO;
107                 b->concat.volumes_count = be32_to_cpup(p++);
108
109                 p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
110                 if (!p)
111                         return -EIO;
112                 for (i = 0; i < b->concat.volumes_count; i++)
113                         b->concat.volumes[i] = be32_to_cpup(p++);
114                 break;
115         case PNFS_BLOCK_VOLUME_STRIPE:
116                 p = xdr_inline_decode(xdr, 8 + 4);
117                 if (!p)
118                         return -EIO;
119                 p = xdr_decode_hyper(p, &b->stripe.chunk_size);
120                 b->stripe.volumes_count = be32_to_cpup(p++);
121
122                 p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
123                 if (!p)
124                         return -EIO;
125                 for (i = 0; i < b->stripe.volumes_count; i++)
126                         b->stripe.volumes[i] = be32_to_cpup(p++);
127                 break;
128         case PNFS_BLOCK_VOLUME_SCSI:
129                 p = xdr_inline_decode(xdr, 4 + 4 + 4);
130                 if (!p)
131                         return -EIO;
132                 b->scsi.code_set = be32_to_cpup(p++);
133                 b->scsi.designator_type = be32_to_cpup(p++);
134                 b->scsi.designator_len = be32_to_cpup(p++);
135                 p = xdr_inline_decode(xdr, b->scsi.designator_len);
136                 if (!p)
137                         return -EIO;
138                 if (b->scsi.designator_len > 256)
139                         return -EIO;
140                 memcpy(&b->scsi.designator, p, b->scsi.designator_len);
141                 p = xdr_inline_decode(xdr, 8);
142                 if (!p)
143                         return -EIO;
144                 p = xdr_decode_hyper(p, &b->scsi.pr_key);
145                 break;
146         default:
147                 dprintk("unknown volume type!\n");
148                 return -EIO;
149         }
150
151         return 0;
152 }
153
154 static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
155                 struct pnfs_block_dev_map *map)
156 {
157         map->start = dev->start;
158         map->len = dev->len;
159         map->disk_offset = dev->disk_offset;
160         map->bdev = dev->bdev;
161         return true;
162 }
163
164 static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
165                 struct pnfs_block_dev_map *map)
166 {
167         int i;
168
169         for (i = 0; i < dev->nr_children; i++) {
170                 struct pnfs_block_dev *child = &dev->children[i];
171
172                 if (child->start > offset ||
173                     child->start + child->len <= offset)
174                         continue;
175
176                 child->map(child, offset - child->start, map);
177                 return true;
178         }
179
180         dprintk("%s: ran off loop!\n", __func__);
181         return false;
182 }
183
184 static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
185                 struct pnfs_block_dev_map *map)
186 {
187         struct pnfs_block_dev *child;
188         u64 chunk;
189         u32 chunk_idx;
190         u64 disk_offset;
191
192         chunk = div_u64(offset, dev->chunk_size);
193         div_u64_rem(chunk, dev->nr_children, &chunk_idx);
194
195         if (chunk_idx > dev->nr_children) {
196                 dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
197                         __func__, chunk_idx, offset, dev->chunk_size);
198                 /* error, should not happen */
199                 return false;
200         }
201
202         /* truncate offset to the beginning of the stripe */
203         offset = chunk * dev->chunk_size;
204
205         /* disk offset of the stripe */
206         disk_offset = div_u64(offset, dev->nr_children);
207
208         child = &dev->children[chunk_idx];
209         child->map(child, disk_offset, map);
210
211         map->start += offset;
212         map->disk_offset += disk_offset;
213         map->len = dev->chunk_size;
214         return true;
215 }
216
217 static int
218 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
219                 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
220
221
222 static int
223 bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
224                 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
225 {
226         struct pnfs_block_volume *v = &volumes[idx];
227         dev_t dev;
228
229         dev = bl_resolve_deviceid(server, v, gfp_mask);
230         if (!dev)
231                 return -EIO;
232
233         d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
234         if (IS_ERR(d->bdev)) {
235                 printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
236                         MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
237                 return PTR_ERR(d->bdev);
238         }
239
240
241         d->len = i_size_read(d->bdev->bd_inode);
242         d->map = bl_map_simple;
243
244         printk(KERN_INFO "pNFS: using block device %s\n",
245                 d->bdev->bd_disk->disk_name);
246         return 0;
247 }
248
249 static bool
250 bl_validate_designator(struct pnfs_block_volume *v)
251 {
252         switch (v->scsi.designator_type) {
253         case PS_DESIGNATOR_EUI64:
254                 if (v->scsi.code_set != PS_CODE_SET_BINARY)
255                         return false;
256
257                 if (v->scsi.designator_len != 8 &&
258                     v->scsi.designator_len != 10 &&
259                     v->scsi.designator_len != 16)
260                         return false;
261
262                 return true;
263         case PS_DESIGNATOR_NAA:
264                 if (v->scsi.code_set != PS_CODE_SET_BINARY)
265                         return false;
266
267                 if (v->scsi.designator_len != 8 &&
268                     v->scsi.designator_len != 16)
269                         return false;
270
271                 return true;
272         case PS_DESIGNATOR_T10:
273         case PS_DESIGNATOR_NAME:
274                 pr_err("pNFS: unsupported designator "
275                         "(code set %d, type %d, len %d.\n",
276                         v->scsi.code_set,
277                         v->scsi.designator_type,
278                         v->scsi.designator_len);
279                 return false;
280         default:
281                 pr_err("pNFS: invalid designator "
282                         "(code set %d, type %d, len %d.\n",
283                         v->scsi.code_set,
284                         v->scsi.designator_type,
285                         v->scsi.designator_len);
286                 return false;
287         }
288 }
289
290 /*
291  * Try to open the udev path for the WWN.  At least on Debian the udev
292  * by-id path will always point to the dm-multipath device if one exists.
293  */
294 static struct block_device *
295 bl_open_udev_path(struct pnfs_block_volume *v)
296 {
297         struct block_device *bdev;
298         const char *devname;
299
300         devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/wwn-0x%*phN",
301                                 v->scsi.designator_len, v->scsi.designator);
302         if (!devname)
303                 return ERR_PTR(-ENOMEM);
304
305         bdev = blkdev_get_by_path(devname, FMODE_READ | FMODE_WRITE, NULL);
306         if (IS_ERR(bdev)) {
307                 pr_warn("pNFS: failed to open device %s (%ld)\n",
308                         devname, PTR_ERR(bdev));
309         }
310
311         kfree(devname);
312         return bdev;
313 }
314
315 static int
316 bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
317                 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
318 {
319         struct pnfs_block_volume *v = &volumes[idx];
320         const struct pr_ops *ops;
321         int error;
322
323         if (!bl_validate_designator(v))
324                 return -EINVAL;
325
326         d->bdev = bl_open_udev_path(v);
327         if (IS_ERR(d->bdev))
328                 return PTR_ERR(d->bdev);
329
330         d->len = i_size_read(d->bdev->bd_inode);
331         d->map = bl_map_simple;
332         d->pr_key = v->scsi.pr_key;
333
334         pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
335                 d->bdev->bd_disk->disk_name, d->pr_key);
336
337         ops = d->bdev->bd_disk->fops->pr_ops;
338         if (!ops) {
339                 pr_err("pNFS: block device %s does not support reservations.",
340                                 d->bdev->bd_disk->disk_name);
341                 error = -EINVAL;
342                 goto out_blkdev_put;
343         }
344
345         error = ops->pr_register(d->bdev, 0, d->pr_key, true);
346         if (error) {
347                 pr_err("pNFS: failed to register key for block device %s.",
348                                 d->bdev->bd_disk->disk_name);
349                 goto out_blkdev_put;
350         }
351
352         d->pr_registered = true;
353         return 0;
354
355 out_blkdev_put:
356         blkdev_put(d->bdev, FMODE_READ | FMODE_WRITE);
357         return error;
358 }
359
360 static int
361 bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
362                 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
363 {
364         struct pnfs_block_volume *v = &volumes[idx];
365         int ret;
366
367         ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
368         if (ret)
369                 return ret;
370
371         d->disk_offset = v->slice.start;
372         d->len = v->slice.len;
373         return 0;
374 }
375
376 static int
377 bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
378                 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
379 {
380         struct pnfs_block_volume *v = &volumes[idx];
381         u64 len = 0;
382         int ret, i;
383
384         d->children = kcalloc(v->concat.volumes_count,
385                         sizeof(struct pnfs_block_dev), GFP_KERNEL);
386         if (!d->children)
387                 return -ENOMEM;
388
389         for (i = 0; i < v->concat.volumes_count; i++) {
390                 ret = bl_parse_deviceid(server, &d->children[i],
391                                 volumes, v->concat.volumes[i], gfp_mask);
392                 if (ret)
393                         return ret;
394
395                 d->nr_children++;
396                 d->children[i].start += len;
397                 len += d->children[i].len;
398         }
399
400         d->len = len;
401         d->map = bl_map_concat;
402         return 0;
403 }
404
405 static int
406 bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
407                 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
408 {
409         struct pnfs_block_volume *v = &volumes[idx];
410         u64 len = 0;
411         int ret, i;
412
413         d->children = kcalloc(v->stripe.volumes_count,
414                         sizeof(struct pnfs_block_dev), GFP_KERNEL);
415         if (!d->children)
416                 return -ENOMEM;
417
418         for (i = 0; i < v->stripe.volumes_count; i++) {
419                 ret = bl_parse_deviceid(server, &d->children[i],
420                                 volumes, v->stripe.volumes[i], gfp_mask);
421                 if (ret)
422                         return ret;
423
424                 d->nr_children++;
425                 len += d->children[i].len;
426         }
427
428         d->len = len;
429         d->chunk_size = v->stripe.chunk_size;
430         d->map = bl_map_stripe;
431         return 0;
432 }
433
434 static int
435 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
436                 struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
437 {
438         switch (volumes[idx].type) {
439         case PNFS_BLOCK_VOLUME_SIMPLE:
440                 return bl_parse_simple(server, d, volumes, idx, gfp_mask);
441         case PNFS_BLOCK_VOLUME_SLICE:
442                 return bl_parse_slice(server, d, volumes, idx, gfp_mask);
443         case PNFS_BLOCK_VOLUME_CONCAT:
444                 return bl_parse_concat(server, d, volumes, idx, gfp_mask);
445         case PNFS_BLOCK_VOLUME_STRIPE:
446                 return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
447         case PNFS_BLOCK_VOLUME_SCSI:
448                 return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
449         default:
450                 dprintk("unsupported volume type: %d\n", volumes[idx].type);
451                 return -EIO;
452         }
453 }
454
455 struct nfs4_deviceid_node *
456 bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
457                 gfp_t gfp_mask)
458 {
459         struct nfs4_deviceid_node *node = NULL;
460         struct pnfs_block_volume *volumes;
461         struct pnfs_block_dev *top;
462         struct xdr_stream xdr;
463         struct xdr_buf buf;
464         struct page *scratch;
465         int nr_volumes, ret, i;
466         __be32 *p;
467
468         scratch = alloc_page(gfp_mask);
469         if (!scratch)
470                 goto out;
471
472         xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
473         xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
474
475         p = xdr_inline_decode(&xdr, sizeof(__be32));
476         if (!p)
477                 goto out_free_scratch;
478         nr_volumes = be32_to_cpup(p++);
479
480         volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
481                           gfp_mask);
482         if (!volumes)
483                 goto out_free_scratch;
484
485         for (i = 0; i < nr_volumes; i++) {
486                 ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
487                 if (ret < 0)
488                         goto out_free_volumes;
489         }
490
491         top = kzalloc(sizeof(*top), gfp_mask);
492         if (!top)
493                 goto out_free_volumes;
494
495         ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
496         if (ret) {
497                 bl_free_device(top);
498                 kfree(top);
499                 goto out_free_volumes;
500         }
501
502         node = &top->node;
503         nfs4_init_deviceid_node(node, server, &pdev->dev_id);
504
505 out_free_volumes:
506         kfree(volumes);
507 out_free_scratch:
508         __free_page(scratch);
509 out:
510         return node;
511 }