attempt_restore_of_faulty_devices() is limited to 64 when it should support
the new maximum of 253 when identifying any failed devices. It clears any
revivable devices via an MD personality hot remove and add cylce to allow
for their recovery.
Address by using existing functions to retrieve and update all failed
devices' bitfield members in the dm raid superblocks on all RAID devices
and check for any devices to clear in it.
Whilst on it, don't call attempt_restore_of_faulty_devices() for any MD
personality not providing disk hot add/remove methods (i.e. raid0 now),
because such personalities don't support reviving of failed disks.
Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
static void attempt_restore_of_faulty_devices(struct raid_set *rs)
{
int i;
static void attempt_restore_of_faulty_devices(struct raid_set *rs)
{
int i;
- uint64_t failed_devices, cleared_failed_devices = 0;
+ uint64_t cleared_failed_devices[DISKS_ARRAY_ELEMS];
struct dm_raid_superblock *sb;
struct dm_raid_superblock *sb;
+ struct mddev *mddev = &rs->md;
+ /* RAID personalities have to provide hot add/remove methods or we need to bail out. */
+ if (!mddev->pers || !mddev->pers->hot_add_disk || !mddev->pers->hot_remove_disk)
+ return;
+
+ memset(cleared_failed_devices, 0, sizeof(cleared_failed_devices));
+
for (i = 0; i < rs->md.raid_disks; i++) {
r = &rs->dev[i].rdev;
if (test_bit(Faulty, &r->flags) && r->sb_page &&
for (i = 0; i < rs->md.raid_disks; i++) {
r = &rs->dev[i].rdev;
if (test_bit(Faulty, &r->flags) && r->sb_page &&
* ourselves.
*/
if ((r->raid_disk >= 0) &&
* ourselves.
*/
if ((r->raid_disk >= 0) &&
- (r->mddev->pers->hot_remove_disk(r->mddev, r) != 0))
+ (mddev->pers->hot_remove_disk(mddev, r) != 0))
/* Failed to revive this device, try next */
continue;
/* Failed to revive this device, try next */
continue;
clear_bit(Faulty, &r->flags);
clear_bit(WriteErrorSeen, &r->flags);
clear_bit(In_sync, &r->flags);
clear_bit(Faulty, &r->flags);
clear_bit(WriteErrorSeen, &r->flags);
clear_bit(In_sync, &r->flags);
- if (r->mddev->pers->hot_add_disk(r->mddev, r)) {
+ if (mddev->pers->hot_add_disk(mddev, r)) {
r->raid_disk = -1;
r->saved_raid_disk = -1;
r->flags = flags;
} else {
r->recovery_offset = 0;
r->raid_disk = -1;
r->saved_raid_disk = -1;
r->flags = flags;
} else {
r->recovery_offset = 0;
- cleared_failed_devices |= 1 << i;
+ set_bit(i, (void *) cleared_failed_devices);
+ cleared = true;
- if (cleared_failed_devices) {
+
+ /* If any failed devices could be cleared, update all sbs failed_devices bits */
+ if (cleared) {
+ uint64_t failed_devices[DISKS_ARRAY_ELEMS];
+
rdev_for_each(r, &rs->md) {
sb = page_address(r->sb_page);
rdev_for_each(r, &rs->md) {
sb = page_address(r->sb_page);
- failed_devices = le64_to_cpu(sb->failed_devices);
- failed_devices &= ~cleared_failed_devices;
- sb->failed_devices = cpu_to_le64(failed_devices);
+ sb_retrieve_failed_devices(sb, failed_devices);
+
+ for (i = 0; i < DISKS_ARRAY_ELEMS; i++)
+ failed_devices[i] &= ~cleared_failed_devices[i];
+
+ sb_update_failed_devices(sb, failed_devices);