From patchwork Fri Dec 3 19:56:03 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Jonthan Brassow X-Patchwork-Id: 379021 Received: from mx4-phx2.redhat.com (mx4-phx2.redhat.com [209.132.183.25]) by demeter1.kernel.org (8.14.4/8.14.3) with ESMTP id oB3Jvx0C008748 for ; Fri, 3 Dec 2010 19:58:21 GMT Received: from lists01.pubmisc.prod.ext.phx2.redhat.com (lists01.pubmisc.prod.ext.phx2.redhat.com [10.5.19.33]) by mx4-phx2.redhat.com (8.13.8/8.13.8) with ESMTP id oB3JuBPc016338; Fri, 3 Dec 2010 14:56:11 -0500 Received: from int-mx10.intmail.prod.int.phx2.redhat.com (int-mx10.intmail.prod.int.phx2.redhat.com [10.5.11.23]) by lists01.pubmisc.prod.ext.phx2.redhat.com (8.13.8/8.13.8) with ESMTP id oB3JuAk4001245 for ; Fri, 3 Dec 2010 14:56:10 -0500 Received: from hydrogen.msp.redhat.com (hydrogen.msp.redhat.com [10.15.80.1]) by int-mx10.intmail.prod.int.phx2.redhat.com (8.14.4/8.14.4) with ESMTP id oB3Ju4c3025836 (version=TLSv1/SSLv3 cipher=DHE-RSA-AES256-SHA bits=256 verify=NO) for ; Fri, 3 Dec 2010 14:56:04 -0500 Received: from hydrogen.msp.redhat.com ([127.0.0.1]) by hydrogen.msp.redhat.com (8.14.1/8.14.1) with ESMTP id oB3Ju39D003783; Fri, 3 Dec 2010 13:56:03 -0600 Received: (from jbrassow@localhost) by hydrogen.msp.redhat.com (8.14.1/8.14.1/Submit) id oB3Ju333003782; Fri, 3 Dec 2010 13:56:03 -0600 Date: Fri, 3 Dec 2010 13:56:03 -0600 From: Jonathan Brassow Message-Id: <201012031956.oB3Ju333003782@hydrogen.msp.redhat.com> To: dm-devel@redhat.com X-Scanned-By: MIMEDefang 2.68 on 10.5.11.23 X-loop: dm-devel@redhat.com Subject: [dm-devel] [PATCH 14 of 15] md new superblock type X-BeenThere: dm-devel@redhat.com X-Mailman-Version: 2.1.12 Precedence: junk Reply-To: device-mapper development List-Id: device-mapper development List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , MIME-Version: 1.0 Sender: dm-devel-bounces@redhat.com Errors-To: dm-devel-bounces@redhat.com X-Greylist: IP, 
sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter1.kernel.org [140.211.167.41]); Fri, 03 Dec 2010 19:58:36 +0000 (UTC)

Index: linux-2.6/drivers/md/md.c
===================================================================
--- linux-2.6.orig/drivers/md/md.c
+++ linux-2.6/drivers/md/md.c
@@ -1723,6 +1723,211 @@ super_1_rdev_size_change(mdk_rdev_t *rde
 	return num_sectors;
 }
 
+/*
+ * This structure is never used by userspace.  It is only ever
+ * used in these particular super block accessing functions.
+ * Therefore, we don't put it in any .h file.
+ *
+ * It makes sense to define a new magic number here.  This way,
+ * no userspace application will confuse the device as a device
+ * that is accessible through MD operations.  Devices with this
+ * superblock should only ever be accessed via device-mapper.
+ */
+#define MD_DM_SB_MAGIC 0x426E6F4A
+#define MD_DM_SB_MAX_DEVS 32	/* width of the failed_devices bitmap */
+struct mdp_superblock_2 {
+	__le32 magic;
+	__le32 flags;
+
+	__le64 events;
+	__le64 reshape_position;
+
+	__le32 num_devices;	/* Number of devs in RAID, Max = 32 */
+	__le32 failed_devices;	/* bitmap of devs used to indicate a failure */
+
+	__le32 reserved[120];	/* Round out the struct to 512 bytes */
+};
+
+/*
+ * Return the failed_devices bit for raid slot @raid_disk, or 0 if the
+ * slot is unassigned (negative) or does not fit in the 32-bit bitmap.
+ * The shift is done on an unsigned value so that slot 31 is well defined.
+ */
+static uint32_t super_2_dev_bit(int raid_disk)
+{
+	if (raid_disk < 0 || raid_disk >= MD_DM_SB_MAX_DEVS)
+		return 0;
+
+	return (uint32_t)1 << raid_disk;
+}
+
+/*
+ * super_2_sync
+ *
+ * Rebuild the in-memory superblock held in rdev->sb_page from the
+ * current state of mddev: magic, event count, number of devices and
+ * the bitmap of failed devices.  Failures already recorded in a valid
+ * superblock are preserved and the devices currently flagged Faulty
+ * are added to them.  Nothing is written to disk here.
+ */
+static void super_2_sync(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	mdk_rdev_t *r, *t;
+	uint32_t failed_devices = 0;
+	struct mdp_superblock_2 *sb;
+
+	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
+
+	/*
+	 * super_2_load() calls us to create a superblock when none was
+	 * found.  In that case the page holds whatever was on the device,
+	 * so only carry failed_devices over from a superblock of our own.
+	 */
+	if (sb->magic == cpu_to_le32(MD_DM_SB_MAGIC))
+		failed_devices = le32_to_cpu(sb->failed_devices);
+
+	rdev_for_each(r, t, mddev) {
+		if (r->raid_disk < 0 || !test_bit(Faulty, &r->flags))
+			continue;
+
+		printk(KERN_INFO " Dev #%d is faulty\n", r->raid_disk);
+		failed_devices |= super_2_dev_bit(r->raid_disk);
+	}
+
+	memset(sb, 0, sizeof(*sb));
+
+	sb->magic = cpu_to_le32(MD_DM_SB_MAGIC);
+	sb->events = cpu_to_le64(mddev->events);
+	sb->num_devices = cpu_to_le32(mddev->raid_disks);
+	sb->failed_devices = cpu_to_le32(failed_devices);
+}
+
+/*
+ * super_2_load
+ *
+ * This function creates a superblock if one is not found on the device
+ * and will indicate the more appropriate device whose superblock should
+ * be used - if given two.  @minor_version is not used.
+ *
+ * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
+ */
+static int super_2_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
+{
+	int r;
+	uint64_t ev1, ev2;
+	struct mdp_superblock_2 *sb;
+	struct mdp_superblock_2 *refsb;
+
+	if (sizeof(*sb) & (sizeof(*sb) - 1)) {
+		/* sizeof yields size_t, which is not unsigned long everywhere */
+		printk(KERN_ERR "Programmer error: Bad sized superblock (%zu)\n",
+		       sizeof(*sb));
+		return -EIO;
+	}
+
+	rdev->sb_start = 0;
+	rdev->sb_size = sizeof(*sb);
+	r = read_disk_sb(rdev, rdev->sb_size);
+	if (r)
+		return r;
+
+	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
+	if (sb->magic != cpu_to_le32(MD_DM_SB_MAGIC)) {
+		printk(KERN_INFO " Superblock not found: creating new\n");
+		super_2_sync(rdev->mddev, rdev);
+
+		/* Force new superblocks to disk */
+		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+
+		/* Any superblock is better than none, choose that if given */
+		return refdev ? 0 : 1;
+	}
+
+	ev1 = le64_to_cpu(sb->events);
+	if (!refdev) {
+		if (le32_to_cpu(sb->num_devices) != rdev->mddev->raid_disks) {
+			/*
+			 * User should clear device of old superblocks before
+			 * attempting to create something different.
+			 */
+			printk(KERN_ERR "Configuration incompatible with on-disk information\n");
+			return -EINVAL;
+		}
+		return 1;
+	}
+
+	refsb = (struct mdp_superblock_2 *)page_address(refdev->sb_page);
+	ev2 = le64_to_cpu(refsb->events);
+
+	if (ev1 != ev2)
+		printk(KERN_INFO "Comparing event counts [%llu %llu], choosing dev #%d\n",
+		       (unsigned long long)ev1, (unsigned long long)ev2,
+		       (ev1 > ev2) ? rdev->raid_disk : refdev->raid_disk);
+
+	return (ev1 > ev2) ? 1 : 0;
+}
+
+/*
+ * super_2_validate
+ *
+ * Apply the superblock held in rdev->sb_page to the array.  While
+ * mddev->events is still zero, the superblock seeds the event count
+ * and every device named in its failed_devices bitmap is flagged
+ * Faulty.  If rdev itself ends up flagged Faulty, the flag is cleared
+ * again, In_sync is cleared and recovery_offset is reset to 0.
+ *
+ * Return: always 0
+ */
+static int super_2_validate(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+	uint64_t ev1;
+	uint32_t failed_devices;
+	struct mdp_superblock_2 *sb;
+
+	sb = (struct mdp_superblock_2 *)page_address(rdev->sb_page);
+	ev1 = le64_to_cpu(sb->events);
+	failed_devices = le32_to_cpu(sb->failed_devices);
+
+	if (!mddev->events) {
+		mdk_rdev_t *r, *t;
+		struct mdp_superblock_2 *sb2;
+
+		mddev->events = ev1;
+		rdev_for_each(r, t, mddev) {
+			if (!r->sb_page)
+				continue;
+			sb2 = (struct mdp_superblock_2 *)
+				page_address(r->sb_page);
+			sb2->failed_devices = 0;
+
+			if (failed_devices & super_2_dev_bit(r->raid_disk))
+				set_bit(Faulty, &r->flags);
+		}
+	}
+
+	rdev->mddev->bitmap_info.offset = 0; /* disable bitmap creation */
+	rdev->mddev->bitmap_info.default_offset = 1024 >> 9;
+
+	/*
+	 * If the device was marked as failed when the array
+	 * was previously active, we must mark the device as
+	 * not In_sync
+	 */
+	if (test_bit(Faulty, &rdev->flags)) {
+		printk(KERN_INFO " Dev #%d marked as failed, clearing In_sync\n",
+		       rdev->raid_disk);
+		clear_bit(Faulty, &rdev->flags);
+		clear_bit(In_sync, &rdev->flags);
+		rdev->recovery_offset = 0;
+	}
+
+	/* FIXME: Pull these debug statements */
+	if (test_bit(In_sync, &rdev->flags))
+		printk(KERN_INFO " In_sync flag set\n");
+
+	return 0;
+}
+
 static struct super_type super_types[] = {
 	[0] = {
 		.name	= "0.90.0",
@@ -1740,6 +1945,15 @@ static struct super_type super_types[] =
 		.sync_super	    = super_1_sync,
 		.rdev_size_change   = super_1_rdev_size_change,
 	},
+	[2] = {
+		.name	= "dm",
+		.owner	= THIS_MODULE,
+		.load_super	    = super_2_load,
+		.validate_super	    = super_2_validate,
+		.sync_super	    = super_2_sync,
+		/* NOTE(review): v1 handler reused; confirm it suits this layout */
+		.rdev_size_change   = super_1_rdev_size_change,
+	},
 };
 
 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
@@ -4408,6 +4622,27 @@ static void md_safemode_timeout(unsigned
 	md_wakeup_thread(mddev->thread);
 }
 
+/*
+ * Tell md_run() whether the on-disk superblocks need to be analyzed:
+ * either mddev->raid_disks is still zero, or at least one member
+ * device has meta_bdev set.
+ *
+ * Return: 1 if superblocks should be read, 0 otherwise
+ */
+static int should_read_super(mddev_t *mddev)
+{
+	mdk_rdev_t *rdev, *tmp;
+
+	if (!mddev->raid_disks)
+		return 1;
+
+	rdev_for_each(rdev, tmp, mddev)
+		if (rdev->meta_bdev)
+			return 1;
+
+	return 0;
+}
+
 static int start_dirty_degraded;
 
 int md_run(mddev_t *mddev)
@@ -4429,7 +4664,7 @@ int md_run(mddev_t *mddev)
 	/*
 	 * Analyze all RAID superblock(s)
 	 */
-	if (!mddev->raid_disks) {
+	if (should_read_super(mddev)) {
 		if (!mddev->persistent)
 			return -EINVAL;
 		analyze_sbs(mddev);