Message ID | 20241231030929.246059-1-tomas.mudrunka@gmail.com
---|---
State | New
Series | Export MDRAID bitmap on disk structure in UAPI header file
Hi,

On 2024/12/31 11:09, Tomas Mudrunka wrote:
> When working on software that manages MD RAID disks from userspace,
> the currently provided headers only contain the MD superblock.
> That is not enough to fully populate MD RAID metadata.
> Therefore this patch adds the bitmap superblock as well.

Thanks for the patch. However, why do you want to directly manipulate the
metadata instead of using mdadm? You must first provide an explanation to
convince us that what you're doing makes sense, and it's best to show your
work.

Thanks,
Kuai

> Signed-off-by: Tomas Mudrunka <tomas.mudrunka@gmail.com>
> ---
>  drivers/md/md-bitmap.h         | 42 +-------------------------------
>  include/uapi/linux/raid/md_p.h | 44 +++++++++++++++++++++++++++++++++-
>  2 files changed, 44 insertions(+), 42 deletions(-)
>
> [...]
On Tue, 31 Dec 2024 11:47:23 +0800 Yu Kuai <yukuai1@huaweicloud.com> wrote:

> Hi,
>
> On 2024/12/31 11:09, Tomas Mudrunka wrote:
> > When working on software that manages MD RAID disks from userspace,
> > the currently provided headers only contain the MD superblock.
> > That is not enough to fully populate MD RAID metadata.
> > Therefore this patch adds the bitmap superblock as well.
>
> Thanks for the patch. However, why do you want to directly manipulate
> the metadata instead of using mdadm? You must first provide an
> explanation to convince us that what you're doing makes sense, and
> it's best to show your work.
>
> Thanks,
> Kuai

I'm with Kuai here. I would also add that for such purposes you can use
externally managed metadata, not native. External management was proposed
to address your problem, however over the years it turned out not to be a
good concept (the kernel driver relies on a userspace daemon, which is not
secure).

Thanks,
Mariusz
> > Thanks for the patch. However, why do you want to directly manipulate
> > the metadata instead of using mdadm? You must first provide an
> > explanation to convince us that what you're doing makes sense, and
> > it's best to show your work.

I am adding MD RAID support to the genimage tool:
https://github.com/pengutronix/genimage/

It is used to generate firmware/disk images. Without such a tool it is
impossible to build a disk image containing md raid metadata without
actually assembling it in the kernel via losetup or something...

I am already using #include <linux/raid/md_p.h> which includes references
to the bitmap structures:

$ grep -ri bitmap /usr/include/linux/raid/md_p.h
#define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */
__le32 feature_map; /* bit 0 set if 'bitmap_offset' is meaningful */
__le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
                       * NOTE: signed, so bitmap can be before superblock
#define MD_FEATURE_BITMAP_OFFSET 1
#define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
                                        * is guided by bitmap.
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
                       |MD_FEATURE_RECOVERY_BITMAP \

But when I use those, the resulting metadata is invalid unless I populate
the structures from drivers/md/md-bitmap.h, so I had to copy-paste its
contents to my code, but I am not happy about it (including half and
copy-pasting half):
https://github.com/Harvie/genimage/blob/master/image-mdraid.c

> I'm with Kuai here. I would also add that for such purposes you can use
> externally managed metadata, not native. External management was proposed
> to address your problem, however over the years it turned out not to be a
> good concept (the kernel driver relies on a userspace daemon, which is not
> secure).
>
> Thanks,
> Mariusz

Hope my reply is sufficient.

Thank you guys!
Tom
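For context, a minimal sketch of the seeding step described above, assuming
bitmap_super_t and BITMAP_MAGIC become available from <linux/raid/md_p.h> as
this patch proposes (today they must be copied from drivers/md/md-bitmap.h).
The superblock location, chunk size and version below are illustrative
assumptions, not genimage's actual code:

/*
 * Seed a bitmap superblock into a raw member image so the internal bitmap
 * referenced by the md superblock is considered valid. Assumes v1.2
 * metadata (md superblock 4 KiB into the member) and that bitmap_offset
 * was already chosen when the md superblock was written.
 */
#include <endian.h>
#include <stdio.h>
#include <string.h>
#include <linux/raid/md_p.h>

static int seed_bitmap_sb(FILE *img, long long bitmap_offset_sectors,
			  const __u8 uuid[16], __u64 sync_size_sectors)
{
	const long long sb_start = 4096;	/* assumed v1.2 superblock location, bytes */
	bitmap_super_t bsb;

	memset(&bsb, 0, sizeof(bsb));		/* pad[] must be zero */
	bsb.magic     = htole32(BITMAP_MAGIC);
	bsb.version   = htole32(4);		/* example bitmap "major" version */
	memcpy(bsb.uuid, uuid, sizeof(bsb.uuid));	/* must match the md device uuid */
	bsb.sync_size = htole64(sync_size_sectors);	/* resync range, in sectors */
	bsb.chunksize = htole32(64 * 1024 * 1024);	/* example: 64 MiB bitmap chunks */
	bsb.daemon_sleep = htole32(5);		/* seconds between disk flushes */

	/* bitmap_offset is in 512-byte sectors relative to the superblock start */
	if (fseeko(img, sb_start + bitmap_offset_sectors * 512, SEEK_SET) != 0)
		return -1;
	return fwrite(&bsb, sizeof(bsb), 1, img) == 1 ? 0 : -1;
}

The bitmap payload itself can stay zeroed; as discussed further down the
thread, the kernel populates it when the array is first assembled.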
Sorry, misclick - adding linux-raid now. I think we don't need to spam
linux-kernel as these are MD internals.

Kuai, please take a look again.

Thanks,
Mariusz

On Tue, 31 Dec 2024 12:31:08 +0100
Mariusz Tkaczyk <mtkaczyk@kernel.org> wrote:

> On Tue, 31 Dec 2024 12:00:31 +0100
> Tomáš Mudruňka <tomas.mudrunka@gmail.com> wrote:
>
> > > > Thanks for the patch. However, why do you want to directly
> > > > manipulate the metadata instead of using mdadm? You must first
> > > > provide an explanation to convince us that what you're doing
> > > > makes sense, and it's best to show your work.
> >
> > I am adding MD RAID support to the genimage tool:
> > https://github.com/pengutronix/genimage/
> >
> > It is used to generate firmware/disk images. Without such a tool it is
> > impossible to build a disk image containing md raid metadata without
> > actually assembling it in the kernel via losetup or something...
> >
> > I am already using #include <linux/raid/md_p.h> which includes references
> > to the bitmap structures:
> >
> > $ grep -ri bitmap /usr/include/linux/raid/md_p.h
> > #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */
> > __le32 feature_map; /* bit 0 set if 'bitmap_offset' is meaningful */
> > __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
> >                        * NOTE: signed, so bitmap can be before superblock
> > #define MD_FEATURE_BITMAP_OFFSET 1
> > #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
> >                                         * is guided by bitmap.
> > #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
> >                        |MD_FEATURE_RECOVERY_BITMAP \
> >
> > But when I use those, the resulting metadata is invalid unless I populate
> > the structures from drivers/md/md-bitmap.h, so I had to copy-paste its
> > contents to my code, but I am not happy about it (including half and
> > copy-pasting half):
>
> https://github.com/md-raid-utilities/mdadm/blob/main/bitmap.h
>
> Correct me if I'm wrong, but it looks like it is what we did in mdadm.
> Well, if you don't want to care about it, you can consider adding
> mdadm as a submodule in your application and use mdadm's headers.
>
> Just an option, I have no hard feelings here.
>
> Looking into that now makes me feel even more that we should have
> exported this header a long time ago instead of reimplementing it in
> mdadm. Kuai, what do you think?
>
> > https://github.com/Harvie/genimage/blob/master/image-mdraid.c
> >
> > > I'm with Kuai here. I would also add that for such purposes you can use
> > > externally managed metadata, not native. External management was proposed
> > > to address your problem, however over the years it turned out not to be a
> > > good concept (the kernel driver relies on a userspace daemon, which is not
> > > secure).
> > >
> > > Thanks,
> > > Mariusz
> >
> > Hope my reply is sufficient.
> >
> > Thank you guys!
> > Tom
>
> Looks like an old problem we got used to. If Kuai agrees too, I'm open
> to adding this, but... as the mdadm maintainer (the primary tool to
> manipulate mdraid) I would like you to handle this on the mdadm side
> too, to make sure we have it consistent and we export exactly what is
> needed.
>
> Hope it makes some sense now!
> Thanks,
> Mariusz
Hi,

On 2024/12/31 22:23, Mariusz Tkaczyk wrote:
> Sorry, misclick - adding linux-raid now. I think we don't need to spam
> linux-kernel as these are MD internals.
>
> Kuai, please take a look again.
>
> Thanks,
> Mariusz
>
> On Tue, 31 Dec 2024 12:31:08 +0100
> Mariusz Tkaczyk <mtkaczyk@kernel.org> wrote:
>
>> On Tue, 31 Dec 2024 12:00:31 +0100
>> Tomáš Mudruňka <tomas.mudrunka@gmail.com> wrote:
>>
>>>>> Thanks for the patch. However, why do you want to directly
>>>>> manipulate the metadata instead of using mdadm? You must first
>>>>> provide an explanation to convince us that what you're doing
>>>>> makes sense, and it's best to show your work.
>>>
>>> I am adding MD RAID support to the genimage tool:
>>> https://github.com/pengutronix/genimage/
>>>
>>> It is used to generate firmware/disk images. Without such a tool it is
>>> impossible to build a disk image containing md raid metadata without
>>> actually assembling it in the kernel via losetup or something...
>>>
>>> I am already using #include <linux/raid/md_p.h> which includes references
>>> to the bitmap structures:
>>>
>>> $ grep -ri bitmap /usr/include/linux/raid/md_p.h
>>> #define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */
>>> __le32 feature_map; /* bit 0 set if 'bitmap_offset' is meaningful */
>>> __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts
>>>                        * NOTE: signed, so bitmap can be before superblock
>>> #define MD_FEATURE_BITMAP_OFFSET 1
>>> #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
>>>                                         * is guided by bitmap.
>>> #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
>>>                        |MD_FEATURE_RECOVERY_BITMAP \
>>>
>>> But when I use those, the resulting metadata is invalid unless I populate
>>> the structures from drivers/md/md-bitmap.h, so I had to copy-paste its
>>> contents to my code, but I am not happy about it (including half and
>>> copy-pasting half):

Just curious, what do you guys do for filesystems like ext4/xfs? They
just define the same structures in user-space tools.

It looks like your tool does support creating ext4 images, and it's using
ext4's user-space tools directly. If that's true, did you consider using
mdadm directly?

Thanks,
Kuai

>> [...]
> Just curious, what do you guys do for filesystems like ext4/xfs? They
> just define the same structures in user-space tools.
>
> It looks like your tool does support creating ext4 images, and it's using
> ext4's user-space tools directly. If that's true, did you consider using
> mdadm directly?
>
> Thanks,
> Kuai

Yes, we do use external tools when possible. It is not possible with mdadm.
Mdadm cannot create a disk image of an MD RAID array. The kernel does this.
We want/need a purely userspace generator, so we don't have to care about
permissions, losetup, the kernel-side mdraid runtime, etc... We just want
to generate a valid image without involving the kernel in any way.
I was using mdadm before switching to genimage and it adds the complexity
of handling all the edge cases of kernel states.
Mkfs.ext4 can create an image without involving the kernel; mdadm cannot,
it always instructs the kernel to create the metadata for it when creating
an array.

In my opinion we should decide whether it makes sense for the kernel to
export the structures in a header file and either provide all of them, or
provide none. That might be valid reasoning to say every userspace program
should include its own definitions of the structures. But providing half
does not make any sense.

I wonder if mdadm needs some of the definitions that I have omitted from
the UAPI.

Tom
Hi,

On 2025/01/02 19:48, Tomas Mudrunka wrote:
>> Just curious, what do you guys do for filesystems like ext4/xfs? They
>> just define the same structures in user-space tools.
>>
>> It looks like your tool does support creating ext4 images, and it's using
>> ext4's user-space tools directly. If that's true, did you consider using
>> mdadm directly?
>>
>> Thanks,
>> Kuai
>
> Yes, we do use external tools when possible. It is not possible with mdadm.
> Mdadm cannot create a disk image of an MD RAID array. The kernel does this.

I'm a bit confused here. If you mean metadata, I think it's mdadm that
writes the initial metadata to disk; the only exception is dm-raid.

> We want/need a purely userspace generator, so we don't have to care about
> permissions, losetup, the kernel-side mdraid runtime, etc... We just want
> to generate a valid image without involving the kernel in any way.

I believe mdadm can do this. Mtkaczyk, what do you think?

The problem is that the system service will recognize raid disks and
assemble the array automatically; you might want to disable them.

Thanks,
Kuai

> I was using mdadm before switching to genimage and it adds the complexity
> of handling all the edge cases of kernel states.
> Mkfs.ext4 can create an image without involving the kernel; mdadm cannot,
> it always instructs the kernel to create the metadata for it when creating
> an array.
>
> In my opinion we should decide whether it makes sense for the kernel to
> export the structures in a header file and either provide all of them, or
> provide none. That might be valid reasoning to say every userspace program
> should include its own definitions of the structures. But providing half
> does not make any sense.
>
> I wonder if mdadm needs some of the definitions that I have omitted from
> the UAPI.
>
> Tom
On Fri, 3 Jan 2025 09:14:30 +0800 Yu Kuai <yukuai1@huaweicloud.com> wrote:

> Hi,
>
> On 2025/01/02 19:48, Tomas Mudrunka wrote:
> >> Just curious, what do you guys do for filesystems like ext4/xfs? They
> >> just define the same structures in user-space tools.
> >>
> >> It looks like your tool does support creating ext4 images, and it's
> >> using ext4's user-space tools directly. If that's true, did you
> >> consider using mdadm directly?
> >>
> >> Thanks,
> >> Kuai
> >
> > Yes, we do use external tools when possible. It is not possible with
> > mdadm. Mdadm cannot create a disk image of an MD RAID array. The
> > kernel does this.
>
> I'm a bit confused here. If you mean metadata, I think it's mdadm that
> writes the initial metadata to disk; the only exception is dm-raid.

I think it means that currently you have to create a kernel (MD) raid
device (assemble it using the metadata) to have a chance of creating an
image.

> > We want/need a purely userspace generator, so we don't have to care
> > about permissions, losetup, the kernel-side mdraid runtime, etc...
> > We just want to generate a valid image without involving the kernel
> > in any way.
>
> I believe mdadm can do this. Mtkaczyk, what do you think?

I agree. The right way is to incorporate it with mdadm. We should create
a volume image (data) without MD internals. With that, we will have
control over this functionality. Also, we will be able to provide support
for every metadata format.

> The problem is that the system service will recognize raid disks and
> assemble the array automatically; you might want to disable them.

I don't think we need to care. The goal is to not have and use the MD
module, so mdadm will fail to load personalities.

> Thanks,
> Kuai
>
> > I was using mdadm before switching to genimage and it adds the
> > complexity of handling all the edge cases of kernel states.
> > Mkfs.ext4 can create an image without involving the kernel; mdadm
> > cannot, it always instructs the kernel to create the metadata for it
> > when creating an array.
> >
> > In my opinion we should decide whether it makes sense for the kernel
> > to export the structures in a header file and either provide all of
> > them, or provide none. That might be valid reasoning to say every
> > userspace program should include its own definitions of the
> > structures. But providing half does not make any sense.

Sorry, this is an old application and some solutions have been here for
years - they are working, so nobody tried to change them. If you are
looking for challenges, this software is full of them!

Thanks,
Mariusz
> The problem is that the system service will recognize raid disks and
> assemble the array automatically; you might want to disable them.

Actually the user is forced to work with an MD device from the get-go.
This is how you would typically use mdadm to write metadata to disk:

$ truncate -s 1G test.img
$ mdadm --create /dev/md0 --level=1 --bitmap=internal --raid-devices=2 test.img missing
mdadm: must be super-user to perform this action
mdadm: test.img is not a block device.

The following is unfit for my use case:
* It requires me to reference /dev/md0 (I don't want to involve the kernel at all)
* It requires super-user (no need, I just want to write bytes to my own file)
* It refuses to work on a regular file (once I run it as super-user)

> I don't think we need to care. The goal is to not have and use the MD
> module, so mdadm will fail to load personalities.

No, it is not the goal. The goal is not to rely on the kernel. It has to
work on any kernel, including the ones that have the MD module loaded.
Possibly even on a non-Linux OS.

> I agree. The right way is to incorporate it with mdadm.
> We should create a volume image (data) without MD internals.

In that case I would still need headers with structs to parse the metadata
and get the offsets where to load the actual data (filesystem) into the
array images.

But to be honest, I am pretty happy with how the genimage code works now,
I don't need any help with its functionality. I don't even need those
headers to be fixed. I can leave them copy-pasted. But I think it would be
the right thing to consolidate them, therefore I've proposed the patch.

I just didn't want to hardcode definitions that are already in the kernel,
because I don't like duplicated code that can be included from somewhere
else.

> If you are looking for challenges, this software is full of them!

Haha. I feel you. Maybe let's tackle them step by step.
Consolidating the headers to provide the complete on-disk format seems
like a good start to me. Especially if mdadm could benefit from that as
well.

Tom
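For context on the "get the offsets" point above, a minimal sketch of reading
the data offset back out of a member image, using only structures that are
already exported in <linux/raid/md_p.h>. The v1.2 superblock location (4 KiB
into the member image) is an assumption for illustration, not something
stated in the thread:

/*
 * Return the byte offset of the data area inside a member image, or -1.
 * Assumes v1.2 metadata, i.e. the md superblock 4 KiB into the member.
 */
#include <endian.h>
#include <stdio.h>
#include <linux/raid/md_p.h>

static long long data_offset_bytes(FILE *img)
{
	struct mdp_superblock_1 sb;

	if (fseeko(img, 4096, SEEK_SET) != 0 ||		/* assumed v1.2 superblock location */
	    fread(&sb, sizeof(sb), 1, img) != 1)
		return -1;
	if (le32toh(sb.magic) != MD_SB_MAGIC)		/* not an md member */
		return -1;
	return (long long)le64toh(sb.data_offset) * 512;	/* sectors -> bytes */
}

An image generator could then copy the filesystem image into each member at
that offset.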
On Tue, Dec 31, 2024 at 04:09:27AM +0100, Tomas Mudrunka wrote:
> When working on software that manages MD RAID disks from userspace,
> the currently provided headers only contain the MD superblock.
> That is not enough to fully populate MD RAID metadata.
> Therefore this patch adds the bitmap superblock as well.

The bitmap format is not a userspace ABI, it is an on-disk format.
As such it does not belong in the uapi. It might make sense to
create a clean standalone header just for the on-disk format that
you could copy, though.
> The bitmap format is not a userspace ABI, it is an on-disk format.
> As such it does not belong in the uapi. It might make sense to
> create a clean standalone header just for the on-disk format that
> you could copy, though.

If you inspect the header in question, you'll find that this is the exact
reason why this header exists: to describe the "physical layout" of
MD RAID devices, which is just a fancy way to say "on-disk format".

$ head /usr/include/linux/raid/md_p.h
/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
/*
   md_p.h : physical layout of Linux RAID devices
          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman

On Mon, 6 Jan 2025 at 16:25, Christoph Hellwig <hch@infradead.org> wrote:
>
> On Tue, Dec 31, 2024 at 04:09:27AM +0100, Tomas Mudrunka wrote:
> > When working on software that manages MD RAID disks from userspace,
> > the currently provided headers only contain the MD superblock.
> > That is not enough to fully populate MD RAID metadata.
> > Therefore this patch adds the bitmap superblock as well.
>
> The bitmap format is not a userspace ABI, it is an on-disk format.
> As such it does not belong in the uapi. It might make sense to
> create a clean standalone header just for the on-disk format that
> you could copy, though.
On Mon, Jan 06, 2025 at 04:40:52PM +0100, Tomáš Mudruňka wrote:
> > The bitmap format is not a userspace ABI, it is an on-disk format.
> > As such it does not belong in the uapi. It might make sense to
> > create a clean standalone header just for the on-disk format that
> > you could copy, though.
>
> If you inspect the header in question, you'll find that this is the exact
> reason why this header exists: to describe the "physical layout" of
> MD RAID devices, which is just a fancy way to say "on-disk format".

Well, then MD already gets it wrong. No reason to do more of the same.
On Fri, 3 Jan 2025 12:54:22 +0100
Tomas Mudrunka <tomas.mudrunka@gmail.com> wrote:

> > The problem is that the system service will recognize raid disks and
> > assemble the array automatically; you might want to disable them.
>
> Actually the user is forced to work with an MD device from the get-go.
> This is how you would typically use mdadm to write metadata to disk:
>
> $ truncate -s 1G test.img
> $ mdadm --create /dev/md0 --level=1 --bitmap=internal --raid-devices=2 test.img missing
> mdadm: must be super-user to perform this action
> mdadm: test.img is not a block device.

In this case it should be something like --create --no-start (you don't
want to start the raid volume, therefore you will not need the MD stuff).

The --raid-devices option requires disks. mdadm cannot guess them, you
have to pass them, typically: --raid-devices=2 /dev/sda /dev/sdb

In your case you would need new options like: --write-image=<file>

> The following is unfit for my use case:
> * It requires me to reference /dev/md0 (I don't want to involve the kernel at all)

If we start supporting just writing metadata to the chosen members, then
the /dev/md0 reference will be gone.

> * It requires super-user (no need, I just want to write bytes to my own file)

What do you want to write to this file? I thought that you want to write
the content of the file to the member disks according to the raid layout?
Am I wrong here? Wouldn't genimage generate the raid array using a
pre-prepared image file?

> * It refuses to work on a regular file (once I run it as super-user)

mdadm requires super-user for all actions. We can challenge that in
reasonable cases. Patches are welcome.

> > I don't think we need to care. The goal is to not have and use the MD
> > module, so mdadm will fail to load personalities.
>
> No, it is not the goal. The goal is not to rely on the kernel. It has to
> work on any kernel, including the ones that have the MD module loaded.
> Possibly even on a non-Linux OS.

If mdadm is not available the case is simple - no possibility to activate
MD arrays, unless v0.9 autostart, but that is not the case here.
If the MD module is not available then mdadm will not try to start arrays
(missing personality, no possibility), but if MD is there and genimage (or
mdadm) closes the descriptors after writing metadata to disks, the raid
volume may be assembled automatically (processing of the change event).
Just FYI, Kuai was concerned that the volume may appear even if you don't
want it. Theoretically it could happen.

> > I agree. The right way is to incorporate it with mdadm.
> > We should create a volume image (data) without MD internals.
>
> In that case I would still need headers with structs to parse the metadata
> and get the offsets where to load the actual data (filesystem) into the
> array images.

Yes.

> But to be honest, I am pretty happy with how the genimage code works now,
> I don't need any help with its functionality. I don't even need those
> headers to be fixed. I can leave them copy-pasted. But I think it would be
> the right thing to consolidate them, therefore I've proposed the patch.

It is fine. I mean, it would be perfect to implement something like that
for mdadm because other people may have a chance to use it, but it is up
to you. If this implementation satisfies your need then you are good to
go. I will not handle this feature myself in mdadm and I don't see anyone
else interested in having this that I would ask to support it.

> I just didn't want to hardcode definitions that are already in the kernel,
> because I don't like duplicated code that can be included from somewhere
> else.

For me it is the right change. As I said, please take care to fix mdadm,
because people look to mdadm as a source of truth.

> > If you are looking for challenges, this software is full of them!
>
> Haha. I feel you. Maybe let's tackle them step by step.
> Consolidating the headers to provide the complete on-disk format seems
> like a good start to me. Especially if mdadm could benefit from that as
> well.

You have my ack!

FYI, you can develop mdadm patches through github:
https://github.com/md-raid-utilities/mdadm

Thanks,
Mariusz
Hi Tomas,

On Fri, Jan 3, 2025 at 3:54 AM Tomas Mudrunka <tomas.mudrunka@gmail.com> wrote:
>
> > The problem is that the system service will recognize raid disks and
> > assemble the array automatically; you might want to disable them.
>
> Actually the user is forced to work with an MD device from the get-go.
> This is how you would typically use mdadm to write metadata to disk:
>
> $ truncate -s 1G test.img
> $ mdadm --create /dev/md0 --level=1 --bitmap=internal --raid-devices=2 test.img missing
> mdadm: must be super-user to perform this action
> mdadm: test.img is not a block device.
>
> The following is unfit for my use case:
> * It requires me to reference /dev/md0 (I don't want to involve the kernel at all)
> * It requires super-user (no need, I just want to write bytes to my own file)
> * It refuses to work on a regular file (once I run it as super-user)

I think I understand the use case now. One question though: Do we really
need to write the bitmap data at "mdadm --create" time? Can we instead
wait until the array is assembled by the kernel?

Thanks,
Song

[...]
> I think I understand the use case now. One question though: Do we really
> need to write the bitmap data at "mdadm --create" time? Can we instead
> wait until the array is assembled by the kernel?
>
> Thanks,
> Song

Thanks for the reply. This is kinda what happens already. Most of the
bitmap is populated with valid data the first time the kernel assembles
the array. BUT unless there is at least some basic structure present in
the bitmap superblock (magic and geometry), the kernel will completely
ignore the reference to the bitmap in the md superblock. So does mdadm
when examining the array.

There seems to be some sanity check and I honestly think it's not a bad
thing. It just prevents the kernel from overwriting data at some random
offset if the bitmap offset field ever gets corrupted for some reason...

Tom
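As a rough illustration of the "magic and geometry" check described above,
here is a userspace approximation, assuming the definitions are exported as
this patch proposes. It is not a copy of the kernel's actual validation
logic:

/*
 * Approximate the kind of sanity check applied before trusting
 * bitmap_offset from the md superblock. Illustrative only.
 */
#include <endian.h>
#include <linux/raid/md_p.h>

static int bitmap_sb_looks_sane(const bitmap_super_t *bsb)
{
	__u32 chunk = le32toh(bsb->chunksize);

	if (le32toh(bsb->magic) != BITMAP_MAGIC)
		return 0;		/* not a bitmap superblock at all */
	if (chunk == 0 || (chunk & (chunk - 1)))
		return 0;		/* chunk size must be a non-zero power of two */
	if (le64toh(bsb->sync_size) == 0)
		return 0;		/* no sync range described */
	return 1;
}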
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 662e6fc14..6050d422b 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -7,7 +7,7 @@
 #ifndef BITMAP_H
 #define BITMAP_H 1
 
-#define BITMAP_MAGIC 0x6d746962
+#include <linux/raid/md_p.h>
 
 typedef __u16 bitmap_counter_t;
 #define COUNTER_BITS 16
@@ -18,46 +18,6 @@ typedef __u16 bitmap_counter_t;
 #define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
 #define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
 
-/* use these for bitmap->flags and bitmap->sb->state bit-fields */
-enum bitmap_state {
-	BITMAP_STALE = 1,  /* the bitmap file is out of date or had -EIO */
-	BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
-	BITMAP_HOSTENDIAN =15,
-};
-
-/* the superblock at the front of the bitmap file -- little endian */
-typedef struct bitmap_super_s {
-	__le32 magic;        /*  0  BITMAP_MAGIC */
-	__le32 version;      /*  4  the bitmap major for now, could change... */
-	__u8  uuid[16];      /*  8  128 bit uuid - must match md device uuid */
-	__le64 events;       /* 24  event counter for the bitmap (1)*/
-	__le64 events_cleared;/*32  event counter when last bit cleared (2) */
-	__le64 sync_size;    /* 40  the size of the md device's sync range(3) */
-	__le32 state;        /* 48  bitmap state information */
-	__le32 chunksize;    /* 52  the bitmap chunk size in bytes */
-	__le32 daemon_sleep; /* 56  seconds between disk flushes */
-	__le32 write_behind; /* 60  number of outstanding write-behind writes */
-	__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
-				  * reserved for the bitmap. */
-	__le32 nodes;        /* 68 the maximum number of nodes in cluster. */
-	__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
-	__u8  pad[256 - 136]; /* set to zero */
-} bitmap_super_t;
-
-/* notes:
- * (1) This event counter is updated before the eventcounter in the md superblock
- *     When a bitmap is loaded, it is only accepted if this event counter is equal
- *     to, or one greater than, the event counter in the superblock.
- * (2) This event counter is updated when the other one is *if*and*only*if* the
- *     array is not degraded.  As bits are not cleared when the array is degraded,
- *     this represents the last time that any bits were cleared.
- *     If a device is being added that has an event count with this value or
- *     higher, it is accepted as conforming to the bitmap.
- * (3)This is the number of sectors represented by the bitmap, and is the range that
- *     resync happens across.  For raid1 and raid5/6 it is the size of individual
- *     devices.  For raid10 it is the size of the array.
- */
-
 struct md_bitmap_stats {
 	u64	events_cleared;
 	int	behind_writes;
diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h
index 5a43c23f5..8131e7713 100644
--- a/include/uapi/linux/raid/md_p.h
+++ b/include/uapi/linux/raid/md_p.h
@@ -1,7 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
 /*
    md_p.h : physical layout of Linux RAID devices
-          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
+          Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman, Peter T. Breuer
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -426,4 +426,46 @@ struct ppl_header {
 	struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
 } __attribute__ ((__packed__));
 
+#define BITMAP_MAGIC 0x6d746962
+
+/* use these for bitmap->flags and bitmap->sb->state bit-fields */
+enum bitmap_state {
+	BITMAP_STALE = 1,  /* the bitmap file is out of date or had -EIO */
+	BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
+	BITMAP_HOSTENDIAN =15,
+};
+
+/* the superblock at the front of the bitmap file -- little endian */
+typedef struct bitmap_super_s {
+	__le32 magic;        /*  0  BITMAP_MAGIC */
+	__le32 version;      /*  4  the bitmap major for now, could change... */
+	__u8  uuid[16];      /*  8  128 bit uuid - must match md device uuid */
+	__le64 events;       /* 24  event counter for the bitmap (1)*/
+	__le64 events_cleared;/*32  event counter when last bit cleared (2) */
+	__le64 sync_size;    /* 40  the size of the md device's sync range(3) */
+	__le32 state;        /* 48  bitmap state information */
+	__le32 chunksize;    /* 52  the bitmap chunk size in bytes */
+	__le32 daemon_sleep; /* 56  seconds between disk flushes */
+	__le32 write_behind; /* 60  number of outstanding write-behind writes */
+	__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
+				  * reserved for the bitmap. */
+	__le32 nodes;        /* 68 the maximum number of nodes in cluster. */
+	__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+	__u8  pad[256 - 136]; /* set to zero */
+} bitmap_super_t;
+
+/* notes:
+ * (1) This event counter is updated before the eventcounter in the md superblock
+ *     When a bitmap is loaded, it is only accepted if this event counter is equal
+ *     to, or one greater than, the event counter in the superblock.
+ * (2) This event counter is updated when the other one is *if*and*only*if* the
+ *     array is not degraded.  As bits are not cleared when the array is degraded,
+ *     this represents the last time that any bits were cleared.
+ *     If a device is being added that has an event count with this value or
+ *     higher, it is accepted as conforming to the bitmap.
+ * (3)This is the number of sectors represented by the bitmap, and is the range that
+ *     resync happens across.  For raid1 and raid5/6 it is the size of individual
+ *     devices.  For raid10 it is the size of the array.
+ */
+
 #endif
When working on software that manages MD RAID disks from userspace,
the currently provided headers only contain the MD superblock.
That is not enough to fully populate MD RAID metadata.
Therefore this patch adds the bitmap superblock as well.

Signed-off-by: Tomas Mudrunka <tomas.mudrunka@gmail.com>
---
 drivers/md/md-bitmap.h         | 42 +-------------------------------
 include/uapi/linux/raid/md_p.h | 44 +++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 42 deletions(-)