Message ID | 20230301204135.39230-6-logang@deltatee.com (mailing list archive) |
---|---|
State | Accepted, archived |
Headers | show |
Series | Write Zeroes option for Creating Arrays | expand |
On Wed, Mar 01, 2023 at 01:41:33PM -0700, Logan Gunthorpe wrote: > Add the --write-zeros option for Create which will send a write zeros > request to all the disks before assembling the array. After zeroing > the array, the disks will be in a known clean state and the initial > sync may be skipped. > > Writing zeroes is best used when there is a hardware offload method > to zero the data. But even still, zeroing can take several minutes on > a large device. Because of this, all disks are zeroed in parallel using > their own forked process and a message is printed to the user. The main > process will proceed only after all the zeroing processes have completed > successfully. > > Signed-off-by: Logan Gunthorpe <logang@deltatee.com> > Acked-by: Kinga Tanska <kinga.tanska@linux.intel.com> > Reviewed-by: Xiao Ni <xni@redhat.com> > Reviewed-by: Chaitanya Kulkarni <kch@nvidia.com> Acked-by: Coly Li <colyli@suse.de> Thanks. > --- > Create.c | 176 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- > ReadMe.c | 2 + > mdadm.c | 9 +++ > mdadm.h | 5 ++ > 4 files changed, 190 insertions(+), 2 deletions(-) > > diff --git a/Create.c b/Create.c > index 4acda30c5256..bbe9e13dc76d 100644 > --- a/Create.c > +++ b/Create.c > @@ -26,6 +26,10 @@ > #include "md_u.h" > #include "md_p.h" > #include <ctype.h> > +#include <fcntl.h> > +#include <signal.h> > +#include <sys/signalfd.h> > +#include <sys/wait.h> > > static int round_size_and_verify(unsigned long long *size, int chunk) > { > @@ -91,9 +95,149 @@ int default_layout(struct supertype *st, int level, int verbose) > return layout; > } > > +static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st, > + struct mddev_dev *dv) > + > +{ > + const unsigned long long req_size = 1 << 30; > + unsigned long long offset_bytes, size_bytes, sz; > + sigset_t sigset; > + int ret = 0; > + pid_t pid; > + > + size_bytes = KIB_TO_BYTES(s->size); > + > + /* > + * If size_bytes is zero, this is a zoned raid array where > + * each disk is of a different size and uses its full > + * disk. Thus zero the entire disk. > + */ > + if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes)) > + return -1; > + > + if (dv->data_offset != INVALID_SECTORS) > + offset_bytes = SEC_TO_BYTES(dv->data_offset); > + else > + offset_bytes = SEC_TO_BYTES(st->data_offset); > + > + pr_info("zeroing data from %lld to %lld on: %s\n", > + offset_bytes, size_bytes, dv->devname); > + > + pid = fork(); > + if (pid < 0) { > + pr_err("Could not fork to zero disks: %s\n", strerror(errno)); > + return pid; > + } else if (pid != 0) { > + return pid; > + } > + > + sigemptyset(&sigset); > + sigaddset(&sigset, SIGINT); > + sigprocmask(SIG_UNBLOCK, &sigset, NULL); > + > + while (size_bytes) { > + /* > + * Split requests to the kernel into 1GB chunks seeing the > + * fallocate() call is not interruptible and blocking a > + * ctrl-c for several minutes is not desirable. > + * > + * 1GB is chosen as a compromise: the user may still have > + * to wait several seconds if they ctrl-c on devices that > + * zero slowly, but will reduce the number of requests > + * required and thus the overhead on devices that perform > + * better. > + */ > + sz = size_bytes; > + if (sz >= req_size) > + sz = req_size; > + > + if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, > + offset_bytes, sz)) { > + pr_err("zeroing %s failed: %s\n", dv->devname, > + strerror(errno)); > + ret = 1; > + break; > + } > + > + offset_bytes += sz; > + size_bytes -= sz; > + } > + > + exit(ret); > +} > + > +static int wait_for_zero_forks(int *zero_pids, int count) > +{ > + int wstatus, ret = 0, i, sfd, wait_count = 0; > + struct signalfd_siginfo fdsi; > + bool interrupted = false; > + sigset_t sigset; > + ssize_t s; > + > + for (i = 0; i < count; i++) > + if (zero_pids[i]) > + wait_count++; > + if (!wait_count) > + return 0; > + > + sigemptyset(&sigset); > + sigaddset(&sigset, SIGINT); > + sigaddset(&sigset, SIGCHLD); > + sigprocmask(SIG_BLOCK, &sigset, NULL); > + > + sfd = signalfd(-1, &sigset, 0); > + if (sfd < 0) { > + pr_err("Unable to create signalfd: %s\n", strerror(errno)); > + return 1; > + } > + > + while (1) { > + s = read(sfd, &fdsi, sizeof(fdsi)); > + if (s != sizeof(fdsi)) { > + pr_err("Invalid signalfd read: %s\n", strerror(errno)); > + close(sfd); > + return 1; > + } > + > + if (fdsi.ssi_signo == SIGINT) { > + printf("\n"); > + pr_info("Interrupting zeroing processes, please wait...\n"); > + interrupted = true; > + } else if (fdsi.ssi_signo == SIGCHLD) { > + if (!--wait_count) > + break; > + } > + } > + > + close(sfd); > + > + for (i = 0; i < count; i++) { > + if (!zero_pids[i]) > + continue; > + > + waitpid(zero_pids[i], &wstatus, 0); > + zero_pids[i] = 0; > + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus)) > + ret = 1; > + } > + > + if (interrupted) { > + pr_err("zeroing interrupted!\n"); > + return 1; > + } > + > + if (ret) > + pr_err("zeroing failed!\n"); > + else > + pr_info("zeroing finished\n"); > + > + return ret; > +} > + > static int add_disk_to_super(int mdfd, struct shape *s, struct context *c, > struct supertype *st, struct mddev_dev *dv, > - struct mdinfo *info, int have_container, int major_num) > + struct mdinfo *info, int have_container, int major_num, > + int *zero_pid) > { > dev_t rdev; > int fd; > @@ -148,6 +292,14 @@ static int add_disk_to_super(int mdfd, struct shape *s, struct context *c, > } > st->ss->getinfo_super(st, info, NULL); > > + if (fd >= 0 && s->write_zeroes) { > + *zero_pid = write_zeroes_fork(fd, s, st, dv); > + if (*zero_pid <= 0) { > + ioctl(mdfd, STOP_ARRAY, NULL); > + return 1; > + } > + } > + > if (have_container && c->verbose > 0) > pr_err("Using %s for device %d\n", > map_dev(info->disk.major, info->disk.minor, 0), > @@ -224,10 +376,23 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, > { > struct mddev_dev *moved_disk = NULL; > int pass, raid_disk_num, dnum; > + int zero_pids[total_slots]; > struct mddev_dev *dv; > struct mdinfo *infos; > + sigset_t sigset, orig_sigset; > int ret = 0; > > + /* > + * Block SIGINT so the main thread will always wait for the > + * zeroing processes when being interrupted. Otherwise the > + * zeroing processes will finish their work in the background > + * keeping the disk busy. > + */ > + sigemptyset(&sigset); > + sigaddset(&sigset, SIGINT); > + sigprocmask(SIG_BLOCK, &sigset, &orig_sigset); > + memset(zero_pids, 0, sizeof(zero_pids)); > + > infos = xmalloc(sizeof(*infos) * total_slots); > enable_fds(total_slots); > for (pass = 1; pass <= 2; pass++) { > @@ -261,7 +426,7 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, > > ret = add_disk_to_super(mdfd, s, c, st, dv, > &infos[dnum], have_container, > - major_num); > + major_num, &zero_pids[dnum]); > if (ret) > goto out; > > @@ -287,6 +452,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, > } > > if (pass == 1) { > + ret = wait_for_zero_forks(zero_pids, total_slots); > + if (ret) > + goto out; > + > ret = update_metadata(mdfd, s, st, map, info, > chosen_name); > if (ret) > @@ -295,7 +464,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, > } > > out: > + if (ret) > + wait_for_zero_forks(zero_pids, total_slots); > free(infos); > + sigprocmask(SIG_SETMASK, &orig_sigset, NULL); > return ret; > } > > diff --git a/ReadMe.c b/ReadMe.c > index bd8d50d28661..db251ed2f3d4 100644 > --- a/ReadMe.c > +++ b/ReadMe.c > @@ -138,6 +138,7 @@ struct option long_options[] = { > {"size", 1, 0, 'z'}, > {"auto", 1, 0, Auto}, /* also for --assemble */ > {"assume-clean",0,0, AssumeClean }, > + {"write-zeroes",0,0, WriteZeroes }, > {"metadata", 1, 0, 'e'}, /* superblock format */ > {"bitmap", 1, 0, Bitmap}, > {"bitmap-chunk", 1, 0, BitmapChunk}, > @@ -390,6 +391,7 @@ char Help_create[] = > " --write-journal= : Specify journal device for RAID-4/5/6 array\n" > " --consistency-policy= : Specify the policy that determines how the array\n" > " -k : maintains consistency in case of unexpected shutdown.\n" > +" --write-zeroes : Write zeroes to the disks before creating. This will bypass initial sync.\n" > "\n" > ; > > diff --git a/mdadm.c b/mdadm.c > index 57e8e6fa64b9..4685ad6b06c2 100644 > --- a/mdadm.c > +++ b/mdadm.c > @@ -590,6 +590,10 @@ int main(int argc, char *argv[]) > s.assume_clean = 1; > continue; > > + case O(CREATE, WriteZeroes): > + s.write_zeroes = 1; > + continue; > + > case O(GROW,'n'): > case O(CREATE,'n'): > case O(BUILD,'n'): /* number of raid disks */ > @@ -1251,6 +1255,11 @@ int main(int argc, char *argv[]) > } > } > > + if (s.write_zeroes && !s.assume_clean) { > + pr_info("Disk zeroing requested, setting --assume-clean to skip resync\n"); > + s.assume_clean = 1; > + } > + > if (!mode && devs_found) { > mode = MISC; > devmode = 'Q'; > diff --git a/mdadm.h b/mdadm.h > index 8bd65fba1887..211cfcd54f92 100644 > --- a/mdadm.h > +++ b/mdadm.h > @@ -275,6 +275,9 @@ static inline void __put_unaligned32(__u32 val, void *p) > > #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) > > +#define KIB_TO_BYTES(x) ((x) << 10) > +#define SEC_TO_BYTES(x) ((x) << 9) > + > extern const char Name[]; > > struct md_bb_entry { > @@ -435,6 +438,7 @@ extern char Version[], Usage[], Help[], OptionHelp[], > */ > enum special_options { > AssumeClean = 300, > + WriteZeroes, > BitmapChunk, > WriteBehind, > ReAdd, > @@ -640,6 +644,7 @@ struct shape { > int bitmap_chunk; > char *bitmap_file; > int assume_clean; > + bool write_zeroes; > int write_behind; > unsigned long long size; > unsigned long long data_offset; > -- > 2.30.2 >
diff --git a/Create.c b/Create.c index 4acda30c5256..bbe9e13dc76d 100644 --- a/Create.c +++ b/Create.c @@ -26,6 +26,10 @@ #include "md_u.h" #include "md_p.h" #include <ctype.h> +#include <fcntl.h> +#include <signal.h> +#include <sys/signalfd.h> +#include <sys/wait.h> static int round_size_and_verify(unsigned long long *size, int chunk) { @@ -91,9 +95,149 @@ int default_layout(struct supertype *st, int level, int verbose) return layout; } +static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st, + struct mddev_dev *dv) + +{ + const unsigned long long req_size = 1 << 30; + unsigned long long offset_bytes, size_bytes, sz; + sigset_t sigset; + int ret = 0; + pid_t pid; + + size_bytes = KIB_TO_BYTES(s->size); + + /* + * If size_bytes is zero, this is a zoned raid array where + * each disk is of a different size and uses its full + * disk. Thus zero the entire disk. + */ + if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes)) + return -1; + + if (dv->data_offset != INVALID_SECTORS) + offset_bytes = SEC_TO_BYTES(dv->data_offset); + else + offset_bytes = SEC_TO_BYTES(st->data_offset); + + pr_info("zeroing data from %lld to %lld on: %s\n", + offset_bytes, size_bytes, dv->devname); + + pid = fork(); + if (pid < 0) { + pr_err("Could not fork to zero disks: %s\n", strerror(errno)); + return pid; + } else if (pid != 0) { + return pid; + } + + sigemptyset(&sigset); + sigaddset(&sigset, SIGINT); + sigprocmask(SIG_UNBLOCK, &sigset, NULL); + + while (size_bytes) { + /* + * Split requests to the kernel into 1GB chunks seeing the + * fallocate() call is not interruptible and blocking a + * ctrl-c for several minutes is not desirable. + * + * 1GB is chosen as a compromise: the user may still have + * to wait several seconds if they ctrl-c on devices that + * zero slowly, but will reduce the number of requests + * required and thus the overhead on devices that perform + * better. + */ + sz = size_bytes; + if (sz >= req_size) + sz = req_size; + + if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, + offset_bytes, sz)) { + pr_err("zeroing %s failed: %s\n", dv->devname, + strerror(errno)); + ret = 1; + break; + } + + offset_bytes += sz; + size_bytes -= sz; + } + + exit(ret); +} + +static int wait_for_zero_forks(int *zero_pids, int count) +{ + int wstatus, ret = 0, i, sfd, wait_count = 0; + struct signalfd_siginfo fdsi; + bool interrupted = false; + sigset_t sigset; + ssize_t s; + + for (i = 0; i < count; i++) + if (zero_pids[i]) + wait_count++; + if (!wait_count) + return 0; + + sigemptyset(&sigset); + sigaddset(&sigset, SIGINT); + sigaddset(&sigset, SIGCHLD); + sigprocmask(SIG_BLOCK, &sigset, NULL); + + sfd = signalfd(-1, &sigset, 0); + if (sfd < 0) { + pr_err("Unable to create signalfd: %s\n", strerror(errno)); + return 1; + } + + while (1) { + s = read(sfd, &fdsi, sizeof(fdsi)); + if (s != sizeof(fdsi)) { + pr_err("Invalid signalfd read: %s\n", strerror(errno)); + close(sfd); + return 1; + } + + if (fdsi.ssi_signo == SIGINT) { + printf("\n"); + pr_info("Interrupting zeroing processes, please wait...\n"); + interrupted = true; + } else if (fdsi.ssi_signo == SIGCHLD) { + if (!--wait_count) + break; + } + } + + close(sfd); + + for (i = 0; i < count; i++) { + if (!zero_pids[i]) + continue; + + waitpid(zero_pids[i], &wstatus, 0); + zero_pids[i] = 0; + if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus)) + ret = 1; + } + + if (interrupted) { + pr_err("zeroing interrupted!\n"); + return 1; + } + + if (ret) + pr_err("zeroing failed!\n"); + else + pr_info("zeroing finished\n"); + + return ret; +} + static int add_disk_to_super(int mdfd, struct shape *s, struct context *c, struct supertype *st, struct mddev_dev *dv, - struct mdinfo *info, int have_container, int major_num) + struct mdinfo *info, int have_container, int major_num, + int *zero_pid) { dev_t rdev; int fd; @@ -148,6 +292,14 @@ static int add_disk_to_super(int mdfd, struct shape *s, struct context *c, } st->ss->getinfo_super(st, info, NULL); + if (fd >= 0 && s->write_zeroes) { + *zero_pid = write_zeroes_fork(fd, s, st, dv); + if (*zero_pid <= 0) { + ioctl(mdfd, STOP_ARRAY, NULL); + return 1; + } + } + if (have_container && c->verbose > 0) pr_err("Using %s for device %d\n", map_dev(info->disk.major, info->disk.minor, 0), @@ -224,10 +376,23 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, { struct mddev_dev *moved_disk = NULL; int pass, raid_disk_num, dnum; + int zero_pids[total_slots]; struct mddev_dev *dv; struct mdinfo *infos; + sigset_t sigset, orig_sigset; int ret = 0; + /* + * Block SIGINT so the main thread will always wait for the + * zeroing processes when being interrupted. Otherwise the + * zeroing processes will finish their work in the background + * keeping the disk busy. + */ + sigemptyset(&sigset); + sigaddset(&sigset, SIGINT); + sigprocmask(SIG_BLOCK, &sigset, &orig_sigset); + memset(zero_pids, 0, sizeof(zero_pids)); + infos = xmalloc(sizeof(*infos) * total_slots); enable_fds(total_slots); for (pass = 1; pass <= 2; pass++) { @@ -261,7 +426,7 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, ret = add_disk_to_super(mdfd, s, c, st, dv, &infos[dnum], have_container, - major_num); + major_num, &zero_pids[dnum]); if (ret) goto out; @@ -287,6 +452,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, } if (pass == 1) { + ret = wait_for_zero_forks(zero_pids, total_slots); + if (ret) + goto out; + ret = update_metadata(mdfd, s, st, map, info, chosen_name); if (ret) @@ -295,7 +464,10 @@ static int add_disks(int mdfd, struct mdinfo *info, struct shape *s, } out: + if (ret) + wait_for_zero_forks(zero_pids, total_slots); free(infos); + sigprocmask(SIG_SETMASK, &orig_sigset, NULL); return ret; } diff --git a/ReadMe.c b/ReadMe.c index bd8d50d28661..db251ed2f3d4 100644 --- a/ReadMe.c +++ b/ReadMe.c @@ -138,6 +138,7 @@ struct option long_options[] = { {"size", 1, 0, 'z'}, {"auto", 1, 0, Auto}, /* also for --assemble */ {"assume-clean",0,0, AssumeClean }, + {"write-zeroes",0,0, WriteZeroes }, {"metadata", 1, 0, 'e'}, /* superblock format */ {"bitmap", 1, 0, Bitmap}, {"bitmap-chunk", 1, 0, BitmapChunk}, @@ -390,6 +391,7 @@ char Help_create[] = " --write-journal= : Specify journal device for RAID-4/5/6 array\n" " --consistency-policy= : Specify the policy that determines how the array\n" " -k : maintains consistency in case of unexpected shutdown.\n" +" --write-zeroes : Write zeroes to the disks before creating. This will bypass initial sync.\n" "\n" ; diff --git a/mdadm.c b/mdadm.c index 57e8e6fa64b9..4685ad6b06c2 100644 --- a/mdadm.c +++ b/mdadm.c @@ -590,6 +590,10 @@ int main(int argc, char *argv[]) s.assume_clean = 1; continue; + case O(CREATE, WriteZeroes): + s.write_zeroes = 1; + continue; + case O(GROW,'n'): case O(CREATE,'n'): case O(BUILD,'n'): /* number of raid disks */ @@ -1251,6 +1255,11 @@ int main(int argc, char *argv[]) } } + if (s.write_zeroes && !s.assume_clean) { + pr_info("Disk zeroing requested, setting --assume-clean to skip resync\n"); + s.assume_clean = 1; + } + if (!mode && devs_found) { mode = MISC; devmode = 'Q'; diff --git a/mdadm.h b/mdadm.h index 8bd65fba1887..211cfcd54f92 100644 --- a/mdadm.h +++ b/mdadm.h @@ -275,6 +275,9 @@ static inline void __put_unaligned32(__u32 val, void *p) #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) +#define KIB_TO_BYTES(x) ((x) << 10) +#define SEC_TO_BYTES(x) ((x) << 9) + extern const char Name[]; struct md_bb_entry { @@ -435,6 +438,7 @@ extern char Version[], Usage[], Help[], OptionHelp[], */ enum special_options { AssumeClean = 300, + WriteZeroes, BitmapChunk, WriteBehind, ReAdd, @@ -640,6 +644,7 @@ struct shape { int bitmap_chunk; char *bitmap_file; int assume_clean; + bool write_zeroes; int write_behind; unsigned long long size; unsigned long long data_offset;