diff mbox series

[mdadm,v3,5/7] mdadm: Add --write-zeros option for Create

Message ID 20220921204356.4336-6-logang@deltatee.com (mailing list archive)
State Superseded, archived
Headers show
Series Write Zeroes option for Creating Arrays | expand

Commit Message

Logan Gunthorpe Sept. 21, 2022, 8:43 p.m. UTC
Add the --write-zeros option for Create which will send a write zeros
request to all the disks before assembling the array. After zeroing
the array, the disks will be in a known clean state and the initial
sync may be skipped.

Writing zeroes is best used when there is a hardware offload method
to zero the data. But even still, zeroing can take several minutes on
a large device. Because of this, all disks are zeroed in parallel using
their own forked process and a message is printed to the user. The main
process will proceed only after all the zeroing processes have completed
successfully.

Signed-off-by: Logan Gunthorpe <logang@deltatee.com>

fixup! mdadm: Add --write-zeros option for Create
---
 Create.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 ReadMe.c |  2 ++
 mdadm.c  |  9 ++++++
 mdadm.h  |  7 +++++
 4 files changed, 101 insertions(+)

Comments

Mariusz Tkaczyk Sept. 23, 2022, 11:20 a.m. UTC | #1
Hi Logan,
One comment from my side.

Thanks,
Mariusz

On Wed, 21 Sep 2022 14:43:54 -0600
Logan Gunthorpe <logang@deltatee.com> wrote:

> Add the --write-zeros option for Create which will send a write zeros
> request to all the disks before assembling the array. After zeroing
> the array, the disks will be in a known clean state and the initial
> sync may be skipped.
> 
> Writing zeroes is best used when there is a hardware offload method
> to zero the data. But even still, zeroing can take several minutes on
> a large device. Because of this, all disks are zeroed in parallel using
> their own forked process and a message is printed to the user. The main
> process will proceed only after all the zeroing processes have completed
> successfully.
> 
> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
> 
> fixup! mdadm: Add --write-zeros option for Create
> ---

> diff --git a/mdadm.h b/mdadm.h
> index 1ab31564efef..c7e00195d8c8 100644
> --- a/mdadm.h
> +++ b/mdadm.h
> @@ -273,6 +273,9 @@ static inline void __put_unaligned32(__u32 val, void *p)
>  
>  #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
>  
> +#define KIB_TO_BYTES(x)	((x) << 10)
> +#define SEC_TO_BYTES(x)	((x) << 9)
> +
>  extern const char Name[];
>  
>  struct md_bb_entry {
> @@ -387,6 +390,8 @@ struct mdinfo {
>  		ARRAY_UNKNOWN_STATE,
>  	} array_state;
>  	struct md_bb bb;
> +
> +	pid_t zero_pid;
>  };

mdinfo is  used  for raid properties. It is used for both system (sysfs_read())
and metadata(getinfo_super()). zero_pid property doesn't fit well there, it is
used once during creation. Could you please add it to mddev_dev struct or add
it to local array /list?
Xiao Ni Sept. 30, 2022, 5:23 a.m. UTC | #2
在 2022/9/23 下午7:20, Mariusz Tkaczyk 写道:
> Hi Logan,
> One comment from my side.
>
> Thanks,
> Mariusz
>
> On Wed, 21 Sep 2022 14:43:54 -0600
> Logan Gunthorpe <logang@deltatee.com> wrote:
>
>> Add the --write-zeros option for Create which will send a write zeros
>> request to all the disks before assembling the array. After zeroing
>> the array, the disks will be in a known clean state and the initial
>> sync may be skipped.
>>
>> Writing zeroes is best used when there is a hardware offload method
>> to zero the data. But even still, zeroing can take several minutes on
>> a large device. Because of this, all disks are zeroed in parallel using
>> their own forked process and a message is printed to the user. The main
>> process will proceed only after all the zeroing processes have completed
>> successfully.
>>
>> Signed-off-by: Logan Gunthorpe <logang@deltatee.com>
>>
>> fixup! mdadm: Add --write-zeros option for Create
>> ---
>> diff --git a/mdadm.h b/mdadm.h
>> index 1ab31564efef..c7e00195d8c8 100644
>> --- a/mdadm.h
>> +++ b/mdadm.h
>> @@ -273,6 +273,9 @@ static inline void __put_unaligned32(__u32 val, void *p)
>>   
>>   #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
>>   
>> +#define KIB_TO_BYTES(x)	((x) << 10)
>> +#define SEC_TO_BYTES(x)	((x) << 9)
>> +
>>   extern const char Name[];
>>   
>>   struct md_bb_entry {
>> @@ -387,6 +390,8 @@ struct mdinfo {
>>   		ARRAY_UNKNOWN_STATE,
>>   	} array_state;
>>   	struct md_bb bb;
>> +
>> +	pid_t zero_pid;
>>   };
> mdinfo is  used  for raid properties. It is used for both system (sysfs_read())
> and metadata(getinfo_super()). zero_pid property doesn't fit well there, it is
> used once during creation. Could you please add it to mddev_dev struct or add
> it to local array /list?


Hi Logan

I have the same question about this.

Beside this problem, the others are good for me. By the way, it needs to 
pull to the latest upstream. Now

it has conficts when mergeing v3 patch.

Regards

Xiao

>
Logan Gunthorpe Sept. 30, 2022, 3:39 p.m. UTC | #3
On 2022-09-29 23:23, Xiao Ni wrote:
>>>   extern const char Name[];
>>>   
>>>   struct md_bb_entry {
>>> @@ -387,6 +390,8 @@ struct mdinfo {
>>>   		ARRAY_UNKNOWN_STATE,
>>>   	} array_state;
>>>   	struct md_bb bb;
>>> +
>>> +	pid_t zero_pid;
>>>   };
>> mdinfo is  used  for raid properties. It is used for both system (sysfs_read())
>> and metadata(getinfo_super()). zero_pid property doesn't fit well there, it is
>> used once during creation. Could you please add it to mddev_dev struct or add
>> it to local array /list?
> 
> 
> Hi Logan
> 
> I have the same question about this.
> 
> Beside this problem, the others are good for me. By the way, it needs to 
> pull to the latest upstream. Now
> 
> it has conficts when mergeing v3 patch.

Yup, thanks, I'll make the changes for v4 next week.

Logan
diff mbox series

Patch

diff --git a/Create.c b/Create.c
index 1e94b90b96bd..31bed3a37a90 100644
--- a/Create.c
+++ b/Create.c
@@ -26,6 +26,8 @@ 
 #include	"md_u.h"
 #include	"md_p.h"
 #include	<ctype.h>
+#include	<fcntl.h>
+#include	<sys/wait.h>
 
 static int round_size_and_verify(unsigned long long *size, int chunk)
 {
@@ -91,6 +93,73 @@  int default_layout(struct supertype *st, int level, int verbose)
 	return layout;
 }
 
+static pid_t write_zeroes_fork(int fd, struct shape *s, struct supertype *st,
+			       struct mddev_dev *dv)
+
+{
+	unsigned long long offset_bytes, size_bytes;
+	int ret = 0;
+	pid_t pid;
+
+	size_bytes = KIB_TO_BYTES(s->size);
+
+	/*
+	 * If size_bytes is zero, this is a zoned raid array where
+	 * each disk is of a different size and uses its full
+	 * disk. Thus zero the entire disk.
+	 */
+	if (!size_bytes && !get_dev_size(fd, dv->devname, &size_bytes))
+		return -1;
+
+	if (dv->data_offset != INVALID_SECTORS)
+		offset_bytes = SEC_TO_BYTES(dv->data_offset);
+	else
+		offset_bytes = SEC_TO_BYTES(st->data_offset);
+
+	pr_info("zeroing data from %lld to %lld on: %s\n",
+		offset_bytes, size_bytes, dv->devname);
+
+	pid = fork();
+	if (pid < 0) {
+		pr_err("Could not fork to zero disks: %m\n");
+		return pid;
+	} else if (pid != 0) {
+		return pid;
+	}
+
+	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
+		      offset_bytes, size_bytes)) {
+		pr_err("zeroing %s failed: %m\n", dv->devname);
+		ret = 1;
+	}
+
+	exit(ret);
+}
+
+static int wait_for_zero_forks(struct mdinfo *info, int count)
+{
+	int wstatus, ret = 0, i;
+	bool waited = false;
+
+	for (i = 0; i < count; i++) {
+		if (!info[i].zero_pid)
+			continue;
+
+		waited = true;
+		waitpid(info[i].zero_pid, &wstatus, 0);
+
+		if (!WIFEXITED(wstatus) || WEXITSTATUS(wstatus))
+			ret = 1;
+	}
+
+	if (ret)
+		pr_err("zeroing failed!\n");
+	else if (waited)
+		pr_info("zeroing finished\n");
+
+	return ret;
+}
+
 static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
 		struct supertype *st, struct mddev_dev *dv,
 		struct mdinfo *info, int have_container, int major_num)
@@ -148,6 +217,14 @@  static int add_disk_to_super(int mdfd, struct shape *s, struct context *c,
 	}
 	st->ss->getinfo_super(st, info, NULL);
 
+	if (s->write_zeroes) {
+		info->zero_pid = write_zeroes_fork(fd, s, st, dv);
+		if (info->zero_pid <= 0) {
+			ioctl(mdfd, STOP_ARRAY, NULL);
+			return 1;
+		}
+	}
+
 	if (have_container && c->verbose > 0)
 		pr_err("Using %s for device %d\n",
 		       map_dev(info->disk.major, info->disk.minor, 0),
@@ -287,6 +364,10 @@  static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 		}
 
 		if (pass == 1) {
+			ret = wait_for_zero_forks(infos, total_slots);
+			if (ret)
+				goto out;
+
 			ret = update_metadata(mdfd, s, st, map, info,
 					      chosen_name);
 			if (ret)
@@ -295,6 +376,8 @@  static int add_disks(int mdfd, struct mdinfo *info, struct shape *s,
 	}
 
 out:
+	if (ret)
+		wait_for_zero_forks(infos, total_slots);
 	free(infos);
 	return ret;
 }
diff --git a/ReadMe.c b/ReadMe.c
index 7f94847e9769..50913dabca40 100644
--- a/ReadMe.c
+++ b/ReadMe.c
@@ -138,6 +138,7 @@  struct option long_options[] = {
     {"size",	  1, 0, 'z'},
     {"auto",	  1, 0, Auto}, /* also for --assemble */
     {"assume-clean",0,0, AssumeClean },
+    {"write-zeroes",0,0, WriteZeroes },
     {"metadata",  1, 0, 'e'}, /* superblock format */
     {"bitmap",	  1, 0, Bitmap},
     {"bitmap-chunk", 1, 0, BitmapChunk},
@@ -390,6 +391,7 @@  char Help_create[] =
 "  --write-journal=      : Specify journal device for RAID-4/5/6 array\n"
 "  --consistency-policy= : Specify the policy that determines how the array\n"
 "                     -k : maintains consistency in case of unexpected shutdown.\n"
+"  --write-zeroes        : Write zeroes to the disks before creating. This will bypass initial sync.\n"
 "\n"
 ;
 
diff --git a/mdadm.c b/mdadm.c
index 972adb524dfb..141838bd394f 100644
--- a/mdadm.c
+++ b/mdadm.c
@@ -602,6 +602,10 @@  int main(int argc, char *argv[])
 			s.assume_clean = 1;
 			continue;
 
+		case O(CREATE, WriteZeroes):
+			s.write_zeroes = 1;
+			continue;
+
 		case O(GROW,'n'):
 		case O(CREATE,'n'):
 		case O(BUILD,'n'): /* number of raid disks */
@@ -1306,6 +1310,11 @@  int main(int argc, char *argv[])
 		}
 	}
 
+	if (s.write_zeroes && !s.assume_clean) {
+		pr_info("Disk zeroing requested, setting --assume-clean to skip resync\n");
+		s.assume_clean = 1;
+	}
+
 	if (!mode && devs_found) {
 		mode = MISC;
 		devmode = 'Q';
diff --git a/mdadm.h b/mdadm.h
index 1ab31564efef..c7e00195d8c8 100644
--- a/mdadm.h
+++ b/mdadm.h
@@ -273,6 +273,9 @@  static inline void __put_unaligned32(__u32 val, void *p)
 
 #define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
 
+#define KIB_TO_BYTES(x)	((x) << 10)
+#define SEC_TO_BYTES(x)	((x) << 9)
+
 extern const char Name[];
 
 struct md_bb_entry {
@@ -387,6 +390,8 @@  struct mdinfo {
 		ARRAY_UNKNOWN_STATE,
 	} array_state;
 	struct md_bb bb;
+
+	pid_t zero_pid;
 };
 
 struct createinfo {
@@ -433,6 +438,7 @@  extern char Version[], Usage[], Help[], OptionHelp[],
  */
 enum special_options {
 	AssumeClean = 300,
+	WriteZeroes,
 	BitmapChunk,
 	WriteBehind,
 	ReAdd,
@@ -593,6 +599,7 @@  struct shape {
 	int	bitmap_chunk;
 	char	*bitmap_file;
 	int	assume_clean;
+	bool	write_zeroes;
 	int	write_behind;
 	unsigned long long size;
 	unsigned long long data_offset;