diff mbox series

[1/6] md: Revert "md: Don't register sync_thread for reshape directly"

Message ID 20240229154941.99557-2-xni@redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Mike Snitzer
Headers show
Series Fix dmraid regression bugs | expand

Commit Message

Xiao Ni Feb. 29, 2024, 3:49 p.m. UTC
This reverts commit ad39c08186f8a0f221337985036ba86731d6aafe.

Function stop_sync_thread only wakes up sync task. It also needs to
wake up sync thread. This problem will be fixed in the following
patch.

Signed-off-by: Xiao Ni <xni@redhat.com>
---
 drivers/md/md.c     |  5 +----
 drivers/md/raid10.c | 16 ++++++++++++++--
 drivers/md/raid5.c  | 29 +++++++++++++++++++++++++++--
 3 files changed, 42 insertions(+), 8 deletions(-)

Comments

Yu Kuai March 1, 2024, 2:38 a.m. UTC | #1
Hi,

在 2024/02/29 23:49, Xiao Ni 写道:
> This reverts commit ad39c08186f8a0f221337985036ba86731d6aafe.
> 
> Function stop_sync_thread only wakes up sync task. It also needs to
> wake up sync thread. This problem will be fixed in the following
> patch.

I don't think so. Unlike mddev->thread, sync_thread is executed only
once and must be executed each time it is registered, and the caller
must make sure to wake up the registered sync_thread.

Thanks,
Kuai
> 
> Signed-off-by: Xiao Ni <xni@redhat.com>
> ---
>   drivers/md/md.c     |  5 +----
>   drivers/md/raid10.c | 16 ++++++++++++++--
>   drivers/md/raid5.c  | 29 +++++++++++++++++++++++++++--
>   3 files changed, 42 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 9e41a9aaba8b..db4743ba7f6c 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -9376,7 +9376,6 @@ static void md_start_sync(struct work_struct *ws)
>   	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
>   	int spares = 0;
>   	bool suspend = false;
> -	char *name;
>   
>   	/*
>   	 * If reshape is still in progress, spares won't be added or removed
> @@ -9414,10 +9413,8 @@ static void md_start_sync(struct work_struct *ws)
>   	if (spares)
>   		md_bitmap_write_all(mddev->bitmap);
>   
> -	name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
> -			"reshape" : "resync";
>   	rcu_assign_pointer(mddev->sync_thread,
> -			   md_register_thread(md_do_sync, mddev, name));
> +			   md_register_thread(md_do_sync, mddev, "resync"));
>   	if (!mddev->sync_thread) {
>   		pr_warn("%s: could not start resync thread...\n",
>   			mdname(mddev));
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index a5f8419e2df1..7412066ea22c 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -4175,7 +4175,11 @@ static int raid10_run(struct mddev *mddev)
>   		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
>   		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> -		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> +		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> +		rcu_assign_pointer(mddev->sync_thread,
> +			md_register_thread(md_do_sync, mddev, "reshape"));
> +		if (!mddev->sync_thread)
> +			goto out_free_conf;
>   	}
>   
>   	return 0;
> @@ -4569,8 +4573,16 @@ static int raid10_start_reshape(struct mddev *mddev)
>   	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
>   	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> -	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> +	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> +
> +	rcu_assign_pointer(mddev->sync_thread,
> +			   md_register_thread(md_do_sync, mddev, "reshape"));
> +	if (!mddev->sync_thread) {
> +		ret = -EAGAIN;
> +		goto abort;
> +	}
>   	conf->reshape_checkpoint = jiffies;
> +	md_wakeup_thread(mddev->sync_thread);
>   	md_new_event();
>   	return 0;
>   
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index 6a7a32f7fb91..8497880135ee 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -7936,7 +7936,11 @@ static int raid5_run(struct mddev *mddev)
>   		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
>   		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> -		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> +		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> +		rcu_assign_pointer(mddev->sync_thread,
> +			md_register_thread(md_do_sync, mddev, "reshape"));
> +		if (!mddev->sync_thread)
> +			goto abort;
>   	}
>   
>   	/* Ok, everything is just fine now */
> @@ -8502,8 +8506,29 @@ static int raid5_start_reshape(struct mddev *mddev)
>   	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
>   	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
>   	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> -	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> +	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> +	rcu_assign_pointer(mddev->sync_thread,
> +			   md_register_thread(md_do_sync, mddev, "reshape"));
> +	if (!mddev->sync_thread) {
> +		mddev->recovery = 0;
> +		spin_lock_irq(&conf->device_lock);
> +		write_seqcount_begin(&conf->gen_lock);
> +		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
> +		mddev->new_chunk_sectors =
> +			conf->chunk_sectors = conf->prev_chunk_sectors;
> +		mddev->new_layout = conf->algorithm = conf->prev_algo;
> +		rdev_for_each(rdev, mddev)
> +			rdev->new_data_offset = rdev->data_offset;
> +		smp_wmb();
> +		conf->generation--;
> +		conf->reshape_progress = MaxSector;
> +		mddev->reshape_position = MaxSector;
> +		write_seqcount_end(&conf->gen_lock);
> +		spin_unlock_irq(&conf->device_lock);
> +		return -EAGAIN;
> +	}
>   	conf->reshape_checkpoint = jiffies;
> +	md_wakeup_thread(mddev->sync_thread);
>   	md_new_event();
>   	return 0;
>   }
>
Xiao Ni March 1, 2024, 4:41 a.m. UTC | #2
On Fri, Mar 1, 2024 at 10:38 AM Yu Kuai <yukuai1@huaweicloud.com> wrote:
>
> Hi,
>
> 在 2024/02/29 23:49, Xiao Ni 写道:
> > This reverts commit ad39c08186f8a0f221337985036ba86731d6aafe.
> >
> > Function stop_sync_thread only wakes up sync task. It also needs to
> > wake up sync thread. This problem will be fixed in the following
> > patch.
>
> I don't think so, unlike mddev->thread, sync_thread will only be
> executed once and must be executed each time it's registered, and caller
> must make sure to wake up registered sync_thread.

Hi Kuai

I'll modify the comments. But isn't it also right to
wake_up(mddev->sync_thread) in function stop_sync_thread? You gave
the same patch yesterday. I know the caller should wake up the sync
thread too.

"However, I think the one to register sync_thread is responsible to
wake it up." I put your comments here. If I understand correctly, we
can do something like this?
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7937,6 +7937,7 @@ static int raid5_run(struct mddev *mddev)
                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                rcu_assign_pointer(mddev->sync_thread,
                        md_register_thread(md_do_sync, mddev, "reshape"));
+               md_wakeup_thread(mddev->sync_thread);
                if (!mddev->sync_thread)
                        goto abort;
        }


And at first, I didn't revert
ad39c08186f8a0f221337985036ba86731d6aafe. But with my patch set, it
can cause failures in the lvm2 test suite. And the patch you gave yesterday
is part of my patch01, so I reverted it. Are you OK with it if I change the
comments and add the modification (waking up the sync thread after
registering reshape)?

Best Regards
Xiao

>
> Thanks,
> Kuai
> >
> > Signed-off-by: Xiao Ni <xni@redhat.com>
> > ---
> >   drivers/md/md.c     |  5 +----
> >   drivers/md/raid10.c | 16 ++++++++++++++--
> >   drivers/md/raid5.c  | 29 +++++++++++++++++++++++++++--
> >   3 files changed, 42 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 9e41a9aaba8b..db4743ba7f6c 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -9376,7 +9376,6 @@ static void md_start_sync(struct work_struct *ws)
> >       struct mddev *mddev = container_of(ws, struct mddev, sync_work);
> >       int spares = 0;
> >       bool suspend = false;
> > -     char *name;
> >
> >       /*
> >        * If reshape is still in progress, spares won't be added or removed
> > @@ -9414,10 +9413,8 @@ static void md_start_sync(struct work_struct *ws)
> >       if (spares)
> >               md_bitmap_write_all(mddev->bitmap);
> >
> > -     name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
> > -                     "reshape" : "resync";
> >       rcu_assign_pointer(mddev->sync_thread,
> > -                        md_register_thread(md_do_sync, mddev, name));
> > +                        md_register_thread(md_do_sync, mddev, "resync"));
> >       if (!mddev->sync_thread) {
> >               pr_warn("%s: could not start resync thread...\n",
> >                       mdname(mddev));
> > diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> > index a5f8419e2df1..7412066ea22c 100644
> > --- a/drivers/md/raid10.c
> > +++ b/drivers/md/raid10.c
> > @@ -4175,7 +4175,11 @@ static int raid10_run(struct mddev *mddev)
> >               clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
> >               clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
> >               set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> > -             set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> > +             set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> > +             rcu_assign_pointer(mddev->sync_thread,
> > +                     md_register_thread(md_do_sync, mddev, "reshape"));
> > +             if (!mddev->sync_thread)
> > +                     goto out_free_conf;
> >       }
> >
> >       return 0;
> > @@ -4569,8 +4573,16 @@ static int raid10_start_reshape(struct mddev *mddev)
> >       clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
> >       clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
> >       set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> > -     set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> > +     set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> > +
> > +     rcu_assign_pointer(mddev->sync_thread,
> > +                        md_register_thread(md_do_sync, mddev, "reshape"));
> > +     if (!mddev->sync_thread) {
> > +             ret = -EAGAIN;
> > +             goto abort;
> > +     }
> >       conf->reshape_checkpoint = jiffies;
> > +     md_wakeup_thread(mddev->sync_thread);
> >       md_new_event();
> >       return 0;
> >
> > diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> > index 6a7a32f7fb91..8497880135ee 100644
> > --- a/drivers/md/raid5.c
> > +++ b/drivers/md/raid5.c
> > @@ -7936,7 +7936,11 @@ static int raid5_run(struct mddev *mddev)
> >               clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
> >               clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
> >               set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> > -             set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> > +             set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> > +             rcu_assign_pointer(mddev->sync_thread,
> > +                     md_register_thread(md_do_sync, mddev, "reshape"));
> > +             if (!mddev->sync_thread)
> > +                     goto abort;
> >       }
> >
> >       /* Ok, everything is just fine now */
> > @@ -8502,8 +8506,29 @@ static int raid5_start_reshape(struct mddev *mddev)
> >       clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
> >       clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
> >       set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
> > -     set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
> > +     set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
> > +     rcu_assign_pointer(mddev->sync_thread,
> > +                        md_register_thread(md_do_sync, mddev, "reshape"));
> > +     if (!mddev->sync_thread) {
> > +             mddev->recovery = 0;
> > +             spin_lock_irq(&conf->device_lock);
> > +             write_seqcount_begin(&conf->gen_lock);
> > +             mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
> > +             mddev->new_chunk_sectors =
> > +                     conf->chunk_sectors = conf->prev_chunk_sectors;
> > +             mddev->new_layout = conf->algorithm = conf->prev_algo;
> > +             rdev_for_each(rdev, mddev)
> > +                     rdev->new_data_offset = rdev->data_offset;
> > +             smp_wmb();
> > +             conf->generation--;
> > +             conf->reshape_progress = MaxSector;
> > +             mddev->reshape_position = MaxSector;
> > +             write_seqcount_end(&conf->gen_lock);
> > +             spin_unlock_irq(&conf->device_lock);
> > +             return -EAGAIN;
> > +     }
> >       conf->reshape_checkpoint = jiffies;
> > +     md_wakeup_thread(mddev->sync_thread);
> >       md_new_event();
> >       return 0;
> >   }
> >
>
diff mbox series

Patch

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9e41a9aaba8b..db4743ba7f6c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9376,7 +9376,6 @@  static void md_start_sync(struct work_struct *ws)
 	struct mddev *mddev = container_of(ws, struct mddev, sync_work);
 	int spares = 0;
 	bool suspend = false;
-	char *name;
 
 	/*
 	 * If reshape is still in progress, spares won't be added or removed
@@ -9414,10 +9413,8 @@  static void md_start_sync(struct work_struct *ws)
 	if (spares)
 		md_bitmap_write_all(mddev->bitmap);
 
-	name = test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ?
-			"reshape" : "resync";
 	rcu_assign_pointer(mddev->sync_thread,
-			   md_register_thread(md_do_sync, mddev, name));
+			   md_register_thread(md_do_sync, mddev, "resync"));
 	if (!mddev->sync_thread) {
 		pr_warn("%s: could not start resync thread...\n",
 			mdname(mddev));
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a5f8419e2df1..7412066ea22c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4175,7 +4175,11 @@  static int raid10_run(struct mddev *mddev)
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		rcu_assign_pointer(mddev->sync_thread,
+			md_register_thread(md_do_sync, mddev, "reshape"));
+		if (!mddev->sync_thread)
+			goto out_free_conf;
 	}
 
 	return 0;
@@ -4569,8 +4573,16 @@  static int raid10_start_reshape(struct mddev *mddev)
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+
+	rcu_assign_pointer(mddev->sync_thread,
+			   md_register_thread(md_do_sync, mddev, "reshape"));
+	if (!mddev->sync_thread) {
+		ret = -EAGAIN;
+		goto abort;
+	}
 	conf->reshape_checkpoint = jiffies;
+	md_wakeup_thread(mddev->sync_thread);
 	md_new_event();
 	return 0;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6a7a32f7fb91..8497880135ee 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7936,7 +7936,11 @@  static int raid5_run(struct mddev *mddev)
 		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+		rcu_assign_pointer(mddev->sync_thread,
+			md_register_thread(md_do_sync, mddev, "reshape"));
+		if (!mddev->sync_thread)
+			goto abort;
 	}
 
 	/* Ok, everything is just fine now */
@@ -8502,8 +8506,29 @@  static int raid5_start_reshape(struct mddev *mddev)
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+	rcu_assign_pointer(mddev->sync_thread,
+			   md_register_thread(md_do_sync, mddev, "reshape"));
+	if (!mddev->sync_thread) {
+		mddev->recovery = 0;
+		spin_lock_irq(&conf->device_lock);
+		write_seqcount_begin(&conf->gen_lock);
+		mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
+		mddev->new_chunk_sectors =
+			conf->chunk_sectors = conf->prev_chunk_sectors;
+		mddev->new_layout = conf->algorithm = conf->prev_algo;
+		rdev_for_each(rdev, mddev)
+			rdev->new_data_offset = rdev->data_offset;
+		smp_wmb();
+		conf->generation--;
+		conf->reshape_progress = MaxSector;
+		mddev->reshape_position = MaxSector;
+		write_seqcount_end(&conf->gen_lock);
+		spin_unlock_irq(&conf->device_lock);
+		return -EAGAIN;
+	}
 	conf->reshape_checkpoint = jiffies;
+	md_wakeup_thread(mddev->sync_thread);
 	md_new_event();
 	return 0;
 }