[PATCHv2] tcmu: allow userspace to reset netlink

Message ID	1522903193-13947-1-git-send-email-xiubli@redhat.com (mailing list archive)
State	Not Applicable
Headers	show Return-Path: <linux-scsi-owner@kernel.org> From: xiubli@redhat.com To: mchristi@redhat.com, nab@linux-iscsi.org Cc: pkalever@redhat.com, pkarampu@redhat.com, linux-scsi@vger.kernel.org, target-devel@vger.kernel.org, Xiubo Li <xiubli@redhat.com> Subject: [PATCHv2] tcmu: allow userspace to reset netlink Date: Thu, 5 Apr 2018 00:39:53 -0400 Message-Id: <1522903193-13947-1-git-send-email-xiubli@redhat.com> Sender: linux-scsi-owner@vger.kernel.org Precedence: bulk

Message ID

1522903193-13947-1-git-send-email-xiubli@redhat.com (mailing list archive)

State

Not Applicable

Headers

From: xiubli@redhat.com
To: mchristi@redhat.com, nab@linux-iscsi.org
Cc: pkalever@redhat.com, pkarampu@redhat.com,
	linux-scsi@vger.kernel.org, target-devel@vger.kernel.org,
	Xiubo Li <xiubli@redhat.com>
Subject: [PATCHv2] tcmu: allow userspace to reset netlink
Date: Thu,  5 Apr 2018 00:39:53 -0400
Message-Id: <1522903193-13947-1-git-send-email-xiubli@redhat.com>
Sender: linux-scsi-owner@vger.kernel.org
Precedence: bulk

Commit Message

Xiubo Li April 5, 2018, 4:39 a.m. UTC

From: Xiubo Li <xiubli@redhat.com>

This patch adds 1 tcmu attr to reset and complete all the blocked
netlink waiting threads. It's used when the userspace daemon like
tcmu-runner has crashed or forced to shutdown just before the
netlink requests be replied to the kernel, then the netlink requeting
threads will get stuck forever. We must reboot the machine to recover
from it and by this the rebootng is not a must then.

The Call Trace will be something like:

Comments

Mike Christie April 12, 2018, 5:48 p.m. UTC | #1

On 04/04/2018 11:39 PM, xiubli@redhat.com wrote:
> From: Xiubo Li <xiubli@redhat.com>
> 
> This patch adds 1 tcmu attr to reset and complete all the blocked
> netlink waiting threads. It's used when the userspace daemon like
> tcmu-runner has crashed or forced to shutdown just before the
> netlink requests be replied to the kernel, then the netlink requeting
> threads will get stuck forever. We must reboot the machine to recover
> from it and by this the rebootng is not a must then.
> 
> The Call Trace will be something like:
> ==============
> INFO: task targetctl:22655 blocked for more than 120 seconds.
> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
> targetctl       D ffff880169718fd0     0 22655  17249 0x00000080
> Call Trace:
>  [<ffffffff816ab6d9>] schedule+0x29/0x70
>  [<ffffffff816a90e9>] schedule_timeout+0x239/0x2c0
>  [<ffffffff81574d42>] ? skb_release_data+0xf2/0x140
>  [<ffffffff816aba8d>] wait_for_completion+0xfd/0x140
>  [<ffffffff810c6440>] ? wake_up_state+0x20/0x20
>  [<ffffffffc0159f5a>] tcmu_netlink_event+0x26a/0x3a0 [target_core_user]
>  [<ffffffff810b34b0>] ? wake_up_atomic_t+0x30/0x30
>  [<ffffffffc015a2c6>] tcmu_configure_device+0x236/0x350 [target_core_user]
>  [<ffffffffc05085df>] target_configure_device+0x3f/0x3b0 [target_core_mod]
>  [<ffffffffc0502e7c>] target_core_store_dev_enable+0x2c/0x60 [target_core_mod]
>  [<ffffffffc0501244>] target_core_dev_store+0x24/0x40 [target_core_mod]
>  [<ffffffff8128a0e4>] configfs_write_file+0xc4/0x130
>  [<ffffffff81202aed>] vfs_write+0xbd/0x1e0
>  [<ffffffff812038ff>] SyS_write+0x7f/0xe0
>  [<ffffffff816b89fd>] system_call_fastpath+0x16/0x1b
> ==============
> 
> Be careful of using this, it could reset the normal netlink requesting
> operations, so we should use this only when the user space daemon from
> starting and just before the daemon could receive and handle the nl
> requests.
> 
> Changes since v1(suggested by Mike Christie):
> v2: - Makes the reset per device.
> 
> Signed-off-by: Xiubo Li <xiubli@redhat.com>
> ---
>  drivers/target/target_core_user.c | 52 +++++++++++++++++++++++++++++++++------
>  1 file changed, 45 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
> index 4ad89ea..7271da8 100644
> --- a/drivers/target/target_core_user.c
> +++ b/drivers/target/target_core_user.c
> @@ -105,7 +105,8 @@ struct tcmu_hba {
>  
>  struct tcmu_nl_cmd {
>  	/* wake up thread waiting for reply */
> -	struct completion complete;
> +	bool complete;
> +
>  	int cmd;
>  	int status;
>  };
> @@ -159,9 +160,12 @@ struct tcmu_dev {
>  
>  	spinlock_t nl_cmd_lock;
>  	struct tcmu_nl_cmd curr_nl_cmd;
> -	/* wake up threads waiting on curr_nl_cmd */
> +	/* wake up threads waiting on nl_cmd_wq */
>  	wait_queue_head_t nl_cmd_wq;
>  
> +	/* complete thread waiting complete_wq */
> +	wait_queue_head_t complete_wq;
> +
>  	char dev_config[TCMU_CONFIG_LEN];
>  
>  	int nl_reply_supported;
> @@ -307,11 +311,13 @@ static int tcmu_genl_cmd_done(struct genl_info *info, int completed_cmd)
>  		nl_cmd->status = rc;
>  	}
>  
> -	spin_unlock(&udev->nl_cmd_lock);
>  	if (!is_removed)
>  		 target_undepend_item(&dev->dev_group.cg_item);
> -	if (!ret)
> -		complete(&nl_cmd->complete);
> +	if (!ret) {
> +		nl_cmd->complete = true;
> +		wake_up(&udev->complete_wq);
> +	}
> +	spin_unlock(&udev->nl_cmd_lock);
>  	return ret;
>  }
>  
> @@ -1258,6 +1264,7 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
>  	timer_setup(&udev->cmd_timer, tcmu_cmd_timedout, 0);
>  
>  	init_waitqueue_head(&udev->nl_cmd_wq);
> +	init_waitqueue_head(&udev->complete_wq);
>  	spin_lock_init(&udev->nl_cmd_lock);
>  
>  	INIT_RADIX_TREE(&udev->data_blocks, GFP_KERNEL);
> @@ -1555,7 +1562,6 @@ static void tcmu_init_genl_cmd_reply(struct tcmu_dev *udev, int cmd)
>  
>  	memset(nl_cmd, 0, sizeof(*nl_cmd));
>  	nl_cmd->cmd = cmd;
> -	init_completion(&nl_cmd->complete);
>  
>  	spin_unlock(&udev->nl_cmd_lock);
>  }
> @@ -1573,9 +1579,10 @@ static int tcmu_wait_genl_cmd_reply(struct tcmu_dev *udev)
>  		return 0;
>  
>  	pr_debug("sleeping for nl reply\n");
> -	wait_for_completion(&nl_cmd->complete);
> +	wait_event(udev->complete_wq, nl_cmd->complete);
>  
>  	spin_lock(&udev->nl_cmd_lock);
> +	nl_cmd->complete = false;
>  	nl_cmd->cmd = TCMU_CMD_UNSPEC;
>  	ret = nl_cmd->status;
>  	nl_cmd->status = 0;
> @@ -2323,6 +2330,36 @@ static ssize_t tcmu_block_dev_store(struct config_item *item, const char *page,
>  }
>  CONFIGFS_ATTR(tcmu_, block_dev);
>  
> +static ssize_t tcmu_reset_netlink_store(struct config_item *item, const char *page,
> +				    size_t count)
> +{
> +	struct se_device *se_dev = container_of(to_config_group(item),
> +						struct se_device,
> +						dev_action_group);
> +	struct tcmu_dev *udev = TCMU_DEV(se_dev);
> +	struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd;
> +	u8 val;
> +	int ret;
> +
> +	ret = kstrtou8(page, 0, &val);
> +	if (ret < 0)
> +		return ret;
> +
> +	if (val != 1) {
> +		pr_err("Invalid block value %d\n", val);

I think you wanted

"Invalid reset value %d\n"


> +		return -EINVAL;
> +	}
> +
> +	spin_unlock(&udev->nl_cmd_lock);

Need spin_lock() instead of unlock.


I think before calling the code below you need to check if a nl command
is even waiting. If you just run this with no nl commands waiting then
next time we send a command nl_cmd->complete will be true and
tcmu_wait_genl_cmd_reply's wait_event call will return right away.


> +	nl_cmd->complete = true;
> +	nl_cmd->status = -EINTR;
> +	wake_up(&udev->complete_wq);

Instead of the code above, I think you need to take some of the guts out
of tcmu_genl_cmd_done to handle removal which is an odd cases due to the
refcoutning/locking and configfs use.

For non removal commands we need to do a target_undepend_item. For
remove, we don't want to since we came through configfs and teardown
already started.

So instead of the above lines take:

        if (nl_cmd->cmd != completed_cmd) {
                printk(KERN_ERR "Mismatched commands (Expecting reply
for %d. Current %d).\n",
                       completed_cmd, nl_cmd->cmd);
                ret = -EINVAL;
        } else {
                nl_cmd->status = rc;
        }

	// Note: I changed this from the is_removed
        if (completed_cmd != TCMU_CMD_REMOVED_DEVICE)
                 target_undepend_item(&dev->dev_group.cg_item);
        if (!ret) {
                nl_cmd->complete = true;
                wake_up(&udev->complete_wq);
        }


from tcmu_genl_cmd_done and make a function that takes the status code
and completed_cmd. You would just then do

__tcmu_genl_cmd_done(&udev->curr_nl_cmd.cmd, -EINTR);




> +	spin_unlock(&udev->nl_cmd_lock);
> +
> +	return count;
> +}
> +CONFIGFS_ATTR_WO(tcmu_, reset_netlink);
> +
>  static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
>  				     size_t count)
>  {
> @@ -2363,6 +2400,7 @@ static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
>  static struct configfs_attribute *tcmu_action_attrs[] = {
>  	&tcmu_attr_block_dev,
>  	&tcmu_attr_reset_ring,
> +	&tcmu_attr_reset_netlink,
>  	NULL,
>  };
>  
>

Xiubo Li April 13, 2018, 3:08 a.m. UTC | #2

>> +
>> +	if (val != 1) {
>> +		pr_err("Invalid block value %d\n", val);
> I think you wanted
>
> "Invalid reset value %d\n"
Yeah, just copied it from other place.
>
>> +		return -EINVAL;
>> +	}
>> +
>> +	spin_unlock(&udev->nl_cmd_lock);
> Need spin_lock() instead of unlock.
>
>
> I think before calling the code below you need to check if a nl command
> is even waiting. If you just run this with no nl commands waiting then
> next time we send a command nl_cmd->complete will be true and
> tcmu_wait_genl_cmd_reply's wait_event call will return right away.
Will fix this.

>
>> +	nl_cmd->complete = true;
>> +	nl_cmd->status = -EINTR;
>> +	wake_up(&udev->complete_wq);
> Instead of the code above, I think you need to take some of the guts out
> of tcmu_genl_cmd_done to handle removal which is an odd cases due to the
> refcoutning/locking and configfs use.
>
> For non removal commands we need to do a target_undepend_item. For
> remove, we don't want to since we came through configfs and teardown
> already started.
>
> So instead of the above lines take:
>
>          if (nl_cmd->cmd != completed_cmd) {
>                  printk(KERN_ERR "Mismatched commands (Expecting reply
> for %d. Current %d).\n",
>                         completed_cmd, nl_cmd->cmd);
>                  ret = -EINVAL;
>          } else {
>                  nl_cmd->status = rc;
>          }
>
> 	// Note: I changed this from the is_removed
>          if (completed_cmd != TCMU_CMD_REMOVED_DEVICE)
>                   target_undepend_item(&dev->dev_group.cg_item);
>          if (!ret) {
>                  nl_cmd->complete = true;
>                  wake_up(&udev->complete_wq);
>          }
>
Sorry, this part I couldn't totally understand.

Do you mean take the code above you pasted by introducing 
target_undepend_item() for none removal paths just like in 
tcmu_genl_cmd_done() does?
While in tcmu_genl_cmd_done(), the target_depend_item() is called by 
target_find_device() and then it should do target_undepend_item() after 
that, but in
reset_netlink_store(), should it need target_undepend_item() ? If so 
they won't be in pairs.

I think we should only take this part:

         if (nl_cmd->cmd != completed_cmd) {
                 printk(KERN_ERR "Mismatched commands (Expecting reply
for %d. Current %d).\n",
                        completed_cmd, nl_cmd->cmd);
                 ret = -EINVAL;
         } else {
                 nl_cmd->status = rc;
         }

from the tcmu_genl_cmd_done(), right ?

Thanks,
BRs


> from tcmu_genl_cmd_done and make a function that takes the status code
> and completed_cmd. You would just then do
>
> __tcmu_genl_cmd_done(&udev->curr_nl_cmd.cmd, -EINTR);
>
>
>
>
>> +	spin_unlock(&udev->nl_cmd_lock);
>> +
>> +	return count;
>> +}
>> +CONFIGFS_ATTR_WO(tcmu_, reset_netlink);
>> +
>>   static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
>>   				     size_t count)
>>   {
>> @@ -2363,6 +2400,7 @@ static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
>>   static struct configfs_attribute *tcmu_action_attrs[] = {
>>   	&tcmu_attr_block_dev,
>>   	&tcmu_attr_reset_ring,
>> +	&tcmu_attr_reset_netlink,
>>   	NULL,
>>   };
>>   
>>

Mike Christie April 13, 2018, 5:21 p.m. UTC | #3

On 04/12/2018 10:08 PM, Xiubo Li wrote:
> 
>>> +
>>> +    if (val != 1) {
>>> +        pr_err("Invalid block value %d\n", val);
>> I think you wanted
>>
>> "Invalid reset value %d\n"
> Yeah, just copied it from other place.
>>
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    spin_unlock(&udev->nl_cmd_lock);
>> Need spin_lock() instead of unlock.
>>
>>
>> I think before calling the code below you need to check if a nl command
>> is even waiting. If you just run this with no nl commands waiting then
>> next time we send a command nl_cmd->complete will be true and
>> tcmu_wait_genl_cmd_reply's wait_event call will return right away.
> Will fix this.
> 
>>
>>> +    nl_cmd->complete = true;
>>> +    nl_cmd->status = -EINTR;
>>> +    wake_up(&udev->complete_wq);
>> Instead of the code above, I think you need to take some of the guts out
>> of tcmu_genl_cmd_done to handle removal which is an odd cases due to the
>> refcoutning/locking and configfs use.
>>
>> For non removal commands we need to do a target_undepend_item. For
>> remove, we don't want to since we came through configfs and teardown
>> already started.
>>
>> So instead of the above lines take:
>>
>>          if (nl_cmd->cmd != completed_cmd) {
>>                  printk(KERN_ERR "Mismatched commands (Expecting reply
>> for %d. Current %d).\n",
>>                         completed_cmd, nl_cmd->cmd);
>>                  ret = -EINVAL;
>>          } else {
>>                  nl_cmd->status = rc;
>>          }
>>
>>     // Note: I changed this from the is_removed
>>          if (completed_cmd != TCMU_CMD_REMOVED_DEVICE)
>>                   target_undepend_item(&dev->dev_group.cg_item);
>>          if (!ret) {
>>                  nl_cmd->complete = true;
>>                  wake_up(&udev->complete_wq);
>>          }
>>
> Sorry, this part I couldn't totally understand.
> 
> Do you mean take the code above you pasted by introducing
> target_undepend_item() for none removal paths just like in
> tcmu_genl_cmd_done() does?
> While in tcmu_genl_cmd_done(), the target_depend_item() is called by
> target_find_device() and then it should do target_undepend_item() after
> that, but in
> reset_netlink_store(), should it need target_undepend_item() ? If so
> they won't be in pairs.
> 

Yes, you are right. I thought we took a count when we sent the nl cmd.

> I think we should only take this part:
> 
>         if (nl_cmd->cmd != completed_cmd) {
>                 printk(KERN_ERR "Mismatched commands (Expecting reply
> for %d. Current %d).\n",
>                        completed_cmd, nl_cmd->cmd);
>                 ret = -EINVAL;
>         } else {
>                 nl_cmd->status = rc;
>         }
> 
> from the tcmu_genl_cmd_done(), right ?
> 

We want the complete and wake up part too right?

>>                  nl_cmd->complete = true;
>>                  wake_up(&udev->complete_wq);


> Thanks,
> BRs
> 
> 
>> from tcmu_genl_cmd_done and make a function that takes the status code
>> and completed_cmd. You would just then do
>>
>> __tcmu_genl_cmd_done(&udev->curr_nl_cmd.cmd, -EINTR);
>>
>>
>>
>>
>>> +    spin_unlock(&udev->nl_cmd_lock);
>>> +
>>> +    return count;
>>> +}
>>> +CONFIGFS_ATTR_WO(tcmu_, reset_netlink);
>>> +
>>>   static ssize_t tcmu_reset_ring_store(struct config_item *item,
>>> const char *page,
>>>                        size_t count)
>>>   {
>>> @@ -2363,6 +2400,7 @@ static ssize_t tcmu_reset_ring_store(struct
>>> config_item *item, const char *page,
>>>   static struct configfs_attribute *tcmu_action_attrs[] = {
>>>       &tcmu_attr_block_dev,
>>>       &tcmu_attr_reset_ring,
>>> +    &tcmu_attr_reset_netlink,
>>>       NULL,
>>>   };
>>>  
>

Xiubo Li April 14, 2018, 12:16 a.m. UTC | #4

On 2018/4/14 1:21, Mike Christie wrote:
> On 04/12/2018 10:08 PM, Xiubo Li wrote:
>>>> +
>>>> +    if (val != 1) {
>>>> +        pr_err("Invalid block value %d\n", val);
>>> I think you wanted
>>>
>>> "Invalid reset value %d\n"
>> Yeah, just copied it from other place.
>>>> +        return -EINVAL;
>>>> +    }
>>>> +
>>>> +    spin_unlock(&udev->nl_cmd_lock);
>>> Need spin_lock() instead of unlock.
>>>
>>>
>>> I think before calling the code below you need to check if a nl command
>>> is even waiting. If you just run this with no nl commands waiting then
>>> next time we send a command nl_cmd->complete will be true and
>>> tcmu_wait_genl_cmd_reply's wait_event call will return right away.
>> Will fix this.
>>
>>>> +    nl_cmd->complete = true;
>>>> +    nl_cmd->status = -EINTR;
>>>> +    wake_up(&udev->complete_wq);
>>> Instead of the code above, I think you need to take some of the guts out
>>> of tcmu_genl_cmd_done to handle removal which is an odd cases due to the
>>> refcoutning/locking and configfs use.
>>>
>>> For non removal commands we need to do a target_undepend_item. For
>>> remove, we don't want to since we came through configfs and teardown
>>> already started.
>>>
>>> So instead of the above lines take:
>>>
>>>           if (nl_cmd->cmd != completed_cmd) {
>>>                   printk(KERN_ERR "Mismatched commands (Expecting reply
>>> for %d. Current %d).\n",
>>>                          completed_cmd, nl_cmd->cmd);
>>>                   ret = -EINVAL;
>>>           } else {
>>>                   nl_cmd->status = rc;
>>>           }
>>>
>>>      // Note: I changed this from the is_removed
>>>           if (completed_cmd != TCMU_CMD_REMOVED_DEVICE)
>>>                    target_undepend_item(&dev->dev_group.cg_item);
>>>           if (!ret) {
>>>                   nl_cmd->complete = true;
>>>                   wake_up(&udev->complete_wq);
>>>           }
>>>
>> Sorry, this part I couldn't totally understand.
>>
>> Do you mean take the code above you pasted by introducing
>> target_undepend_item() for none removal paths just like in
>> tcmu_genl_cmd_done() does?
>> While in tcmu_genl_cmd_done(), the target_depend_item() is called by
>> target_find_device() and then it should do target_undepend_item() after
>> that, but in
>> reset_netlink_store(), should it need target_undepend_item() ? If so
>> they won't be in pairs.
>>
> Yes, you are right. I thought we took a count when we sent the nl cmd.
>
>> I think we should only take this part:
>>
>>          if (nl_cmd->cmd != completed_cmd) {
>>                  printk(KERN_ERR "Mismatched commands (Expecting reply
>> for %d. Current %d).\n",
>>                         completed_cmd, nl_cmd->cmd);
>>                  ret = -EINVAL;
>>          } else {
>>                  nl_cmd->status = rc;
>>          }
>>
>> from the tcmu_genl_cmd_done(), right ?
>>
> We want the complete and wake up part too right?
Yeah, I will fix them all you mentioned above.

Thanks,
BRs


>>>                   nl_cmd->complete = true;
>>>                   wake_up(&udev->complete_wq);
>
>> Thanks,
>> BRs
>>
>>
>>> from tcmu_genl_cmd_done and make a function that takes the status code
>>> and completed_cmd. You would just then do
>>>
>>> __tcmu_genl_cmd_done(&udev->curr_nl_cmd.cmd, -EINTR);
>>>
>>>
>>>
>>>
>>>> +    spin_unlock(&udev->nl_cmd_lock);
>>>> +
>>>> +    return count;
>>>> +}
>>>> +CONFIGFS_ATTR_WO(tcmu_, reset_netlink);
>>>> +
>>>>    static ssize_t tcmu_reset_ring_store(struct config_item *item,
>>>> const char *page,
>>>>                         size_t count)
>>>>    {
>>>> @@ -2363,6 +2400,7 @@ static ssize_t tcmu_reset_ring_store(struct
>>>> config_item *item, const char *page,
>>>>    static struct configfs_attribute *tcmu_action_attrs[] = {
>>>>        &tcmu_attr_block_dev,
>>>>        &tcmu_attr_reset_ring,
>>>> +    &tcmu_attr_reset_netlink,
>>>>        NULL,
>>>>    };
>>>>   
> --
> To unsubscribe from this list: send the line "unsubscribe target-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

==============
INFO: task targetctl:22655 blocked for more than 120 seconds.
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
targetctl       D ffff880169718fd0     0 22655  17249 0x00000080
Call Trace:
 [<ffffffff816ab6d9>] schedule+0x29/0x70
 [<ffffffff816a90e9>] schedule_timeout+0x239/0x2c0
 [<ffffffff81574d42>] ? skb_release_data+0xf2/0x140
 [<ffffffff816aba8d>] wait_for_completion+0xfd/0x140
 [<ffffffff810c6440>] ? wake_up_state+0x20/0x20
 [<ffffffffc0159f5a>] tcmu_netlink_event+0x26a/0x3a0 [target_core_user]
 [<ffffffff810b34b0>] ? wake_up_atomic_t+0x30/0x30
 [<ffffffffc015a2c6>] tcmu_configure_device+0x236/0x350 [target_core_user]
 [<ffffffffc05085df>] target_configure_device+0x3f/0x3b0 [target_core_mod]
 [<ffffffffc0502e7c>] target_core_store_dev_enable+0x2c/0x60 [target_core_mod]
 [<ffffffffc0501244>] target_core_dev_store+0x24/0x40 [target_core_mod]
 [<ffffffff8128a0e4>] configfs_write_file+0xc4/0x130
 [<ffffffff81202aed>] vfs_write+0xbd/0x1e0
 [<ffffffff812038ff>] SyS_write+0x7f/0xe0
 [<ffffffff816b89fd>] system_call_fastpath+0x16/0x1b
==============

Be careful of using this, it could reset the normal netlink requesting
operations, so we should use this only when the user space daemon from
starting and just before the daemon could receive and handle the nl
requests.

Changes since v1(suggested by Mike Christie):
v2: - Makes the reset per device.

Signed-off-by: Xiubo Li <xiubli@redhat.com>
---
 drivers/target/target_core_user.c | 52 +++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
index 4ad89ea..7271da8 100644
--- a/drivers/target/target_core_user.c
+++ b/drivers/target/target_core_user.c
@@ -105,7 +105,8 @@  struct tcmu_hba {
 
 struct tcmu_nl_cmd {
 	/* wake up thread waiting for reply */
-	struct completion complete;
+	bool complete;
+
 	int cmd;
 	int status;
 };
@@ -159,9 +160,12 @@  struct tcmu_dev {
 
 	spinlock_t nl_cmd_lock;
 	struct tcmu_nl_cmd curr_nl_cmd;
-	/* wake up threads waiting on curr_nl_cmd */
+	/* wake up threads waiting on nl_cmd_wq */
 	wait_queue_head_t nl_cmd_wq;
 
+	/* complete thread waiting complete_wq */
+	wait_queue_head_t complete_wq;
+
 	char dev_config[TCMU_CONFIG_LEN];
 
 	int nl_reply_supported;
@@ -307,11 +311,13 @@  static int tcmu_genl_cmd_done(struct genl_info *info, int completed_cmd)
 		nl_cmd->status = rc;
 	}
 
-	spin_unlock(&udev->nl_cmd_lock);
 	if (!is_removed)
 		 target_undepend_item(&dev->dev_group.cg_item);
-	if (!ret)
-		complete(&nl_cmd->complete);
+	if (!ret) {
+		nl_cmd->complete = true;
+		wake_up(&udev->complete_wq);
+	}
+	spin_unlock(&udev->nl_cmd_lock);
 	return ret;
 }
 
@@ -1258,6 +1264,7 @@  static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name)
 	timer_setup(&udev->cmd_timer, tcmu_cmd_timedout, 0);
 
 	init_waitqueue_head(&udev->nl_cmd_wq);
+	init_waitqueue_head(&udev->complete_wq);
 	spin_lock_init(&udev->nl_cmd_lock);
 
 	INIT_RADIX_TREE(&udev->data_blocks, GFP_KERNEL);
@@ -1555,7 +1562,6 @@  static void tcmu_init_genl_cmd_reply(struct tcmu_dev *udev, int cmd)
 
 	memset(nl_cmd, 0, sizeof(*nl_cmd));
 	nl_cmd->cmd = cmd;
-	init_completion(&nl_cmd->complete);
 
 	spin_unlock(&udev->nl_cmd_lock);
 }
@@ -1573,9 +1579,10 @@  static int tcmu_wait_genl_cmd_reply(struct tcmu_dev *udev)
 		return 0;
 
 	pr_debug("sleeping for nl reply\n");
-	wait_for_completion(&nl_cmd->complete);
+	wait_event(udev->complete_wq, nl_cmd->complete);
 
 	spin_lock(&udev->nl_cmd_lock);
+	nl_cmd->complete = false;
 	nl_cmd->cmd = TCMU_CMD_UNSPEC;
 	ret = nl_cmd->status;
 	nl_cmd->status = 0;
@@ -2323,6 +2330,36 @@  static ssize_t tcmu_block_dev_store(struct config_item *item, const char *page,
 }
 CONFIGFS_ATTR(tcmu_, block_dev);
 
+static ssize_t tcmu_reset_netlink_store(struct config_item *item, const char *page,
+				    size_t count)
+{
+	struct se_device *se_dev = container_of(to_config_group(item),
+						struct se_device,
+						dev_action_group);
+	struct tcmu_dev *udev = TCMU_DEV(se_dev);
+	struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd;
+	u8 val;
+	int ret;
+
+	ret = kstrtou8(page, 0, &val);
+	if (ret < 0)
+		return ret;
+
+	if (val != 1) {
+		pr_err("Invalid block value %d\n", val);
+		return -EINVAL;
+	}
+
+	spin_unlock(&udev->nl_cmd_lock);
+	nl_cmd->complete = true;
+	nl_cmd->status = -EINTR;
+	wake_up(&udev->complete_wq);
+	spin_unlock(&udev->nl_cmd_lock);
+
+	return count;
+}
+CONFIGFS_ATTR_WO(tcmu_, reset_netlink);
+
 static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
 				     size_t count)
 {
@@ -2363,6 +2400,7 @@  static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page,
 static struct configfs_attribute *tcmu_action_attrs[] = {
 	&tcmu_attr_block_dev,
 	&tcmu_attr_reset_ring,
+	&tcmu_attr_reset_netlink,
 	NULL,
 };

[PATCHv2] tcmu: allow userspace to reset netlink

Commit Message

Comments

Patch