Message ID | 20190108193942.29715-1-idryomov@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | rbd: don't return 0 on unmap if RBD_DEV_FLAG_REMOVING is set | expand |
On 01/09/2019 03:39 AM, Ilya Dryomov wrote: > There is a window between when RBD_DEV_FLAG_REMOVING is set and when > the device is removed from rbd_dev_list. During this window, we set > "already" and return 0. > > Returning 0 from write(2) can confuse userspace tools because > 0 indicates that nothing was written. In particular, "rbd unmap" > will retry the write multiple times a second: > > 10:28:05.463299 write(4, "0", 1) = 0 > 10:28:05.463509 write(4, "0", 1) = 0 > 10:28:05.463720 write(4, "0", 1) = 0 > 10:28:05.463942 write(4, "0", 1) = 0 > 10:28:05.464155 write(4, "0", 1) = 0 > > Cc: stable@vger.kernel.org > Signed-off-by: Ilya Dryomov <idryomov@gmail.com> > --- > drivers/block/rbd.c | 9 ++++----- > 1 file changed, 4 insertions(+), 5 deletions(-) > > diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c > index 63f73e8a1ab5..2f91dee0ab5f 100644 > --- a/drivers/block/rbd.c > +++ b/drivers/block/rbd.c > @@ -5986,7 +5986,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus, > struct list_head *tmp; > int dev_id; > char opt_buf[6]; > - bool already = false; > bool force = false; > int ret; > > @@ -6019,13 +6018,13 @@ static ssize_t do_rbd_remove(struct bus_type *bus, > spin_lock_irq(&rbd_dev->lock); > if (rbd_dev->open_count && !force) > ret = -EBUSY; > - else > - already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, > - &rbd_dev->flags); > + else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING, > + &rbd_dev->flags)) > + ret = -EINPROGRESS; > spin_unlock_irq(&rbd_dev->lock); > } > spin_unlock(&rbd_dev_list_lock); > - if (ret < 0 || already) > + if (ret) > return ret; Hi Ilya, Reviewed and Tested. test step as below: (1) change in ceph: diff --git a/src/common/safe_io.c b/src/common/safe_io.c index 633e3b3..af6175f 100644 --- a/src/common/safe_io.c +++ b/src/common/safe_io.c @@ -62,6 +62,7 @@ ssize_t safe_write(int fd, const void *buf, size_t count) continue; return -errno; } + printf("write: %lu\n", r); count -= r; buf = (char *)buf + r; } (2) script: rbd map test rbd unmap test & rbd unmap test exit * result in upstream: write: 0 write: 0 write: 0 write: 0 write: 0 write: 0 write: 0 write: 0 write: 0 write: 0 write: 0 rbd: sysfs write failed rbd: unmap failed: (2) No such file or directory * result with this patch: write: 102 rbd: sysfs write failed rbd: unmap failed: (115) Operation now in progress But I found another thing in do_rbd_remove(). We can use list_for_each_entry() rather than list_for_each() + list_entry(). diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8e5140b..d2ec454 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5983,7 +5983,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus, size_t count) { struct rbd_device *rbd_dev = NULL; - struct list_head *tmp; int dev_id; char opt_buf[6]; bool already = false; @@ -6008,8 +6007,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus, ret = -ENOENT; spin_lock(&rbd_dev_list_lock); - list_for_each(tmp, &rbd_dev_list) { - rbd_dev = list_entry(tmp, struct rbd_device, node); + list_for_each_entry(rbd_dev, &rbd_dev_list, node) { if (rbd_dev->dev_id == dev_id) { ret = 0; break; Just a cleanup, do you think it worth a separate cleanup patch? Thanx > > if (force) {
On Wed, Jan 9, 2019 at 3:07 AM Dongsheng Yang <dongsheng.yang@easystack.cn> wrote: > > On 01/09/2019 03:39 AM, Ilya Dryomov wrote: > > There is a window between when RBD_DEV_FLAG_REMOVING is set and when > > the device is removed from rbd_dev_list. During this window, we set > > "already" and return 0. > > > > Returning 0 from write(2) can confuse userspace tools because > > 0 indicates that nothing was written. In particular, "rbd unmap" > > will retry the write multiple times a second: > > > > 10:28:05.463299 write(4, "0", 1) = 0 > > 10:28:05.463509 write(4, "0", 1) = 0 > > 10:28:05.463720 write(4, "0", 1) = 0 > > 10:28:05.463942 write(4, "0", 1) = 0 > > 10:28:05.464155 write(4, "0", 1) = 0 > > > > Cc: stable@vger.kernel.org > > Signed-off-by: Ilya Dryomov <idryomov@gmail.com> > > --- > > drivers/block/rbd.c | 9 ++++----- > > 1 file changed, 4 insertions(+), 5 deletions(-) > > > > diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c > > index 63f73e8a1ab5..2f91dee0ab5f 100644 > > --- a/drivers/block/rbd.c > > +++ b/drivers/block/rbd.c > > @@ -5986,7 +5986,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus, > > struct list_head *tmp; > > int dev_id; > > char opt_buf[6]; > > - bool already = false; > > bool force = false; > > int ret; > > > > @@ -6019,13 +6018,13 @@ static ssize_t do_rbd_remove(struct bus_type *bus, > > spin_lock_irq(&rbd_dev->lock); > > if (rbd_dev->open_count && !force) > > ret = -EBUSY; > > - else > > - already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, > > - &rbd_dev->flags); > > + else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING, > > + &rbd_dev->flags)) > > + ret = -EINPROGRESS; > > spin_unlock_irq(&rbd_dev->lock); > > } > > spin_unlock(&rbd_dev_list_lock); > > - if (ret < 0 || already) > > + if (ret) > > return ret; > > Hi Ilya, > Reviewed and Tested. test step as below: > > (1) change in ceph: > diff --git a/src/common/safe_io.c b/src/common/safe_io.c > index 633e3b3..af6175f 100644 > --- a/src/common/safe_io.c > +++ b/src/common/safe_io.c > @@ -62,6 +62,7 @@ ssize_t safe_write(int fd, const void *buf, size_t count) > continue; > return -errno; > } > + printf("write: %lu\n", r); > count -= r; > buf = (char *)buf + r; > } > > (2) script: > > rbd map test > rbd unmap test & rbd unmap test > exit > > * result in upstream: > write: 0 > write: 0 > write: 0 > write: 0 > write: 0 > write: 0 > write: 0 > write: 0 > write: 0 > write: 0 > write: 0 > rbd: sysfs write failed > rbd: unmap failed: (2) No such file or directory > > * result with this patch: > write: 102 This 102 is from "rbd map", right? The second "rbd unmap" couldn't have reached your printf. > rbd: sysfs write failed > rbd: unmap failed: (115) Operation now in progress > > But I found another thing in do_rbd_remove(). We can use > list_for_each_entry() > rather than list_for_each() + list_entry(). > > diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c > index 8e5140b..d2ec454 100644 > --- a/drivers/block/rbd.c > +++ b/drivers/block/rbd.c > @@ -5983,7 +5983,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus, > size_t count) > { > struct rbd_device *rbd_dev = NULL; > - struct list_head *tmp; > int dev_id; > char opt_buf[6]; > bool already = false; > @@ -6008,8 +6007,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus, > > ret = -ENOENT; > spin_lock(&rbd_dev_list_lock); > - list_for_each(tmp, &rbd_dev_list) { > - rbd_dev = list_entry(tmp, struct rbd_device, node); > + list_for_each_entry(rbd_dev, &rbd_dev_list, node) { > if (rbd_dev->dev_id == dev_id) { > ret = 0; > break; > > Just a cleanup, do you think it worth a separate cleanup patch? No, there is nothing wrong with that. It's done that way in many places in the kernel. Thanks, Ilya
On 01/09/2019 05:44 PM, Ilya Dryomov wrote: > On Wed, Jan 9, 2019 at 3:07 AM Dongsheng Yang > <dongsheng.yang@easystack.cn> wrote: >> On 01/09/2019 03:39 AM, Ilya Dryomov wrote: >>> There is a window between when RBD_DEV_FLAG_REMOVING is set and when >>> the device is removed from rbd_dev_list. During this window, we set >>> "already" and return 0. >>> >>> Returning 0 from write(2) can confuse userspace tools because >>> 0 indicates that nothing was written. In particular, "rbd unmap" >>> will retry the write multiple times a second: >>> >>> 10:28:05.463299 write(4, "0", 1) = 0 >>> 10:28:05.463509 write(4, "0", 1) = 0 >>> 10:28:05.463720 write(4, "0", 1) = 0 >>> 10:28:05.463942 write(4, "0", 1) = 0 >>> 10:28:05.464155 write(4, "0", 1) = 0 >>> >>> Cc: stable@vger.kernel.org >>> Signed-off-by: Ilya Dryomov <idryomov@gmail.com> >>> --- >>> drivers/block/rbd.c | 9 ++++----- >>> 1 file changed, 4 insertions(+), 5 deletions(-) >>> >>> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c >>> index 63f73e8a1ab5..2f91dee0ab5f 100644 >>> --- a/drivers/block/rbd.c >>> +++ b/drivers/block/rbd.c >>> @@ -5986,7 +5986,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus, >>> struct list_head *tmp; >>> int dev_id; >>> char opt_buf[6]; >>> - bool already = false; >>> bool force = false; >>> int ret; >>> >>> @@ -6019,13 +6018,13 @@ static ssize_t do_rbd_remove(struct bus_type *bus, >>> spin_lock_irq(&rbd_dev->lock); >>> if (rbd_dev->open_count && !force) >>> ret = -EBUSY; >>> - else >>> - already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, >>> - &rbd_dev->flags); >>> + else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING, >>> + &rbd_dev->flags)) >>> + ret = -EINPROGRESS; >>> spin_unlock_irq(&rbd_dev->lock); >>> } >>> spin_unlock(&rbd_dev_list_lock); >>> - if (ret < 0 || already) >>> + if (ret) >>> return ret; >> Hi Ilya, >> Reviewed and Tested. test step as below: >> >> (1) change in ceph: >> diff --git a/src/common/safe_io.c b/src/common/safe_io.c >> index 633e3b3..af6175f 100644 >> --- a/src/common/safe_io.c >> +++ b/src/common/safe_io.c >> @@ -62,6 +62,7 @@ ssize_t safe_write(int fd, const void *buf, size_t count) >> continue; >> return -errno; >> } >> + printf("write: %lu\n", r); >> count -= r; >> buf = (char *)buf + r; >> } >> >> (2) script: >> >> rbd map test >> rbd unmap test & rbd unmap test >> exit >> >> * result in upstream: >> write: 0 >> write: 0 >> write: 0 >> write: 0 >> write: 0 >> write: 0 >> write: 0 >> write: 0 >> write: 0 >> write: 0 >> write: 0 >> rbd: sysfs write failed >> rbd: unmap failed: (2) No such file or directory >> >> * result with this patch: >> write: 102 > This 102 is from "rbd map", right? The second "rbd unmap" couldn't > have reached your printf. My bad, that's totally a noise. 102 is a message "...WARNING: all dangerous and experimental features are enabled" because I ran this in my vstart env without CEPH_DEV. So update the result as below: $ export CEPH_DEV=true $ sh -x test.sh + rbd map test write: 86 /dev/rbd0 write: 1 + rbd unmap test + rbd unmap test rbd: sysfs write failed rbd: unmap failed: (115) Operation now in progress write: 1 write: 1 write: 1 + exit > >> rbd: sysfs write failed >> rbd: unmap failed: (115) Operation now in progress >> >> But I found another thing in do_rbd_remove(). We can use >> list_for_each_entry() >> rather than list_for_each() + list_entry(). >> >> diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c >> index 8e5140b..d2ec454 100644 >> --- a/drivers/block/rbd.c >> +++ b/drivers/block/rbd.c >> @@ -5983,7 +5983,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus, >> size_t count) >> { >> struct rbd_device *rbd_dev = NULL; >> - struct list_head *tmp; >> int dev_id; >> char opt_buf[6]; >> bool already = false; >> @@ -6008,8 +6007,7 @@ static ssize_t do_rbd_remove(struct bus_type *bus, >> >> ret = -ENOENT; >> spin_lock(&rbd_dev_list_lock); >> - list_for_each(tmp, &rbd_dev_list) { >> - rbd_dev = list_entry(tmp, struct rbd_device, node); >> + list_for_each_entry(rbd_dev, &rbd_dev_list, node) { >> if (rbd_dev->dev_id == dev_id) { >> ret = 0; >> break; >> >> Just a cleanup, do you think it worth a separate cleanup patch? > No, there is nothing wrong with that. It's done that way in many > places in the kernel. Okey, Thanx > > Thanks, > > Ilya >
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 63f73e8a1ab5..2f91dee0ab5f 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5986,7 +5986,6 @@ static ssize_t do_rbd_remove(struct bus_type *bus, struct list_head *tmp; int dev_id; char opt_buf[6]; - bool already = false; bool force = false; int ret; @@ -6019,13 +6018,13 @@ static ssize_t do_rbd_remove(struct bus_type *bus, spin_lock_irq(&rbd_dev->lock); if (rbd_dev->open_count && !force) ret = -EBUSY; - else - already = test_and_set_bit(RBD_DEV_FLAG_REMOVING, - &rbd_dev->flags); + else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING, + &rbd_dev->flags)) + ret = -EINPROGRESS; spin_unlock_irq(&rbd_dev->lock); } spin_unlock(&rbd_dev_list_lock); - if (ret < 0 || already) + if (ret) return ret; if (force) {
There is a window between when RBD_DEV_FLAG_REMOVING is set and when the device is removed from rbd_dev_list. During this window, we set "already" and return 0. Returning 0 from write(2) can confuse userspace tools because 0 indicates that nothing was written. In particular, "rbd unmap" will retry the write multiple times a second: 10:28:05.463299 write(4, "0", 1) = 0 10:28:05.463509 write(4, "0", 1) = 0 10:28:05.463720 write(4, "0", 1) = 0 10:28:05.463942 write(4, "0", 1) = 0 10:28:05.464155 write(4, "0", 1) = 0 Cc: stable@vger.kernel.org Signed-off-by: Ilya Dryomov <idryomov@gmail.com> --- drivers/block/rbd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-)