diff mbox

block: Fix S_DAX inode flag locking

Message ID 1463509797-10324-2-git-send-email-jonathan.derrick@intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Jon Derrick May 17, 2016, 6:29 p.m. UTC
This patch fixes S_DAX bd_inode i_flag locking to conform to suggested
locking rules. It is presumed that S_DAX is the only valid inode flag
for a block device which subscribes to direct-access, and will restore
any previously set flags if direct-access initialization fails.

This reverts to i_flags behavior prior to
bbab37ddc20bae4709bca8745c128c4f46fe63c5
by allowing other bd_inode flags when DAX is disabled

Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
---
 fs/block_dev.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

Comments

Dan Williams May 17, 2016, 7:34 p.m. UTC | #1
On Tue, May 17, 2016 at 11:29 AM, Jon Derrick
<jonathan.derrick@intel.com> wrote:
> This patch fixes S_DAX bd_inode i_flag locking to conform to suggested

A "fix" implies that its currently broken.  I don't see how it is, not
until we add an ioctl method or other path that also tries to update
the flags outside of blkdev_get() context.  So, I don't think this
patch stands on its own if you were intending it to be merged
separately.

> locking rules. It is presumed that S_DAX is the only valid inode flag
> for a block device which subscribes to direct-access, and will restore
> any previously set flags if direct-access initialization fails.
>
> This reverts to i_flags behavior prior to
> bbab37ddc20bae4709bca8745c128c4f46fe63c5
> by allowing other bd_inode flags when DAX is disabled
>
> Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
> ---
>  fs/block_dev.c | 31 ++++++++++++++++++++++++++-----
>  1 file changed, 26 insertions(+), 5 deletions(-)
>
> diff --git a/fs/block_dev.c b/fs/block_dev.c
> index 20a2c02..d41e37f 100644
> --- a/fs/block_dev.c
> +++ b/fs/block_dev.c
> @@ -1159,6 +1159,20 @@ void bd_set_size(struct block_device *bdev, loff_t size)
>  }
>  EXPORT_SYMBOL(bd_set_size);
>
> +static void bd_add_dax(struct inode *inode)
> +{
> +       inode_lock(inode);
> +       inode->i_flags |= S_DAX;
> +       inode_unlock(inode);
> +}
> +
> +static void bd_clear_dax(struct inode *inode)
> +{
> +       inode_lock(inode);
> +       inode->i_flags &= ~S_DAX;
> +       inode_unlock(inode);
> +}

Since this is inode generic should these helpers be prefixed "i_"
rather than "bd_"?

> +
>  static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
>
>  /*
> @@ -1172,6 +1186,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>  {
>         struct gendisk *disk;
>         struct module *owner;
> +       struct inode *inode;
>         int ret;
>         int partno;
>         int perm = 0;
> @@ -1198,6 +1213,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>         if (!disk)
>                 goto out;
>         owner = disk->fops->owner;
> +       inode = bdev->bd_inode;
>
>         disk_block_events(disk);
>         mutex_lock_nested(&bdev->bd_mutex, for_part);
> @@ -1206,9 +1222,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>                 bdev->bd_queue = disk->queue;
>                 bdev->bd_contains = bdev;
>                 if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
> -                       bdev->bd_inode->i_flags = S_DAX;
> -               else
> -                       bdev->bd_inode->i_flags = 0;
> +                       bd_add_dax(inode);
>
>                 if (!partno) {
>                         ret = -ENXIO;
> @@ -1228,6 +1242,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>                                         bdev->bd_part = NULL;
>                                         bdev->bd_disk = NULL;
>                                         bdev->bd_queue = NULL;
> +                                       bd_clear_dax(inode);
>                                         mutex_unlock(&bdev->bd_mutex);
>                                         disk_unblock_events(disk);
>                                         put_disk(disk);
> @@ -1239,7 +1254,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>                         if (!ret) {
>                                 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
>                                 if (!blkdev_dax_capable(bdev))
> -                                       bdev->bd_inode->i_flags &= ~S_DAX;
> +                                       bd_clear_dax(inode);
>                         }
>
>                         /*
> @@ -1276,7 +1291,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>                         }
>                         bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
>                         if (!blkdev_dax_capable(bdev))
> -                               bdev->bd_inode->i_flags &= ~S_DAX;
> +                               bd_clear_dax(inode);
>                 }
>         } else {
>                 if (bdev->bd_contains == bdev) {
> @@ -1297,6 +1312,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>                 put_disk(disk);
>                 module_put(owner);
>         }
> +       inode_lock(inode);
> +       if (inode->i_flags & S_DAX)
> +               inode->i_flags = S_DAX;
> +       inode_unlock(inode);
> +

Clear all other flags if S_DAX is set? Why?

>         bdev->bd_openers++;
>         if (for_part)
>                 bdev->bd_part_count++;
> @@ -1309,6 +1329,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
>         bdev->bd_disk = NULL;
>         bdev->bd_part = NULL;
>         bdev->bd_queue = NULL;
> +       bd_clear_dax(inode);

Why?
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Jon Derrick May 17, 2016, 8:38 p.m. UTC | #2
Hi Dan,

Thanks for the review,


On Tue, May 17, 2016 at 12:34:57PM -0700, Dan Williams wrote:
> On Tue, May 17, 2016 at 11:29 AM, Jon Derrick
> <jonathan.derrick@intel.com> wrote:
> > This patch fixes S_DAX bd_inode i_flag locking to conform to suggested
> 
> A "fix" implies that its currently broken.  I don't see how it is, not
> until we add an ioctl method or other path that also tries to update
> the flags outside of blkdev_get() context.  So, I don't think this
> patch stands on its own if you were intending it to be merged
> separately.
Are there no other paths that can set/clear a bd_inode->i_flags?
If not, that's fine and I'll pack this into the next HIPRI set

> 
> > locking rules. It is presumed that S_DAX is the only valid inode flag
> > for a block device which subscribes to direct-access, and will restore
> > any previously set flags if direct-access initialization fails.
> >
> > This reverts to i_flags behavior prior to
> > bbab37ddc20bae4709bca8745c128c4f46fe63c5
> > by allowing other bd_inode flags when DAX is disabled
> >
> > Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
> > ---
> >  fs/block_dev.c | 31 ++++++++++++++++++++++++++-----
> >  1 file changed, 26 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/block_dev.c b/fs/block_dev.c
> > index 20a2c02..d41e37f 100644
> > --- a/fs/block_dev.c
> > +++ b/fs/block_dev.c
> > @@ -1159,6 +1159,20 @@ void bd_set_size(struct block_device *bdev, loff_t size)
> >  }
> >  EXPORT_SYMBOL(bd_set_size);
> >
> > +static void bd_add_dax(struct inode *inode)
> > +{
> > +       inode_lock(inode);
> > +       inode->i_flags |= S_DAX;
> > +       inode_unlock(inode);
> > +}
> > +
> > +static void bd_clear_dax(struct inode *inode)
> > +{
> > +       inode_lock(inode);
> > +       inode->i_flags &= ~S_DAX;
> > +       inode_unlock(inode);
> > +}
> 
> Since this is inode generic should these helpers be prefixed "i_"
> rather than "bd_"?
That's fine. bd_acquire/forget also use the inode in its parameter list, but its simple enough to use the bdev instead for these functions or i_ as you suggest

> 
> > +
> >  static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
> >
> >  /*
> > @@ -1172,6 +1186,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
> >  {
> >         struct gendisk *disk;
> >         struct module *owner;
> > +       struct inode *inode;
> >         int ret;
> >         int partno;
> >         int perm = 0;
> > @@ -1198,6 +1213,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
> >         if (!disk)
> >                 goto out;
> >         owner = disk->fops->owner;
> > +       inode = bdev->bd_inode;
> >
> >         disk_block_events(disk);
> >         mutex_lock_nested(&bdev->bd_mutex, for_part);
> > @@ -1206,9 +1222,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
> >                 bdev->bd_queue = disk->queue;
> >                 bdev->bd_contains = bdev;
> >                 if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
> > -                       bdev->bd_inode->i_flags = S_DAX;
> > -               else
> > -                       bdev->bd_inode->i_flags = 0;
> > +                       bd_add_dax(inode);
> >
> >                 if (!partno) {
> >                         ret = -ENXIO;
> > @@ -1228,6 +1242,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
> >                                         bdev->bd_part = NULL;
> >                                         bdev->bd_disk = NULL;
> >                                         bdev->bd_queue = NULL;
> > +                                       bd_clear_dax(inode);
> >                                         mutex_unlock(&bdev->bd_mutex);
> >                                         disk_unblock_events(disk);
> >                                         put_disk(disk);
> > @@ -1239,7 +1254,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
> >                         if (!ret) {
> >                                 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
> >                                 if (!blkdev_dax_capable(bdev))
> > -                                       bdev->bd_inode->i_flags &= ~S_DAX;
> > +                                       bd_clear_dax(inode);
> >                         }
> >
> >                         /*
> > @@ -1276,7 +1291,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
> >                         }
> >                         bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
> >                         if (!blkdev_dax_capable(bdev))
> > -                               bdev->bd_inode->i_flags &= ~S_DAX;
> > +                               bd_clear_dax(inode);
> >                 }
> >         } else {
> >                 if (bdev->bd_contains == bdev) {
> > @@ -1297,6 +1312,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
> >                 put_disk(disk);
> >                 module_put(owner);
> >         }
> > +       inode_lock(inode);
> > +       if (inode->i_flags & S_DAX)
> > +               inode->i_flags = S_DAX;
> > +       inode_unlock(inode);
> > +
> 
> Clear all other flags if S_DAX is set? Why?
Code today sets i_flags = S_DAX, clearing all other flags, so I figured that's how it's supposed to be. Though I can't see any reason it has to be that way.


> 
> >         bdev->bd_openers++;
> >         if (for_part)
> >                 bdev->bd_part_count++;
> > @@ -1309,6 +1329,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
> >         bdev->bd_disk = NULL;
> >         bdev->bd_part = NULL;
> >         bdev->bd_queue = NULL;
> > +       bd_clear_dax(inode);
> 
> Why?
As far as I can tell, this path is only for cleanup when a whole block device or partition block device fails to get. Your question makes me think I don't understand this part of the code correctly, and admittedly I don't understand it all that well.

All callers of out_clear set an error code on ret, so I assumed all instances of out_clear are errors. There's no reason to keep the S_DAX flag on the bdev this time around. If it resolves itself (for example, GENHD_FL_UP gets set), it can be set again on the next __blkdev_get pass. 
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Dave Chinner May 17, 2016, 10:58 p.m. UTC | #3
On Tue, May 17, 2016 at 12:34:57PM -0700, Dan Williams wrote:
> On Tue, May 17, 2016 at 11:29 AM, Jon Derrick
> <jonathan.derrick@intel.com> wrote:
> > This patch fixes S_DAX bd_inode i_flag locking to conform to suggested
> 
> A "fix" implies that its currently broken.  I don't see how it is, not
> until we add an ioctl method or other path that also tries to update
> the flags outside of blkdev_get() context.  So, I don't think this
> patch stands on its own if you were intending it to be merged
> separately.
> 
> > locking rules. It is presumed that S_DAX is the only valid inode flag
> > for a block device which subscribes to direct-access, and will restore
> > any previously set flags if direct-access initialization fails.
> >
> > This reverts to i_flags behavior prior to
> > bbab37ddc20bae4709bca8745c128c4f46fe63c5
> > by allowing other bd_inode flags when DAX is disabled
> >
> > Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
> > ---
> >  fs/block_dev.c | 31 ++++++++++++++++++++++++++-----
> >  1 file changed, 26 insertions(+), 5 deletions(-)
> >
> > diff --git a/fs/block_dev.c b/fs/block_dev.c
> > index 20a2c02..d41e37f 100644
> > --- a/fs/block_dev.c
> > +++ b/fs/block_dev.c
> > @@ -1159,6 +1159,20 @@ void bd_set_size(struct block_device *bdev, loff_t size)
> >  }
> >  EXPORT_SYMBOL(bd_set_size);
> >
> > +static void bd_add_dax(struct inode *inode)
> > +{
> > +       inode_lock(inode);
> > +       inode->i_flags |= S_DAX;
> > +       inode_unlock(inode);
> > +}
> > +
> > +static void bd_clear_dax(struct inode *inode)
> > +{
> > +       inode_lock(inode);
> > +       inode->i_flags &= ~S_DAX;
> > +       inode_unlock(inode);
> > +}
> 
> Since this is inode generic should these helpers be prefixed "i_"
> rather than "bd_"?

Probably not, because in general filesystems are responsible for
updating i_flags to reflect on-disk inode configuration and that's
typically done under transaction contexts. e.g.  through ioctl
interfaces to set/clear flags that are stored on disk.  As such,
inode->i_flags is effectively protected by the filesystem specific
locking heirarchy, not the generic inode_lock().

e.g. have a look at XFS storing a persistent "DAX-enabled" flag in
the inode, which can be set/cleared on individual inodes dynamically
by FS_IOC_FSSETXATTR. The XFS i_flags update function assumes
exclusive access to the field as it is called under locked
transaction context. Similar code exists in ext4, btrfs, gfs2,
etc....

Cheers,

Dave.
Jan Kara May 18, 2016, 8:20 a.m. UTC | #4
On Wed 18-05-16 08:58:42, Dave Chinner wrote:
> On Tue, May 17, 2016 at 12:34:57PM -0700, Dan Williams wrote:
> > On Tue, May 17, 2016 at 11:29 AM, Jon Derrick
> > <jonathan.derrick@intel.com> wrote:
> > > This patch fixes S_DAX bd_inode i_flag locking to conform to suggested
> > 
> > A "fix" implies that its currently broken.  I don't see how it is, not
> > until we add an ioctl method or other path that also tries to update
> > the flags outside of blkdev_get() context.  So, I don't think this
> > patch stands on its own if you were intending it to be merged
> > separately.
> > 
> > > locking rules. It is presumed that S_DAX is the only valid inode flag
> > > for a block device which subscribes to direct-access, and will restore
> > > any previously set flags if direct-access initialization fails.
> > >
> > > This reverts to i_flags behavior prior to
> > > bbab37ddc20bae4709bca8745c128c4f46fe63c5
> > > by allowing other bd_inode flags when DAX is disabled
> > >
> > > Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
> > > ---
> > >  fs/block_dev.c | 31 ++++++++++++++++++++++++++-----
> > >  1 file changed, 26 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/fs/block_dev.c b/fs/block_dev.c
> > > index 20a2c02..d41e37f 100644
> > > --- a/fs/block_dev.c
> > > +++ b/fs/block_dev.c
> > > @@ -1159,6 +1159,20 @@ void bd_set_size(struct block_device *bdev, loff_t size)
> > >  }
> > >  EXPORT_SYMBOL(bd_set_size);
> > >
> > > +static void bd_add_dax(struct inode *inode)
> > > +{
> > > +       inode_lock(inode);
> > > +       inode->i_flags |= S_DAX;
> > > +       inode_unlock(inode);
> > > +}
> > > +
> > > +static void bd_clear_dax(struct inode *inode)
> > > +{
> > > +       inode_lock(inode);
> > > +       inode->i_flags &= ~S_DAX;
> > > +       inode_unlock(inode);
> > > +}
> > 
> > Since this is inode generic should these helpers be prefixed "i_"
> > rather than "bd_"?
> 
> Probably not, because in general filesystems are responsible for
> updating i_flags to reflect on-disk inode configuration and that's
> typically done under transaction contexts. e.g.  through ioctl
> interfaces to set/clear flags that are stored on disk.  As such,
> inode->i_flags is effectively protected by the filesystem specific
> locking heirarchy, not the generic inode_lock().
>
> e.g. have a look at XFS storing a persistent "DAX-enabled" flag in
> the inode, which can be set/cleared on individual inodes dynamically
> by FS_IOC_FSSETXATTR. The XFS i_flags update function assumes
> exclusive access to the field as it is called under locked
> transaction context. Similar code exists in ext4, btrfs, gfs2,
> etc....

So in case of ext4, we actually do use inode_lock() to protect against
racing IOC_SETFLAGS calls. i_flags is a strange mix and there are a few
(like S_DEAD or S_NOSEC) which are not persistent and those get set /
cleared by VFS. Some of those places (e.g. the clearing in
__vfs_setxattr_noperm()) are not really controlled by the filesystem
AFAICT. So when XFS doesn't use inode_lock() to protect i_flags updates it
can race with VFS on i_flags updates.

								Honza
Jan Kara May 18, 2016, 8:29 a.m. UTC | #5
On Tue 17-05-16 14:38:05, Jon Derrick wrote:
> On Tue, May 17, 2016 at 12:34:57PM -0700, Dan Williams wrote:
> > On Tue, May 17, 2016 at 11:29 AM, Jon Derrick
> > <jonathan.derrick@intel.com> wrote:
> > > This patch fixes S_DAX bd_inode i_flag locking to conform to suggested
> > 
> > A "fix" implies that its currently broken.  I don't see how it is, not
> > until we add an ioctl method or other path that also tries to update
> > the flags outside of blkdev_get() context.  So, I don't think this
> > patch stands on its own if you were intending it to be merged
> > separately.
> Are there no other paths that can set/clear a bd_inode->i_flags?
> If not, that's fine and I'll pack this into the next HIPRI set

So for block device inodes I don't think anybody else is currently messing
with i_flags but using some lock (and inode_lock() looks fine to me) for
that seems like a reasonable future-proofing. But I think it is enough to
send these patches together with your patch set if it introduces another
user of i_flags in block device inodes. It is always better to see a
concrete reason for the change.

> > > +       inode_lock(inode);
> > > +       if (inode->i_flags & S_DAX)
> > > +               inode->i_flags = S_DAX;
> > > +       inode_unlock(inode);
> > > +
> > 
> > Clear all other flags if S_DAX is set? Why?
>
> Code today sets i_flags = S_DAX, clearing all other flags, so I figured
> that's how it's supposed to be. Though I can't see any reason it has to
> be that way.

IMO that's just over-cautious code and there's no good reason for that.
Inodes are initialized with i_flags == 0. Some i_flags may be set by
previous openers of the block device inode but then these are presumably
consistent. So I'd just remove this zeroing.

> > > @@ -1309,6 +1329,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
> > >         bdev->bd_disk = NULL;
> > >         bdev->bd_part = NULL;
> > >         bdev->bd_queue = NULL;
> > > +       bd_clear_dax(inode);
> > 
> > Why?
> As far as I can tell, this path is only for cleanup when a whole block
> device or partition block device fails to get. Your question makes me
> think I don't understand this part of the code correctly, and admittedly
> I don't understand it all that well.
> 
> All callers of out_clear set an error code on ret, so I assumed all
> instances of out_clear are errors. There's no reason to keep the S_DAX
> flag on the bdev this time around. If it resolves itself (for example,
> GENHD_FL_UP gets set), it can be set again on the next __blkdev_get pass. 

Yeah, I agree. When we fail to properly set up the block device inode, it
probably makes sense to clear S_DAX just to be sure...

								Honza
Dave Chinner May 18, 2016, 2:32 p.m. UTC | #6
On Wed, May 18, 2016 at 10:20:39AM +0200, Jan Kara wrote:
> On Wed 18-05-16 08:58:42, Dave Chinner wrote:
> > On Tue, May 17, 2016 at 12:34:57PM -0700, Dan Williams wrote:
> > > On Tue, May 17, 2016 at 11:29 AM, Jon Derrick
> > > <jonathan.derrick@intel.com> wrote:
> > > > This patch fixes S_DAX bd_inode i_flag locking to conform to suggested
> > > 
> > > A "fix" implies that its currently broken.  I don't see how it is, not
> > > until we add an ioctl method or other path that also tries to update
> > > the flags outside of blkdev_get() context.  So, I don't think this
> > > patch stands on its own if you were intending it to be merged
> > > separately.
> > > 
> > > > locking rules. It is presumed that S_DAX is the only valid inode flag
> > > > for a block device which subscribes to direct-access, and will restore
> > > > any previously set flags if direct-access initialization fails.
> > > >
> > > > This reverts to i_flags behavior prior to
> > > > bbab37ddc20bae4709bca8745c128c4f46fe63c5
> > > > by allowing other bd_inode flags when DAX is disabled
> > > >
> > > > Signed-off-by: Jon Derrick <jonathan.derrick@intel.com>
> > > > ---
> > > >  fs/block_dev.c | 31 ++++++++++++++++++++++++++-----
> > > >  1 file changed, 26 insertions(+), 5 deletions(-)
> > > >
> > > > diff --git a/fs/block_dev.c b/fs/block_dev.c
> > > > index 20a2c02..d41e37f 100644
> > > > --- a/fs/block_dev.c
> > > > +++ b/fs/block_dev.c
> > > > @@ -1159,6 +1159,20 @@ void bd_set_size(struct block_device *bdev, loff_t size)
> > > >  }
> > > >  EXPORT_SYMBOL(bd_set_size);
> > > >
> > > > +static void bd_add_dax(struct inode *inode)
> > > > +{
> > > > +       inode_lock(inode);
> > > > +       inode->i_flags |= S_DAX;
> > > > +       inode_unlock(inode);
> > > > +}
> > > > +
> > > > +static void bd_clear_dax(struct inode *inode)
> > > > +{
> > > > +       inode_lock(inode);
> > > > +       inode->i_flags &= ~S_DAX;
> > > > +       inode_unlock(inode);
> > > > +}
> > > 
> > > Since this is inode generic should these helpers be prefixed "i_"
> > > rather than "bd_"?
> > 
> > Probably not, because in general filesystems are responsible for
> > updating i_flags to reflect on-disk inode configuration and that's
> > typically done under transaction contexts. e.g.  through ioctl
> > interfaces to set/clear flags that are stored on disk.  As such,
> > inode->i_flags is effectively protected by the filesystem specific
> > locking heirarchy, not the generic inode_lock().
> >
> > e.g. have a look at XFS storing a persistent "DAX-enabled" flag in
> > the inode, which can be set/cleared on individual inodes dynamically
> > by FS_IOC_FSSETXATTR. The XFS i_flags update function assumes
> > exclusive access to the field as it is called under locked
> > transaction context. Similar code exists in ext4, btrfs, gfs2,
> > etc....
> 
> So in case of ext4, we actually do use inode_lock() to protect against
> racing IOC_SETFLAGS calls. i_flags is a strange mix and there are a few
> (like S_DEAD or S_NOSEC) which are not persistent and those get set /
> cleared by VFS. Some of those places (e.g. the clearing in
> __vfs_setxattr_noperm()) are not really controlled by the filesystem
> AFAICT. So when XFS doesn't use inode_lock() to protect i_flags updates it
> can race with VFS on i_flags updates.

There are several other filesystems with the same problem. It's not
actually clear what they rules for i_flags are. The comment above
inode_set_flags() is ambiguous at best, and even points this out:

 * In the long run, i_mutex is overkill, and we should probably look
 * at using the i_lock spinlock to protect i_flags, and then make sure
 * it is so documented in include/linux/fs.h and that all code follows
 * the locking convention!!

Perhaps that should be done before the situation gets worse...

Cheers

Dave.
diff mbox

Patch

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 20a2c02..d41e37f 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1159,6 +1159,20 @@  void bd_set_size(struct block_device *bdev, loff_t size)
 }
 EXPORT_SYMBOL(bd_set_size);
 
+static void bd_add_dax(struct inode *inode)
+{
+	inode_lock(inode);
+	inode->i_flags |= S_DAX;
+	inode_unlock(inode);
+}
+
+static void bd_clear_dax(struct inode *inode)
+{
+	inode_lock(inode);
+	inode->i_flags &= ~S_DAX;
+	inode_unlock(inode);
+}
+
 static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
 
 /*
@@ -1172,6 +1186,7 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 {
 	struct gendisk *disk;
 	struct module *owner;
+	struct inode *inode;
 	int ret;
 	int partno;
 	int perm = 0;
@@ -1198,6 +1213,7 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	if (!disk)
 		goto out;
 	owner = disk->fops->owner;
+	inode = bdev->bd_inode;
 
 	disk_block_events(disk);
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
@@ -1206,9 +1222,7 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
 		if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
-			bdev->bd_inode->i_flags = S_DAX;
-		else
-			bdev->bd_inode->i_flags = 0;
+			bd_add_dax(inode);
 
 		if (!partno) {
 			ret = -ENXIO;
@@ -1228,6 +1242,7 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 					bdev->bd_part = NULL;
 					bdev->bd_disk = NULL;
 					bdev->bd_queue = NULL;
+					bd_clear_dax(inode);
 					mutex_unlock(&bdev->bd_mutex);
 					disk_unblock_events(disk);
 					put_disk(disk);
@@ -1239,7 +1254,7 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
 				if (!blkdev_dax_capable(bdev))
-					bdev->bd_inode->i_flags &= ~S_DAX;
+					bd_clear_dax(inode);
 			}
 
 			/*
@@ -1276,7 +1291,7 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 			}
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
 			if (!blkdev_dax_capable(bdev))
-				bdev->bd_inode->i_flags &= ~S_DAX;
+				bd_clear_dax(inode);
 		}
 	} else {
 		if (bdev->bd_contains == bdev) {
@@ -1297,6 +1312,11 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		put_disk(disk);
 		module_put(owner);
 	}
+	inode_lock(inode);
+	if (inode->i_flags & S_DAX)
+		inode->i_flags = S_DAX;
+	inode_unlock(inode);
+
 	bdev->bd_openers++;
 	if (for_part)
 		bdev->bd_part_count++;
@@ -1309,6 +1329,7 @@  static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	bdev->bd_disk = NULL;
 	bdev->bd_part = NULL;
 	bdev->bd_queue = NULL;
+	bd_clear_dax(inode);
 	if (bdev != bdev->bd_contains)
 		__blkdev_put(bdev->bd_contains, mode, 1);
 	bdev->bd_contains = NULL;