
[17/18] xfs: implement pnfs export operations

Message ID 1420561721-9150-18-git-send-email-hch@lst.de (mailing list archive)
State New, archived

Commit Message

Christoph Hellwig Jan. 6, 2015, 4:28 p.m. UTC
Add operations to export pNFS block layouts from an XFS filesystem.  See
the previous commit adding the operations for an explanation of them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/Makefile     |   1 +
 fs/xfs/xfs_export.c |   6 ++
 fs/xfs/xfs_fsops.c  |   2 +
 fs/xfs/xfs_iops.c   |   2 +-
 fs/xfs/xfs_iops.h   |   1 +
 fs/xfs/xfs_mount.h  |   2 +
 fs/xfs/xfs_pnfs.c   | 270 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_pnfs.h   |  11 +++
 8 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 fs/xfs/xfs_pnfs.c
 create mode 100644 fs/xfs/xfs_pnfs.h

Comments

Dave Chinner Jan. 7, 2015, 12:24 a.m. UTC | #1
On Tue, Jan 06, 2015 at 05:28:40PM +0100, Christoph Hellwig wrote:
> Add operations to export pNFS block layouts from an XFS filesystem.  See
> the previous commit adding the operations for an explanation of them.
.....
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index fdc6422..2b86be8 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -601,6 +601,8 @@ xfs_growfs_data(
>  	if (!mutex_trylock(&mp->m_growlock))
>  		return -EWOULDBLOCK;
>  	error = xfs_growfs_data_private(mp, in);
> +	if (!error)
> +		mp->m_generation++;
>  	mutex_unlock(&mp->m_growlock);
>  	return error;
>  }

I couldn't find an explanation of what this generation number is
for. What are its semantics w.r.t. server crashes?

> +xfs_fs_get_uuid(
> +	struct super_block	*sb,
> +	u8			*buf,
> +	u32			*len,
> +	u64			*offset)
> +{
> +	struct xfs_mount	*mp = XFS_M(sb);
> +
> +	if (*len < sizeof(uuid_t))
> +		return -EINVAL;
> +
> +	memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));

uuid_copy()?

> +	*len = sizeof(uuid_t);
> +	*offset = offsetof(struct xfs_dsb, sb_uuid);
> +	return 0;
> +}
> +
> +static void
> +xfs_map_iomap(
> +	struct xfs_inode	*ip,
> +	struct iomap		*iomap,
> +	struct xfs_bmbt_irec	*imap,
> +	xfs_off_t	offset)

xfs_bmbt_to_iomap()?

> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +
> +	iomap->blkno = -1;
> +	if (imap->br_startblock == HOLESTARTBLOCK)
> +		iomap->type = IOMAP_HOLE;
> +	else if (imap->br_startblock == DELAYSTARTBLOCK)
> +		iomap->type = IOMAP_DELALLOC;
> +	else {
> +		/*
> +		 * the block number in the iomap must match the start offset we
> +		 * place in the iomap.
> +		 */
> +		iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
> +		ASSERT(iomap->blkno || XFS_IS_REALTIME_INODE(ip));
> +		if (imap->br_state == XFS_EXT_UNWRITTEN)
> +			iomap->type = IOMAP_UNWRITTEN;
> +		else
> +			iomap->type = IOMAP_MAPPED;
> +	}
> +	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
> +	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
> +}

Why does this function get passed an offset that is not actually used?

> +static int
> +xfs_fs_update_flags(
> +	struct xfs_inode	*ip)
> +{
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_trans	*tp;
> +	int			error;
> +
> +	/*
> +	 * Update the mode, and prealloc flag bits.
> +	 */
> +	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
> +	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
> +	if (error) {
> +		xfs_trans_cancel(tp, 0);
> +		return error;
> +	}
> +
> +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> +	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> +	ip->i_d.di_mode &= ~S_ISUID;
> +	if (ip->i_d.di_mode & S_IXGRP)
> +		ip->i_d.di_mode &= ~S_ISGID;
> +
> +	ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
> +
> +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> +	return xfs_trans_commit(tp, 0);
> +}

That needs timestamp changes as well. i.e.:

	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

and at that point, it's basically the same code as in
xfs_file_fallocate() and xfs_ioc_space(), so should probably be
factored into a common operation.
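
Roughly what I have in mind, as an untested sketch (helper name made
up here):

static int
xfs_update_prealloc_flags(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);

	/* same mode/prealloc flag updates as the fallocate path */
	ip->i_d.di_mode &= ~S_ISUID;
	if (ip->i_d.di_mode & S_IXGRP)
		ip->i_d.di_mode &= ~S_ISGID;
	ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;

	/* plus the timestamp update that is missing above */
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	return xfs_trans_commit(tp, 0);
}

xfs_file_fallocate(), xfs_ioc_space() and this path could then all
call it.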

> +
> +/*
> + * Get a layout for the pNFS client.
> + *
> + * Note that in the allocation case we do force out the transaction here.
> + * There is no metadata update that is required to be stable for NFS
> + * semantics, and layouts are not valid over a server crash.  Instead
> + * we'll have to be careful in the commit routine as it might pass us
> + * blocks for an allocation that never made it to disk in the recovery
> + * case.

I think you are saying that because block allocation is an async
transaction, then we have to deal with the possibility that we crash
before the transaction hits the disk.

How often do we have to allocate
new blocks like this? Do we need to use async transactions for this
case, or should we simply do the brute force thing (by making the
allocation transaction synchronous) initially and then, if
performance problems arise, optimise from there?

> + */
> +int
> +xfs_fs_map_blocks(
> +	struct inode		*inode,
> +	loff_t			offset,
> +	u64			length,
> +	struct iomap		*iomap,
> +	bool			write,
> +	u32			*device_generation)
> +{
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_bmbt_irec	imap;
> +	xfs_fileoff_t		offset_fsb, end_fsb;
> +	loff_t			limit;
> +	int			bmapi_flags = XFS_BMAPI_ENTIRE;
> +	int			nimaps = 1;
> +	uint			lock_flags;
> +	int			error = 0;
> +
> +	if (XFS_FORCED_SHUTDOWN(mp))
> +		return -EIO;
> +	if (XFS_IS_REALTIME_INODE(ip))
> +		return -ENXIO;
> +
> +	xfs_ilock(ip, XFS_IOLOCK_EXCL);

Why are we locking out IO just to read the block map (needs a
comment)?

> +	if (!write) {
> +		limit = max(round_up(i_size_read(inode),
> +				     inode->i_sb->s_blocksize),
> +			    mp->m_super->s_maxbytes);
> +	} else {
> +		limit = mp->m_super->s_maxbytes;
> +	}

	limit = mp->m_super->s_maxbytes;
	if (!write)
		limit = max(limit, round_up(i_size_read(inode),
					    inode->i_sb->s_blocksize));
> +
> +	error = -EINVAL;
> +	if (offset > limit)
> +		goto out_unlock;
> +	if (offset + length > mp->m_super->s_maxbytes)
> +		length = limit - offset;

Need to catch a wrap through zero...
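
Something along these lines, perhaps (untested):

	error = -EINVAL;
	if (offset > limit)
		goto out_unlock;
	if (offset + length < offset ||		/* catch wrap through zero */
	    offset + length > mp->m_super->s_maxbytes)
		length = limit - offset;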

> +	/*
> +	 * Flush data and truncate the pagecache.  pNFS block clients just
> +	 * like direct I/O access the disk directly.
> +	 */
> +	error = filemap_write_and_wait(inode->i_mapping);
> +	if (error)
> +		goto out_unlock;
> +	invalidate_inode_pages2(inode->i_mapping);

invalidate_inode_pages2() can fail with EBUSY....
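
i.e. the return value wants to be checked, something like:

	error = invalidate_inode_pages2(inode->i_mapping);
	if (error)
		goto out_unlock;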

> +
> +	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
> +	offset_fsb = XFS_B_TO_FSBT(mp, offset);
> +
> +	lock_flags = xfs_ilock_data_map_shared(ip);
> +	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
> +				&imap, &nimaps, bmapi_flags);
> +	xfs_iunlock(ip, lock_flags);
> +
> +	if (error)
> +		goto out_unlock;
> +
> +	if (write) {

ASSERT(imap.br_startblock != DELAYSTARTBLOCK);

> +		if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
> +			error = xfs_iomap_write_direct(ip, offset, length,
> +						       &imap, nimaps);
> +			if (error)
> +				goto out_unlock;
> +		}
> +
> +		error = xfs_fs_update_flags(ip);
> +		if (error)
> +			goto out_unlock;
> +	}
> +	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
> +
> +	xfs_map_iomap(ip, iomap, &imap, offset);
> +	*device_generation = mp->m_generation;

So whenever the server first starts up the generation number in a
map is going to be zero - what purpose does this actually serve?

> +	return error;
> +out_unlock:
> +	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
> +	return error;
> +}
> +
> +/*
> + * Make sure the blocks described by maps are stable on disk.  This includes
> + * converting any unwritten extents, flushing the disk cache and updating the
> + * time stamps.
> + *
> + * Note that we rely on the caller to always send us a timestamp update so that
> + * we always commit a transaction here.  If that stops being true we will have
> + * to manually flush the cache here similar to what the fsync code path does
> + * for datasyncs on files that have no dirty metadata.

Needs an assert.
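
e.g. something like this at the top of xfs_fs_commit_blocks():

	ASSERT(iattr->ia_valid & (ATTR_ATIME | ATTR_CTIME | ATTR_MTIME));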

> + *
> + * In the reclaim case we might get called for blocks that were only allocated
> + * in memory and not on disk.  We rely on the fact that unwritten extent
> + * conversions handle this properly.
> + */

Making allocation transactions synchronous as well will make this
wart go away.

> +int
> +xfs_fs_commit_blocks(
> +	struct inode		*inode,
> +	struct iomap		*maps,
> +	int			nr_maps,
> +	struct iattr		*iattr)
> +{
> +	struct xfs_inode	*ip = XFS_I(inode);
> +	struct xfs_mount	*mp = ip->i_mount;
> +	struct xfs_trans	*tp;
> +	int			error, i;
> +	loff_t			size;
> +
> +	xfs_ilock(ip, XFS_IOLOCK_EXCL);
> +
> +	size = i_size_read(inode);
> +	if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size)
> +		size = iattr->ia_size;
> +
> +	for (i = 0; i < nr_maps; i++) {
> +		u64 start, length, end;
> +
> +		start = maps[i].offset;
> +		if (start > size)
> +			continue;
> +
> +		end = start + maps[i].length;
> +		if (end > size)
> +			end = size;
> +
> +		length = end - start;
> +		if (!length)
> +			continue;
> +
> +		error = xfs_iomap_write_unwritten(ip, start, length);
> +		if (error)
> +			goto out_drop_iolock;
> +	}
> +
> +	/*
> +	 * Make sure reads through the pagecache see the new data.
> +	 */
> +	invalidate_inode_pages2(inode->i_mapping);

Probably should do that first. Also, what happens if there is local
dirty data on the file at this point? Doesn't this just toss them
away?

Cheers,

Dave.
Christoph Hellwig Jan. 7, 2015, 10:40 a.m. UTC | #2
On Wed, Jan 07, 2015 at 11:24:34AM +1100, Dave Chinner wrote:
> > diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> > index fdc6422..2b86be8 100644
> > --- a/fs/xfs/xfs_fsops.c
> > +++ b/fs/xfs/xfs_fsops.c
> > @@ -601,6 +601,8 @@ xfs_growfs_data(
> >  	if (!mutex_trylock(&mp->m_growlock))
> >  		return -EWOULDBLOCK;
> >  	error = xfs_growfs_data_private(mp, in);
> > +	if (!error)
> > +		mp->m_generation++;
> >  	mutex_unlock(&mp->m_growlock);
> >  	return error;
> >  }
> 
> I couldn't find an explanation of what this generation number is
> for. What are its semantics w.r.t. server crashes?

The generation is incremented when we grow the filesystem, so that
a new layout (block mapping) returned to the client refers to the
new NFS device ID, which will make the client aware of the new size.

The device IDs aren't persistent, so after a server crash / reboot
we'll start at zero again.

I'll add comments explaining this to the code.
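
Something along these lines next to m_generation, roughly:

	/*
	 * Bumped on each growfs so that new layouts hand out a new
	 * deviceid, which makes the client re-read the device
	 * information and pick up the new size.  Not persistent;
	 * deviceids are volatile and start from zero again after a
	 * server restart.
	 */
	__uint32_t		m_generation;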

> Why does this function get passed an offset that is not actually used?

Historic reasons..

> > +static int
> > +xfs_fs_update_flags(
> > +	struct xfs_inode	*ip)
> > +{
> > +	struct xfs_mount	*mp = ip->i_mount;
> > +	struct xfs_trans	*tp;
> > +	int			error;
> > +
> > +	/*
> > +	 * Update the mode, and prealloc flag bits.
> > +	 */
> > +	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
> > +	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
> > +	if (error) {
> > +		xfs_trans_cancel(tp, 0);
> > +		return error;
> > +	}
> > +
> > +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> > +	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> > +	ip->i_d.di_mode &= ~S_ISUID;
> > +	if (ip->i_d.di_mode & S_IXGRP)
> > +		ip->i_d.di_mode &= ~S_ISGID;
> > +
> > +	ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
> > +
> > +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> > +	return xfs_trans_commit(tp, 0);
> > +}
> 
> That needs timestamp changes as well. i.e.:
> 
> 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);

The time stamps are only updated when we actually commit the data.
Updating them here might be harmless, but I'll have to dig into the
protocol specification and tests a bit more to check if doing the
additional timestamp update would be harmless.

> > +
> > +/*
> > + * Get a layout for the pNFS client.
> > + *
> > + * Note that in the allocation case we do force out the transaction here.
> > + * There is no metadata update that is required to be stable for NFS
> > + * semantics, and layouts are not valid over a server crash.  Instead
> > + * we'll have to be careful in the commit routine as it might pass us
> > + * blocks for an allocation that never made it to disk in the recovery
> > + * case.
> 
> I think you are saying that because block allocation is an async
> transaction, then we have to deal with the possibility that we crash
> before the transaction hits the disk.
> 
> How often do we have to allocate
> new blocks like this? Do we need to use async transactions for this
> case, or should we simply do the brute force thing (by making the
> allocation transaction synchronous) initially and then, if
> performance problems arise, optimise from there?

Every block allocation from a pNFS client goes through this path, so
yes it is performance critical.

> > +	xfs_map_iomap(ip, iomap, &imap, offset);
> > +	*device_generation = mp->m_generation;
> 
> So whenever the server first starts up the generation number in a
> map is going to be zero - what purpose does this actually serve?

So that we can communicate if a device was grown to the client, which
in this case needs to re-read the device information.

> > +		if (!length)
> > +			continue;
> > +
> > +		error = xfs_iomap_write_unwritten(ip, start, length);
> > +		if (error)
> > +			goto out_drop_iolock;
> > +	}
> > +
> > +	/*
> > +	 * Make sure reads through the pagecache see the new data.
> > +	 */
> > +	invalidate_inode_pages2(inode->i_mapping);
> 
> Probably should do that first. Also, what happens if there is local
> dirty data on the file at this point? Doesn't this just toss them
> away?

If there was local data it will be tossed.  For regular writes this can't
happen because we recall outstanding layouts in the write path.  For
mmap we ignore this problem for now, as a pNFS server should generally
not be used locally.
Dave Chinner Jan. 7, 2015, 9:11 p.m. UTC | #3
On Wed, Jan 07, 2015 at 11:40:10AM +0100, Christoph Hellwig wrote:
> On Wed, Jan 07, 2015 at 11:24:34AM +1100, Dave Chinner wrote:
> > > diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> > > index fdc6422..2b86be8 100644
> > > --- a/fs/xfs/xfs_fsops.c
> > > +++ b/fs/xfs/xfs_fsops.c
> > > @@ -601,6 +601,8 @@ xfs_growfs_data(
> > >  	if (!mutex_trylock(&mp->m_growlock))
> > >  		return -EWOULDBLOCK;
> > >  	error = xfs_growfs_data_private(mp, in);
> > > +	if (!error)
> > > +		mp->m_generation++;
> > >  	mutex_unlock(&mp->m_growlock);
> > >  	return error;
> > >  }
> > 
> > I couldn't find an explanation of what this generation number is
> > for. What are its semantics w.r.t. server crashes?
> 
> The generation is incremented when we grow the filesystem, so that
> a new layout (block mapping) returned to the client refers to the
> new NFS device ID, which will make the client aware of the new size.
> 
> The device IDs aren't persistent, so after a server crash / reboot
> we'll start at zero again.

So what happens if a grow occurs, then the server crashes, and the
client on reboot sees the same generation as before the grow
occurred?

Perhaps it would be better to just initialise the generation with a
random number?
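
i.e. something like

	mp->m_generation = prandom_u32();

at mount time (sketch), so a freshly booted server is unlikely to hand
out the same deviceid generation it used before the crash.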

> I'll add comments explaining this to the code.
> 
> > Why does this function get passed an offset that is not actually used?
> 
> Historic reasons..
> 
> > > +static int
> > > +xfs_fs_update_flags(
> > > +	struct xfs_inode	*ip)
> > > +{
> > > +	struct xfs_mount	*mp = ip->i_mount;
> > > +	struct xfs_trans	*tp;
> > > +	int			error;
> > > +
> > > +	/*
> > > +	 * Update the mode, and prealloc flag bits.
> > > +	 */
> > > +	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
> > > +	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
> > > +	if (error) {
> > > +		xfs_trans_cancel(tp, 0);
> > > +		return error;
> > > +	}
> > > +
> > > +	xfs_ilock(ip, XFS_ILOCK_EXCL);
> > > +	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
> > > +	ip->i_d.di_mode &= ~S_ISUID;
> > > +	if (ip->i_d.di_mode & S_IXGRP)
> > > +		ip->i_d.di_mode &= ~S_ISGID;
> > > +
> > > +	ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
> > > +
> > > +	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
> > > +	return xfs_trans_commit(tp, 0);
> > > +}
> > 
> > That needs timestamp changes as well. i.e.:
> > 
> > 	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
> 
> The time stamps are only updated when we actually commit the data.
> Updating them here might be harmless, but I'll have to dig into the
> protocol specification and tests a bit more to check if doing the
> additional timestamp update would be harmless.
> 
> > > +
> > > +/*
> > > + * Get a layout for the pNFS client.
> > > + *
> > > + * Note that in the allocation case we do force out the transaction here.
> > > + * There is no metadata update that is required to be stable for NFS
> > > + * semantics, and layouts are not valid over a server crash.  Instead
> > > + * we'll have to be careful in the commit routine as it might pass us
> > > + * blocks for an allocation that never made it to disk in the recovery
> > > + * case.
> > 
> > I think you are saying that because block allocation is an async
> > transaction, then we have to deal with the possibility that we crash
> > before the transaction hits the disk.
> > 
> > How often do we have to allocate
> > new blocks like this? Do we need to use async transactions for this
> > case, or should we simply do the brute force thing (by making the
> > allocation transaction synchronous) initially and then, if
> > performance problems arise, optimise from there?
> 
> Every block allocation from a pNFS client goes through this path, so
> yes it is performance critical.

Sure, but how many allocations per second are we expecting to have
to support? We can do tens of thousands of synchronous transactions
per second on luns with non-volatile write caches, so I'm really
wondering how much of a limitation this is going to be in the real
world. Do you have any numbers?

> > So whenever the server first starts up the generation number in a
> > map is going to be zero - what purpose does this actually serve?
> 
> So that we can communicate if a device was grown to the client, which
> in this case needs to re-read the device information.

Why does it need to reread the device information? The layouts that
are handed to it are still going to be valid from the server POV...

> > > +	/*
> > > +	 * Make sure reads through the pagecache see the new data.
> > > +	 */
> > > +	invalidate_inode_pages2(inode->i_mapping);
> > 
> > Probably should do that first. Also, what happens if there is local
> > dirty data on the file at this point? Doesn't this just toss them
> > away?
> 
> If there was local data it will be tossed.  For regular writes this can't
> happen because we recall outstanding layouts in the write path.  For
> mmap we ignore this problem for now, as a pNFS server should generally
> not be used locally.

Comments, please. ;)

Cheers,

Dave.
Christoph Hellwig Jan. 8, 2015, 12:43 p.m. UTC | #4
On Thu, Jan 08, 2015 at 08:11:40AM +1100, Dave Chinner wrote:
> So what happens if a grow occurs, then the server crashes, and the
> client on reboot sees the same generation as before the grow
> occurred?

The client doesn't really see the generation.  It's part of the deviceid,
which is opaque to the client.

If the client sends the opaque device ID that contains the generation
after the grow to a server that has crashed / restarted, the server
will reject it, as the server starts at zero.  That causes the client
to get a new, valid device ID from the server.

Unlike NFS file handles, which are persistent, the device IDs are volatile
handles that can go away (and have really horrible lifetime rules...).
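
Conceptually the filesystem side only stamps each layout with the
generation it was created under, and the incoming deviceid is checked
against the current value, along the lines of this sketch (not the
actual nfsd code, and the error value is just illustrative):

	/* when handing out a layout */
	*device_generation = mp->m_generation;

	/* when the client later uses that deviceid */
	if (generation_from_deviceid != mp->m_generation)
		return -ENODEV;	/* client has to fetch a fresh deviceid */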

> > Every block allocation from a pNFS client goes through this path, so
> > yes it is performance critical.
> 
> Sure, but how many allocations per second are we expecting to have
> to support? We can do tens of thousands of synchronous transactions
> per second on luns with non-volatile write caches, so I'm really
> wondering how much of a limitation this is going to be in the real
> world. Do you have any numbers?

I don't have numbers right now without running specific benchmarks,
but the rate will be about the same as for local XFS use on the same
workload.

> 
> > > So whenever the server first starts up the generation number in a
> > > map is going to be zero - what purpose does this actually serve?
> > 
> > So that we can communicate if a device was grown to the client, which
> > in this case needs to re-read the device information.
> 
> Why does it need to reread the device information? The layouts that
> are handed to it are still going to be valid from the server POV...

The existing layouts are still valid.  But any new layout can reference the
added size, so any new layout needs to point to the new device ID.

Once the client sees the new device ID it needs to get the information for
it, which causes it to re-read the device information.
Dave Chinner Jan. 8, 2015, 9:04 p.m. UTC | #5
On Thu, Jan 08, 2015 at 01:43:27PM +0100, Christoph Hellwig wrote:
> On Thu, Jan 08, 2015 at 08:11:40AM +1100, Dave Chinner wrote:
> > So what happens if a grow occurs, then the server crashes, and the
> > client on reboot sees the same generation as before the grow
> > occurred?
> 
> The client doesn't really see the generation.  It's part of the deviceid,
> which is opaque to the client.
> 
> If the client sends the opaque device ID that contains the generation
> after the grow to a server that has crashed / restarted, the server
> will reject it, as the server starts at zero.  That causes the client
> to get a new, valid device ID from the server.

But if the server fs has a generation number of zero when it
crashes, how does the client tell that it needs a new device ID from
the server?

> Unlike NFS file handles, which are persistent, the device IDs are volatile
> handles that can go away (and have really horrible lifetime rules...).

Right. How the clients detect that "going away" when the device
generation is zero both before and after a server crash is the
question I'm asking....

> > > So that we can communicate if a device was grown to the client, which
> > > in this case needs to re-read the device information.
> > 
> > Why does it need to reread the device information? The layouts that
> > are handed to it are still going to be valid from the server POV...
> 
> The existing layouts are still valid.  But any new layout can reference the
> added size, so any new layout needs to point to the new device ID.
> 
> Once the client sees the new device ID it needs to get the information for
> it, which causes it to re-read the device information.

OK.

Cheers,

Dave.
Christoph Hellwig Jan. 9, 2015, 11:41 a.m. UTC | #6
On Fri, Jan 09, 2015 at 08:04:05AM +1100, Dave Chinner wrote:
> > If the client sends the opaque device ID that contains the generation
> > after the grow to a server that has crashed / restarted, the server
> > will reject it, as the server starts at zero.  That causes the client
> > to get a new, valid device ID from the server.
> 
> But if the server fs has a generation number of zero when it
> crashes, how does the client tell that it needs a new device ID from
> the server?
>
> > Unlike NFS file handles, which are persistent, the device IDs are volatile
> > handles that can go away (and have really horrible lifetime rules...).
> 
> Right. How the clients detect that "going away" when the device
> generation is zero both before and after a server crash is the
> question I'm asking....

The server tells the client by rejecting the operation using the
device ID.
Dave Chinner Jan. 12, 2015, 3:04 a.m. UTC | #7
On Fri, Jan 09, 2015 at 12:41:59PM +0100, Christoph Hellwig wrote:
> On Fri, Jan 09, 2015 at 08:04:05AM +1100, Dave Chinner wrote:
> > > If the client sends the opaque device ID that contains the generation
> > > after the grow to a server that has crashed / restarted, the server
> > > will reject it, as the server starts at zero.  That causes the client
> > > to get a new, valid device ID from the server.
> > 
> > But if the server fs has a generation number of zero when it
> > crashes, how does the client tell that it needs a new device ID from
> > the server?
> >
> > > Unlike NFS file handles, which are persistent, the device IDs are volatile
> > > handles that can go away (and have really horrible lifetime rules...).
> > 
> > Right. How the clients detect that "going away" when the device
> > generation is zero both before and after a server crash is the
> > question I'm asking....
> 
> The server tells the client by rejecting the operation using the
> device ID.

Ok, so:

client				server

get layout
dev id == 0
				grow
				gen++ (=1)
				crash
				....
				gen = 0 (initialised after boot)

commit layout
dev id == 0
				server executes op, even though
				device has changed....

What prevents this? Shouldn't the server be rejecting the commit
layout operation as there was a grow operation between the client
operations?

Cheers,

Dave.
Christoph Hellwig Jan. 14, 2015, 10:08 a.m. UTC | #8
On Mon, Jan 12, 2015 at 02:04:01PM +1100, Dave Chinner wrote:
> Ok, so:
> 
> client				server
> 
> get layout
> dev id == 0
> 				grow
> 				gen++ (=1)
> 				crash
> 				....
> 				gen = 0 (initialised after boot)
> 
> commit layout
> dev id == 0
> 				server executes op, even though
> 				device has changed....
> 
> What prevents this? Shouldn't the server be rejecting the commit
> layout operation as there was a grow operation between the client
> operations?

There is no need to reject the commit.  Grows for the block layout
driver never invalidate existing layouts, as they are purely grow
operations.  The only reason to bother with the generation is to
ensure the client notices when new layouts point into areas it
didn't previously know about.  So the interesting variation of your
scenario above is:

client				server

				grow
 				gen++ (=1)

get layout
dev id == (x, 1)
 				crash
 				....
 				gen = 0 (initialised after boot)
 
commit layout
id == 1

Which will be rejected, and the client either chooses to get a
new layout / deviceID, or just writes the data back through normal
I/O.

Now one interesting case would be a resize that completes in
memory and gets a layout referring to it sent out, but is not committed
to disk, and then another resize to a smaller size happens before the
commit.  Not really practical, but if it happened we could get writes
beyond the end of the filesystem.

I didn't assume this was possible as I assumed growfs to be synchronous,
but it turns out that while we do various synchronous buffer writes, the
transaction isn't actually committed synchronously.

I think we should just make growfs commit the transaction
synchronously to avoid both the pNFS problem and the problem of
growfs potentially updating the secondary superblocks before the
transaction hits the disk.
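
In its simplest form that would just be a

	xfs_trans_set_sync(tp);

on the growfs transaction in xfs_growfs_data_private() before it is
committed (sketch), so the new size is stable on disk before any
layout can reference it.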

Patch

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d617999..df68285 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,3 +121,4 @@  xfs-$(CONFIG_XFS_POSIX_ACL)	+= xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
+xfs-$(CONFIG_NFSD_PNFS)		+= xfs_pnfs.o
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5eb4a14..b97359b 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -30,6 +30,7 @@ 
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_log.h"
+#include "xfs_pnfs.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
@@ -245,4 +246,9 @@  const struct export_operations xfs_export_operations = {
 	.fh_to_parent		= xfs_fs_fh_to_parent,
 	.get_parent		= xfs_fs_get_parent,
 	.commit_metadata	= xfs_fs_nfs_commit_metadata,
+#ifdef CONFIG_NFSD_PNFS
+	.get_uuid		= xfs_fs_get_uuid,
+	.map_blocks		= xfs_fs_map_blocks,
+	.commit_blocks		= xfs_fs_commit_blocks,
+#endif
 };
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fdc6422..2b86be8 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -601,6 +601,8 @@  xfs_growfs_data(
 	if (!mutex_trylock(&mp->m_growlock))
 		return -EWOULDBLOCK;
 	error = xfs_growfs_data_private(mp, in);
+	if (!error)
+		mp->m_generation++;
 	mutex_unlock(&mp->m_growlock);
 	return error;
 }
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c50311c..6ff84e8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -496,7 +496,7 @@  xfs_setattr_mode(
 	inode->i_mode |= mode & ~S_IFMT;
 }
 
-static void
+void
 xfs_setattr_time(
 	struct xfs_inode	*ip,
 	struct iattr		*iattr)
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 1c34e43..ea7a98e 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -32,6 +32,7 @@  extern void xfs_setup_inode(struct xfs_inode *);
  */
 #define XFS_ATTR_NOACL		0x01	/* Don't call posix_acl_chmod */
 
+extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
 extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
 			       int flags);
 extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 22ccf69..aba26c8 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -175,6 +175,8 @@  typedef struct xfs_mount {
 	struct workqueue_struct	*m_reclaim_workqueue;
 	struct workqueue_struct	*m_log_workqueue;
 	struct workqueue_struct *m_eofblocks_workqueue;
+
+	__uint32_t		m_generation;	/* incremented on each growfs */
 } xfs_mount_t;
 
 /*
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
new file mode 100644
index 0000000..d95f596
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.c
@@ -0,0 +1,270 @@ 
+/*
+ * Copyright (c) 2014 Christoph Hellwig.
+ */
+#include "xfs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_iomap.h"
+#include "xfs_shared.h"
+#include "xfs_pnfs.h"
+
+int
+xfs_fs_get_uuid(
+	struct super_block	*sb,
+	u8			*buf,
+	u32			*len,
+	u64			*offset)
+{
+	struct xfs_mount	*mp = XFS_M(sb);
+
+	if (*len < sizeof(uuid_t))
+		return -EINVAL;
+
+	memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
+	*len = sizeof(uuid_t);
+	*offset = offsetof(struct xfs_dsb, sb_uuid);
+	return 0;
+}
+
+static void
+xfs_map_iomap(
+	struct xfs_inode	*ip,
+	struct iomap		*iomap,
+	struct xfs_bmbt_irec	*imap,
+	xfs_off_t	offset)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	iomap->blkno = -1;
+	if (imap->br_startblock == HOLESTARTBLOCK)
+		iomap->type = IOMAP_HOLE;
+	else if (imap->br_startblock == DELAYSTARTBLOCK)
+		iomap->type = IOMAP_DELALLOC;
+	else {
+		/*
+		 * the block number in the iomap must match the start offset we
+		 * place in the iomap.
+		 */
+		iomap->blkno = xfs_fsb_to_db(ip, imap->br_startblock);
+		ASSERT(iomap->blkno || XFS_IS_REALTIME_INODE(ip));
+		if (imap->br_state == XFS_EXT_UNWRITTEN)
+			iomap->type = IOMAP_UNWRITTEN;
+		else
+			iomap->type = IOMAP_MAPPED;
+	}
+	iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
+	iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
+}
+
+static int
+xfs_fs_update_flags(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	/*
+	 * Update the mode, and prealloc flag bits.
+	 */
+	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	ip->i_d.di_mode &= ~S_ISUID;
+	if (ip->i_d.di_mode & S_IXGRP)
+		ip->i_d.di_mode &= ~S_ISGID;
+
+	ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	return xfs_trans_commit(tp, 0);
+}
+
+/*
+ * Get a layout for the pNFS client.
+ *
+ * Note that in the allocation case we do force out the transaction here.
+ * There is no metadata update that is required to be stable for NFS
+ * semantics, and layouts are not valid over a server crash.  Instead
+ * we'll have to be careful in the commit routine as it might pass us
+ * blocks for an allocation that never made it to disk in the recovery
+ * case.
+ */
+int
+xfs_fs_map_blocks(
+	struct inode		*inode,
+	loff_t			offset,
+	u64			length,
+	struct iomap		*iomap,
+	bool			write,
+	u32			*device_generation)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	loff_t			limit;
+	int			bmapi_flags = XFS_BMAPI_ENTIRE;
+	int			nimaps = 1;
+	uint			lock_flags;
+	int			error = 0;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+	if (XFS_IS_REALTIME_INODE(ip))
+		return -ENXIO;
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	if (!write) {
+		limit = max(round_up(i_size_read(inode),
+				     inode->i_sb->s_blocksize),
+			    mp->m_super->s_maxbytes);
+	} else {
+		limit = mp->m_super->s_maxbytes;
+	}
+
+	error = -EINVAL;
+	if (offset > limit)
+		goto out_unlock;
+	if (offset + length > mp->m_super->s_maxbytes)
+		length = limit - offset;
+
+	/*
+	 * Flush data and truncate the pagecache.  pNFS block clients just
+	 * like direct I/O access the disk directly.
+	 */
+	error = filemap_write_and_wait(inode->i_mapping);
+	if (error)
+		goto out_unlock;
+	invalidate_inode_pages2(inode->i_mapping);
+
+	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+
+	lock_flags = xfs_ilock_data_map_shared(ip);
+	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+				&imap, &nimaps, bmapi_flags);
+	xfs_iunlock(ip, lock_flags);
+
+	if (error)
+		goto out_unlock;
+
+	if (write) {
+		if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
+			error = xfs_iomap_write_direct(ip, offset, length,
+						       &imap, nimaps);
+			if (error)
+				goto out_unlock;
+		}
+
+		error = xfs_fs_update_flags(ip);
+		if (error)
+			goto out_unlock;
+	}
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+
+	xfs_map_iomap(ip, iomap, &imap, offset);
+	*device_generation = mp->m_generation;
+	return error;
+out_unlock:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+}
+
+/*
+ * Make sure the blocks described by maps are stable on disk.  This includes
+ * converting any unwritten extents, flushing the disk cache and updating the
+ * time stamps.
+ *
+ * Note that we rely on the caller to always send us a timestamp update so that
+ * we always commit a transaction here.  If that stops being true we will have
+ * to manually flush the cache here similar to what the fsync code path does
+ * for datasyncs on files that have no dirty metadata.
+ *
+ * In the reclaim case we might get called for blocks that were only allocated
+ * in memory and not on disk.  We rely on the fact that unwritten extent
+ * conversions handle this properly.
+ */
+int
+xfs_fs_commit_blocks(
+	struct inode		*inode,
+	struct iomap		*maps,
+	int			nr_maps,
+	struct iattr		*iattr)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error, i;
+	loff_t			size;
+
+	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+	size = i_size_read(inode);
+	if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size)
+		size = iattr->ia_size;
+
+	for (i = 0; i < nr_maps; i++) {
+		u64 start, length, end;
+
+		start = maps[i].offset;
+		if (start > size)
+			continue;
+
+		end = start + maps[i].length;
+		if (end > size)
+			end = size;
+
+		length = end - start;
+		if (!length)
+			continue;
+
+		error = xfs_iomap_write_unwritten(ip, start, length);
+		if (error)
+			goto out_drop_iolock;
+	}
+
+	/*
+	 * Make sure reads through the pagecache see the new data.
+	 */
+	invalidate_inode_pages2(inode->i_mapping);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error)
+		goto out_drop_iolock;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	xfs_setattr_time(ip, iattr);
+	if (iattr->ia_valid & ATTR_SIZE) {
+		if (iattr->ia_size > i_size_read(inode)) {
+			i_size_write(inode, iattr->ia_size);
+			ip->i_d.di_size = iattr->ia_size;
+		}
+	}
+
+	xfs_trans_set_sync(tp);
+	error = xfs_trans_commit(tp, 0);
+
+out_drop_iolock:
+	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	return error;
+}
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
new file mode 100644
index 0000000..0d91255
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.h
@@ -0,0 +1,11 @@ 
+#ifndef _XFS_PNFS_H
+#define _XFS_PNFS_H 1
+
+#ifdef CONFIG_NFSD_PNFS
+int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
+int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
+		struct iomap *iomap, bool write, u32 *device_generation);
+int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
+		struct iattr *iattr);
+#endif /* CONFIG_NFSD_PNFS */
+#endif /* _XFS_PNFS_H */