diff mbox

[v3,5/8] gfs2: Implement iomap buffered write support

Message ID 20180323191728.23819-6-agruenba@redhat.com (mailing list archive)
State New, archived
Headers show

Commit Message

Andreas Gruenbacher March 23, 2018, 7:17 p.m. UTC
With the traditional page-based writes, blocks are allocated separately
for each page written to.  With iomap writes, we can allocate a lot more
blocks at once, with a fraction of the allocation overhead for each
page.

Signed-off-by: Andreas Gruenbacher <agruenba@redhat.com>
---
 fs/gfs2/aops.c |  20 +++--
 fs/gfs2/aops.h |  22 +++++
 fs/gfs2/bmap.c | 263 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 fs/gfs2/file.c |  44 ++++++++--
 4 files changed, 325 insertions(+), 24 deletions(-)
 create mode 100644 fs/gfs2/aops.h

Comments

Bob Peterson April 6, 2018, 2:44 p.m. UTC | #1
Hi,

----- Original Message -----
(snip)
> +static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, loff_t
> length,
> +				  unsigned flags, struct iomap *iomap)
(snip)
> +{
> +	struct metapath mp = { .mp_aheight = 1, };
> +	struct gfs2_inode *ip = GFS2_I(inode);
> +	struct gfs2_sbd *sdp = GFS2_SB(inode);
> +	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
> +	bool unstuff, alloc_required;
> +	int ret;
> +
> +	ret = gfs2_write_lock(inode);
> +	if (ret)
> +		return ret;
> +
> +	unstuff = gfs2_is_stuffed(ip) &&
> +		  pos + length > gfs2_max_stuffed_size(ip);
> +
> +	ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
> +	if (ret)
> +		goto out_release;
> +
> +	alloc_required = unstuff || iomap->type != IOMAP_MAPPED;
> +
> +	if (alloc_required || gfs2_is_jdata(ip))
> +		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks, &ind_blocks);

It would be ideal if the iomap_get could tell us how many blocks are mapped
and how many are unmapped, if it wouldn't cause unnecessary IO.
If we could use the number of unmapped blocks rather than iomap->length, it could
potentially save us from reserving unnecessary journal space, for example, on
rewrites (iow where the metadata for the writes already exists).
(snip)

Bob Peterson
Andreas Grünbacher April 6, 2018, 3:15 p.m. UTC | #2
2018-04-06 16:44 GMT+02:00 Bob Peterson <rpeterso@redhat.com>:
> Hi,
>
> ----- Original Message -----
> (snip)
>> +static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, loff_t
>> length,
>> +                               unsigned flags, struct iomap *iomap)
> (snip)
>> +{
>> +     struct metapath mp = { .mp_aheight = 1, };
>> +     struct gfs2_inode *ip = GFS2_I(inode);
>> +     struct gfs2_sbd *sdp = GFS2_SB(inode);
>> +     unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
>> +     bool unstuff, alloc_required;
>> +     int ret;
>> +
>> +     ret = gfs2_write_lock(inode);
>> +     if (ret)
>> +             return ret;
>> +
>> +     unstuff = gfs2_is_stuffed(ip) &&
>> +               pos + length > gfs2_max_stuffed_size(ip);
>> +
>> +     ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
>> +     if (ret)
>> +             goto out_release;
>> +
>> +     alloc_required = unstuff || iomap->type != IOMAP_MAPPED;
>> +
>> +     if (alloc_required || gfs2_is_jdata(ip))
>> +             gfs2_write_calc_reserv(ip, iomap->length, &data_blocks, &ind_blocks);
>
> It would be ideal if the iomap_get could tell us how many blocks are mapped
> and how many are unmapped, if it wouldn't cause unnecessary IO.
> If we could use the number of unmapped blocks rather than iomap->length, it could
> potentially save us from reserving unnecessary journal space, for example, on
> rewrites (iow where the metadata for the writes already exists).
> (snip)

Yes, I'll change gfs2_iomap_get to return the number of blocks
gfs2_iomap_alloc should be able to allocate. It's a simple variant of
hole_size; the maximum will be an entire indirect block worth of data.

Thanks,
Andreas
diff mbox

Patch

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a1c6b5de9200..3d9633175aa8 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -22,6 +22,7 @@ 
 #include <linux/backing-dev.h>
 #include <linux/uio.h>
 #include <trace/events/writeback.h>
+#include <linux/sched/signal.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -36,10 +37,11 @@ 
 #include "super.h"
 #include "util.h"
 #include "glops.h"
+#include "aops.h"
 
 
-static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
-				   unsigned int from, unsigned int len)
+void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+			    unsigned int from, unsigned int len)
 {
 	struct buffer_head *head = page_buffers(page);
 	unsigned int bsize = head->b_size;
@@ -462,7 +464,7 @@  static int gfs2_jdata_writepages(struct address_space *mapping,
  * Returns: errno
  */
 
-static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
+int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
 {
 	struct buffer_head *dibh;
 	u64 dsize = i_size_read(&ip->i_inode);
@@ -773,7 +775,7 @@  static int gfs2_write_begin(struct file *file, struct address_space *mapping,
  * adjust_fs_space - Adjusts the free space available due to gfs2_grow
  * @inode: the rindex inode
  */
-static void adjust_fs_space(struct inode *inode)
+void adjust_fs_space(struct inode *inode)
 {
 	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
 	struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
@@ -819,11 +821,11 @@  static void adjust_fs_space(struct inode *inode)
  * This copies the data from the page into the inode block after
  * the inode data structure itself.
  *
- * Returns: errno
+ * Returns: copied bytes or errno
  */
-static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
-				  loff_t pos, unsigned copied,
-				  struct page *page)
+int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
+			   loff_t pos, unsigned copied,
+			   struct page *page)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	u64 to = pos + copied;
@@ -862,7 +864,7 @@  static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
  * The main write_end function for GFS2. We just put our locking around the VFS
  * provided functions.
  *
- * Returns: errno
+ * Returns: copied bytes or errno
  */
 
 static int gfs2_write_end(struct file *file, struct address_space *mapping,
diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h
new file mode 100644
index 000000000000..976bb32dd405
--- /dev/null
+++ b/fs/gfs2/aops.h
@@ -0,0 +1,22 @@ 
+/*
+ * Copyright (C) 2017 Red Hat, Inc.  All rights reserved.
+ *
+ * This copyrighted material is made available to anyone wishing to use,
+ * modify, copy, or redistribute it subject to the terms and conditions
+ * of the GNU General Public License version 2.
+ */
+
+#ifndef __AOPS_DOT_H__
+#define __AOPS_DOT_H__
+
+#include "incore.h"
+
+extern int stuffed_readpage(struct gfs2_inode *ip, struct page *page);
+extern int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
+				  loff_t pos, unsigned copied,
+				  struct page *page);
+extern void adjust_fs_space(struct inode *inode);
+extern void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page,
+				   unsigned int from, unsigned int len);
+
+#endif /* __AOPS_DOT_H__ */
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1ed7bc022d7b..b6ee5252c014 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -28,6 +28,7 @@ 
 #include "trans.h"
 #include "dir.h"
 #include "util.h"
+#include "aops.h"
 #include "trace_gfs2.h"
 
 /* This doesn't need to be that large as max 64 bit pointers in a 4k
@@ -41,6 +42,8 @@  struct metapath {
 	int mp_aheight; /* actual height (lookup height) */
 };
 
+static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
+
 /**
  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  * @ip: the inode
@@ -375,7 +378,7 @@  static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 	return mp->mp_aheight - x - 1;
 }
 
-static inline void release_metapath(struct metapath *mp)
+static void release_metapath(struct metapath *mp)
 {
 	int i;
 
@@ -383,6 +386,7 @@  static inline void release_metapath(struct metapath *mp)
 		if (mp->mp_bh[i] == NULL)
 			break;
 		brelse(mp->mp_bh[i]);
+		mp->mp_bh[i] = NULL;
 	}
 }
 
@@ -675,14 +679,15 @@  static u64 hole_size(struct inode *inode, sector_t lblock, struct metapath *mp)
 	return holesz << inode->i_blkbits;
 }
 
-static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap)
+static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap,
+			       u64 length)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 
 	iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
 		      sizeof(struct gfs2_dinode);
 	iomap->offset = 0;
-	iomap->length = i_size_read(inode);
+	iomap->length = length;
 	iomap->type = IOMAP_MAPPED;
 	iomap->flags = IOMAP_F_DATA_INLINE;
 }
@@ -722,10 +727,15 @@  static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 		if (flags & IOMAP_REPORT) {
 			if (pos >= i_size_read(inode))
 				return -ENOENT;
-			gfs2_stuffed_iomap(inode, iomap);
+			gfs2_stuffed_iomap(inode, iomap,
+					   i_size_read(inode));
+			return 0;
+		}
+		if (pos + length <= gfs2_max_stuffed_size(ip)) {
+			gfs2_stuffed_iomap(inode, iomap,
+					   gfs2_max_stuffed_size(ip));
 			return 0;
 		}
-		BUG_ON(!(flags & IOMAP_WRITE));
 	}
 	lblock = pos >> inode->i_blkbits;
 	lblock_stop = (pos + length - 1) >> inode->i_blkbits;
@@ -796,9 +806,140 @@  static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 		else
 			iomap->length = size - pos;
 	}
+	/* FIXME: Should we limit iomap->length to the maximum allocation size
+	 * here according to how gfs2_iomap_alloc allocates blocks? */
 	goto out;
 }
 
+static int gfs2_write_lock(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	int error;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh);
+	error = gfs2_glock_nq(&ip->i_gh);
+	if (error)
+		goto out_uninit;
+	if (&ip->i_inode == sdp->sd_rindex) {
+		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+		error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE,
+					   GL_NOCACHE, &m_ip->i_gh);
+		if (error)
+			goto out_unlock;
+	}
+	return 0;
+
+out_unlock:
+	gfs2_glock_dq(&ip->i_gh);
+out_uninit:
+	gfs2_holder_uninit(&ip->i_gh);
+	return error;
+}
+
+static void gfs2_write_unlock(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+	if (&ip->i_inode == sdp->sd_rindex) {
+		struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode);
+
+		gfs2_glock_dq_uninit(&m_ip->i_gh);
+	}
+	gfs2_glock_dq_uninit(&ip->i_gh);
+}
+
+static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos, loff_t length,
+				  unsigned flags, struct iomap *iomap)
+{
+	struct metapath mp = { .mp_aheight = 1, };
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	unsigned int data_blocks = 0, ind_blocks = 0, rblocks;
+	bool unstuff, alloc_required;
+	int ret;
+
+	ret = gfs2_write_lock(inode);
+	if (ret)
+		return ret;
+
+	unstuff = gfs2_is_stuffed(ip) &&
+		  pos + length > gfs2_max_stuffed_size(ip);
+
+	ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
+	if (ret)
+		goto out_release;
+
+	alloc_required = unstuff || iomap->type != IOMAP_MAPPED;
+
+	if (alloc_required || gfs2_is_jdata(ip))
+		gfs2_write_calc_reserv(ip, iomap->length, &data_blocks, &ind_blocks);
+
+	if (alloc_required) {
+		struct gfs2_alloc_parms ap = { .target = data_blocks + ind_blocks };
+		ret = gfs2_quota_lock_check(ip, &ap);
+		if (ret)
+			goto out_release;
+
+		ret = gfs2_inplace_reserve(ip, &ap);
+		if (ret)
+			goto out_qunlock;
+	}
+
+	rblocks = RES_DINODE + ind_blocks;
+	if (gfs2_is_jdata(ip))
+		rblocks += data_blocks;
+	if (ind_blocks || data_blocks)
+		rblocks += RES_STATFS + RES_QUOTA;
+	if (inode == sdp->sd_rindex)
+		rblocks += 2 * RES_STATFS;
+	if (alloc_required)
+		rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
+
+	ret = gfs2_trans_begin(sdp, rblocks, iomap->length >> inode->i_blkbits);
+	if (ret)
+		goto out_trans_fail;
+
+	if (unstuff) {
+		ret = gfs2_unstuff_dinode(ip, NULL);
+		if (ret)
+			goto out_trans_end;
+		release_metapath(&mp);
+		ret = gfs2_iomap_get(inode, iomap->offset, iomap->length,
+				     flags, iomap, &mp);
+		if (ret)
+			goto out_trans_end;
+	}
+
+	if (iomap->type != IOMAP_MAPPED) {
+		ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
+		if (ret) {
+			gfs2_trans_end(sdp);
+			if (alloc_required)
+				gfs2_inplace_release(ip);
+			punch_hole(ip, iomap->offset, iomap->length);
+			goto out_qunlock;
+		}
+	}
+	release_metapath(&mp);
+	return 0;
+
+out_trans_end:
+	gfs2_trans_end(sdp);
+out_trans_fail:
+	if (alloc_required)
+		gfs2_inplace_release(ip);
+out_qunlock:
+	if (alloc_required)
+		gfs2_quota_unlock(ip);
+out_release:
+	release_metapath(&mp);
+	gfs2_write_unlock(inode);
+	return ret;
+}
+
 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 			    unsigned flags, struct iomap *iomap)
 {
@@ -808,10 +949,7 @@  static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 
 	trace_gfs2_iomap_start(ip, pos, length, flags);
 	if (flags & IOMAP_WRITE) {
-		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
-		if (!ret && iomap->type == IOMAP_HOLE)
-			ret = gfs2_iomap_alloc(inode, iomap, flags, &mp);
-		release_metapath(&mp);
+		ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap);
 	} else {
 		ret = gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
 		release_metapath(&mp);
@@ -820,8 +958,115 @@  static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
 	return ret;
 }
 
+static int
+gfs2_iomap_write_begin(struct inode *inode, loff_t pos, unsigned len,
+		       unsigned flags, struct page **pagep, struct iomap *iomap)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct page *page;
+	int ret;
+
+	if (gfs2_is_stuffed(ip)) {
+		BUG_ON(pos + len > gfs2_max_stuffed_size(ip));
+
+		page = grab_cache_page_write_begin(inode->i_mapping, 0, flags);
+		if (!page)
+			return -ENOMEM;
+
+		if (!PageUptodate(page)) {
+			ret = stuffed_readpage(ip, page);
+			if (ret) {
+				unlock_page(page);
+				put_page(page);
+				return ret;
+			}
+		}
+		*pagep = page;
+		return 0;
+	}
+
+	return iomap_write_begin(inode, pos, len, flags, pagep, iomap);
+}
+
+static int gfs2_iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
+				unsigned copied, struct page *page)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct buffer_head *dibh;
+	int ret;
+
+	if (gfs2_is_stuffed(ip)) {
+		ret = gfs2_meta_inode_buffer(ip, &dibh);
+		if (ret) {
+			unlock_page(page);
+			put_page(page);
+			return ret;
+		}
+		ret = gfs2_stuffed_write_end(inode, dibh, pos, copied, page);
+		brelse(dibh);
+		return ret;
+	}
+
+	if (gfs2_is_jdata(ip))
+		gfs2_page_add_databufs(ip, page, offset_in_page(pos), len);
+
+	return iomap_write_end(inode, pos, len, copied, page);
+}
+
+static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+			  ssize_t written, unsigned flags, struct iomap *iomap)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_trans *tr = current->journal_info;
+
+	if (!(flags & IOMAP_WRITE))
+		return 0;
+
+	gfs2_ordered_add_inode(ip);
+
+	if (tr->tr_num_buf_new)
+		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+	else {
+		struct buffer_head *dibh;
+		int ret;
+
+		ret = gfs2_meta_inode_buffer(ip, &dibh);
+		if (unlikely(ret))
+			return ret;
+		gfs2_trans_add_meta(ip->i_gl, dibh);
+		brelse(dibh);
+	}
+
+	if (inode == sdp->sd_rindex) {
+		adjust_fs_space(inode);
+		sdp->sd_rindex_uptodate = 0;
+	}
+
+	gfs2_trans_end(sdp);
+	gfs2_inplace_release(ip);
+
+	if (length != written && (iomap->flags & IOMAP_F_NEW)) {
+		/* Deallocate blocks that were just allocated. */
+		loff_t end = round_down(pos + length, PAGE_SIZE);
+		pos = round_up(pos, PAGE_SIZE);
+		if (pos < end) {
+			truncate_pagecache_range(inode, pos, end - 1);
+			punch_hole(ip, pos, end - pos);
+		}
+	}
+
+	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
+		gfs2_quota_unlock(ip);
+	gfs2_write_unlock(inode);
+	return 0;
+}
+
 const struct iomap_ops gfs2_iomap_ops = {
 	.iomap_begin = gfs2_iomap_begin,
+	.write_begin = gfs2_iomap_write_begin,
+	.write_end = gfs2_iomap_write_end,
+	.iomap_end = gfs2_iomap_end,
 };
 
 /**
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 2edd3a9a7b79..d1b54e781577 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -26,10 +26,12 @@ 
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
 #include <linux/delay.h>
+#include <linux/backing-dev.h>
 
 #include "gfs2.h"
 #include "incore.h"
 #include "bmap.h"
+#include "aops.h"
 #include "dir.h"
 #include "glock.h"
 #include "glops.h"
@@ -691,9 +693,7 @@  static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 /**
  * gfs2_file_write_iter - Perform a write to a file
  * @iocb: The io context
- * @iov: The data to write
- * @nr_segs: Number of @iov segments
- * @pos: The file position
+ * @from: The data to write
  *
  * We have to do a lock/unlock here to refresh the inode size for
  * O_APPEND writes, otherwise we can land up writing at the wrong
@@ -705,8 +705,9 @@  static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
-	struct gfs2_inode *ip = GFS2_I(file_inode(file));
-	int ret;
+	struct inode *inode = file->f_mapping->host;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	ssize_t ret;
 
 	ret = gfs2_rsqa_alloc(ip);
 	if (ret)
@@ -723,7 +724,38 @@  static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		gfs2_glock_dq_uninit(&gh);
 	}
 
-	return generic_file_write_iter(iocb, from);
+	if (iocb->ki_flags & IOCB_DIRECT)
+		return generic_file_write_iter(iocb, from);
+
+	inode_lock(inode);
+	ret = generic_write_checks(iocb, from);
+	if (ret <= 0)
+		goto out;
+
+	ret = file_remove_privs(file);
+	if (ret)
+		goto out;
+
+	ret = file_update_time(file);
+	if (ret)
+		goto out;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = inode_to_bdi(inode);
+
+	ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+
+	current->backing_dev_info = NULL;
+
+out:
+	inode_unlock(inode);
+	if (likely(ret > 0)) {
+		iocb->ki_pos += ret;
+
+		/* Handle various SYNC-type writes */
+		ret = generic_write_sync(iocb, ret);
+	}
+	return ret;
 }
 
 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,