diff mbox series

[RFC] iomap: Remove indirect function call

Message ID 20200328155156.GS22483@bombadil.infradead.org (mailing list archive)
State New, archived
Headers show
Series [RFC] iomap: Remove indirect function call | expand

Commit Message

Matthew Wilcox March 28, 2020, 3:51 p.m. UTC
By splitting iomap_apply into __iomap_apply and an inline iomap_apply,
we convert the call to 'actor' into a direct function call.  I haven't
done any performance measurements, but given the current costs of indirect
function calls, this would seem worthwhile to me?

Comments

Christoph Hellwig March 31, 2020, 7:46 a.m. UTC | #1
On Sat, Mar 28, 2020 at 08:51:56AM -0700, Matthew Wilcox wrote:
> By splitting iomap_apply into __iomap_apply and an inline iomap_apply,
> we convert the call to 'actor' into a direct function call.  I haven't
> done any performance measurements, but given the current costs of indirect
> function calls, this would seem worthwhile to me?

Hmm.  Given the amount of compiler stupidity we are dealing with, did
you at least look at the assembly output to see if this actually removes
the indirect call?  I wouldn't be quite sure.
Matthew Wilcox March 31, 2020, 1:02 p.m. UTC | #2
On Tue, Mar 31, 2020 at 12:46:28AM -0700, Christoph Hellwig wrote:
> On Sat, Mar 28, 2020 at 08:51:56AM -0700, Matthew Wilcox wrote:
> > By splitting iomap_apply into __iomap_apply and an inline iomap_apply,
> > we convert the call to 'actor' into a direct function call.  I haven't
> > done any performance measurements, but given the current costs of indirect
> > function calls, this would seem worthwhile to me?
> 
> Hmm.  Given the amount of compiler stupidity we are dealing with, did
> you at least look at the assembly output to see if this actually removes
> the indirect call?  I wouldn't be quite sure.

If it does get inlined, then the compiler does it:

     b9d:       e8 ae fe ff ff          callq  a50 <iomap_page_mkwrite_actor>

If not, then the compiler emits a function in each file called iomap_apply
which makes an indirect call.  So s/inline/__always_inline/ in the original
patch.

before:
   text	   data	    bss	    dec	    hex	filename
   5314	   4648	      0	   9962	   26ea	fs/iomap/trace.o
   1050	     72	      0	   1122	    462	fs/iomap/apply.o
  17316	    636	    224	  18176	   4700	fs/iomap/buffered-io.o
   4773	     76	      0	   4849	   12f1	fs/iomap/direct-io.o
   1335	     28	      0	   1363	    553	fs/iomap/fiemap.o
   1928	     28	      0	   1956	    7a4	fs/iomap/seek.o
   1394	      8	      0	   1402	    57a	fs/iomap/swapfile.o

after:
   text	   data	    bss	    dec	    hex	filename
   5314	   4648	      0	   9962	   26ea	fs/iomap/trace.o
    722	     72	      0	    794	    31a	fs/iomap/apply.o
  18784	    636	    224	  19644	   4cbc	fs/iomap/buffered-io.o
   5169	     76	      0	   5245	   147d	fs/iomap/direct-io.o
   2093	     28	      0	   2121	    849	fs/iomap/fiemap.o
   2467	     28	      0	   2495	    9bf	fs/iomap/seek.o
   1664	      8	      0	   1672	    688	fs/iomap/swapfile.o

33110 to 36213 bytes of text.  So not free.  Worthwhile?
diff mbox series

Patch

diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c
index 76925b40b5fd..c747cbf66a3d 100644
--- a/fs/iomap/apply.c
+++ b/fs/iomap/apply.c
@@ -9,24 +9,11 @@ 
 #include <linux/iomap.h>
 #include "trace.h"
 
-/*
- * Execute a iomap write on a segment of the mapping that spans a
- * contiguous range of pages that have identical block mapping state.
- *
- * This avoids the need to map pages individually, do individual allocations
- * for each page and most importantly avoid the need for filesystem specific
- * locking per page. Instead, all the operations are amortised over the entire
- * range of pages. It is assumed that the filesystems will lock whatever
- * resources they require in the iomap_begin call, and release them in the
- * iomap_end call.
- */
-loff_t
-iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
-		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
+loff_t __iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+		unsigned flags, struct iomap *iomap, struct iomap *srcmap,
+		const struct iomap_ops *ops, iomap_actor_t actor)
 {
-	struct iomap iomap = { .type = IOMAP_HOLE };
-	struct iomap srcmap = { .type = IOMAP_HOLE };
-	loff_t written = 0, ret;
+	loff_t ret;
 	u64 end;
 
 	trace_iomap_apply(inode, pos, length, flags, ops, actor, _RET_IP_);
@@ -43,52 +30,26 @@  iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
 	 * expose transient stale data. If the reserve fails, we can safely
 	 * back out at this point as there is nothing to undo.
 	 */
-	ret = ops->iomap_begin(inode, pos, length, flags, &iomap, &srcmap);
+	ret = ops->iomap_begin(inode, pos, length, flags, iomap, srcmap);
 	if (ret)
 		return ret;
-	if (WARN_ON(iomap.offset > pos))
+	if (WARN_ON(iomap->offset > pos))
 		return -EIO;
-	if (WARN_ON(iomap.length == 0))
+	if (WARN_ON(iomap->length == 0))
 		return -EIO;
 
-	trace_iomap_apply_dstmap(inode, &iomap);
-	if (srcmap.type != IOMAP_HOLE)
-		trace_iomap_apply_srcmap(inode, &srcmap);
+	trace_iomap_apply_dstmap(inode, iomap);
+	if (srcmap->type != IOMAP_HOLE)
+		trace_iomap_apply_srcmap(inode, srcmap);
 
 	/*
 	 * Cut down the length to the one actually provided by the filesystem,
 	 * as it might not be able to give us the whole size that we requested.
 	 */
-	end = iomap.offset + iomap.length;
-	if (srcmap.type != IOMAP_HOLE)
-		end = min(end, srcmap.offset + srcmap.length);
+	end = iomap->offset + iomap->length;
+	if (srcmap->type != IOMAP_HOLE)
+		end = min(end, srcmap->offset + srcmap->length);
 	if (pos + length > end)
 		length = end - pos;
-
-	/*
-	 * Now that we have guaranteed that the space allocation will succeed,
-	 * we can do the copy-in page by page without having to worry about
-	 * failures exposing transient data.
-	 *
-	 * To support COW operations, we read in data for partially blocks from
-	 * the srcmap if the file system filled it in.  In that case we the
-	 * length needs to be limited to the earlier of the ends of the iomaps.
-	 * If the file system did not provide a srcmap we pass in the normal
-	 * iomap into the actors so that they don't need to have special
-	 * handling for the two cases.
-	 */
-	written = actor(inode, pos, length, data, &iomap,
-			srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);
-
-	/*
-	 * Now the data has been copied, commit the range we've copied.  This
-	 * should not fail unless the filesystem has had a fatal error.
-	 */
-	if (ops->iomap_end) {
-		ret = ops->iomap_end(inode, pos, length,
-				     written > 0 ? written : 0,
-				     flags, &iomap);
-	}
-
-	return written ? written : ret;
+	return length;
 }
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 8b09463dae0d..31e82e4d30f8 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -148,9 +148,62 @@  struct iomap_ops {
 typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len,
 		void *data, struct iomap *iomap, struct iomap *srcmap);
 
-loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length,
-		unsigned flags, const struct iomap_ops *ops, void *data,
-		iomap_actor_t actor);
+loff_t __iomap_apply(struct inode *inode, loff_t pos, loff_t length,
+		unsigned flags, struct iomap *iomap, struct iomap *srcmap,
+		const struct iomap_ops *ops, iomap_actor_t actor);
+
+/*
+ * Execute an iomap write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ *
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the iomap_begin call, and release them in the
+ * iomap_end call.
+ */
+static inline loff_t
+iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
+		const struct iomap_ops *ops, void *data, iomap_actor_t actor)
+{
+	struct iomap iomap = { .type = IOMAP_HOLE };
+	struct iomap srcmap = { .type = IOMAP_HOLE };
+	loff_t written;
+
+	length = __iomap_apply(inode, pos, length, flags, &iomap, &srcmap,
+				ops, actor);
+	/*
+	 * Now that we have guaranteed that the space allocation will succeed,
+	 * we can do the copy-in page by page without having to worry about
+	 * failures exposing transient data.
+	 *
+ * To support COW operations, we read in data for partial blocks from
+ * the srcmap if the file system filled it in.  In that case the length
+ * needs to be limited to the earlier of the ends of the iomaps.
+	 * If the file system did not provide a srcmap we pass in the normal
+	 * iomap into the actors so that they don't need to have special
+	 * handling for the two cases.
+	 */
+	if (length < 0)
+		return length;
+
+	written = actor(inode, pos, length, data, &iomap,
+			srcmap.type != IOMAP_HOLE ? &srcmap : &iomap);
+	/*
+	 * Now the data has been copied, commit the range we've copied.  This
+	 * should not fail unless the filesystem has had a fatal error.
+	 */
+	if (ops->iomap_end) {
+		loff_t ret = ops->iomap_end(inode, pos, length,
+				     written > 0 ? written : 0,
+				     flags, &iomap);
+		if (!written)
+			written = ret;
+	}
+
+	return written;
+}
 
 ssize_t iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *from,
 		const struct iomap_ops *ops);