diff mbox

[13/13] orangefs: implement write through the page cache

Message ID 1495447141-12216-14-git-send-email-martin@omnibond.com (mailing list archive)
State New, archived
Headers show

Commit Message

Martin Brandenburg May 22, 2017, 9:59 a.m. UTC
With this and the last commit, OrangeFS is capable of writing through
the page cache.  This should significantly increase performance of very
small writes, since writeback will not be done for every write call.

However it is not appropriate for use with multiple clients due to the
long writeback delay.

Signed-off-by: Martin Brandenburg <martin@omnibond.com>
---
 fs/orangefs/file.c | 128 +++++++++++++++++++++++------------------------------
 1 file changed, 56 insertions(+), 72 deletions(-)

Comments

Jeff Layton May 25, 2017, 4:09 p.m. UTC | #1
On Mon, 2017-05-22 at 05:59 -0400, Martin Brandenburg wrote:
> With this and the last commit, OrangeFS is capable of writing through
> the page cache.  This should significantly increase performance of very
> small writes, since writeback will not be done for every write call.
> 
> However it is not appropriate for use with multiple clients due to the
> long writeback delay.
> 
> Signed-off-by: Martin Brandenburg <martin@omnibond.com>
> ---
>  fs/orangefs/file.c | 128 +++++++++++++++++++++++------------------------------
>  1 file changed, 56 insertions(+), 72 deletions(-)
> 
> diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
> index c03deea..3ab0a1f 100644
> --- a/fs/orangefs/file.c
> +++ b/fs/orangefs/file.c
> @@ -448,69 +448,11 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
>  	return generic_file_read_iter(iocb, iter);
>  }
>  
> -static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
> +static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
> +    struct iov_iter *iter)
>  {
> -	struct file *file = iocb->ki_filp;
> -	loff_t pos;
> -	ssize_t rc;
> -
> -	BUG_ON(iocb->private);
> -
> -	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
> -
> -	inode_lock(file->f_mapping->host);
> -
> -	/* Make sure generic_write_checks sees an up to date inode size. */
> -	if (file->f_flags & O_APPEND) {
> -		rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
> -		    STATX_SIZE);
> -		if (rc == -ESTALE)
> -			rc = -EIO;
> -		if (rc) {
> -			gossip_err("%s: orangefs_inode_getattr failed, "
> -			    "rc:%zd:.\n", __func__, rc);
> -			goto out;
> -		}
> -	}
> -
> -	if (file->f_pos > i_size_read(file->f_mapping->host))
> -		orangefs_i_size_write(file->f_mapping->host, file->f_pos);
> -
> -	rc = generic_write_checks(iocb, iter);
> -
> -	if (rc <= 0) {
> -		gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
> -			   __func__, rc);
> -		goto out;
> -	}
> -
> -	/*
> -	 * if we are appending, generic_write_checks would have updated
> -	 * pos to the end of the file, so we will wait till now to set
> -	 * pos...
> -	 */
> -	pos = *(&iocb->ki_pos);
> -
> -	rc = do_readv_writev(ORANGEFS_IO_WRITE,
> -			     file,
> -			     &pos,
> -			     iter);
> -	if (rc < 0) {
> -		gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
> -			   __func__, rc);
> -		goto out;
> -	}
> -
> -	iocb->ki_pos = pos;
>  	orangefs_stats.writes++;
> -
> -	if (pos > i_size_read(file->f_mapping->host))
> -		orangefs_i_size_write(file->f_mapping->host, pos);
> -
> -out:
> -
> -	inode_unlock(file->f_mapping->host);
> -	return rc;
> +	return generic_file_write_iter(iocb, iter);
>  }
>  
>  /*
> @@ -606,9 +548,8 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
>  	orangefs_flush_inode(inode);
>  
>  	/*
> -	 * remove all associated inode pages from the page cache and
> -	 * readahead cache (if any); this forces an expensive refresh of
> -	 * data for the next caller of mmap (or 'get_block' accesses)
> +	 * remove all associated inode pages from the readahead cache
> +	 * (if any)
>  	 */
>  	if (file_inode(file) &&
>  	    file_inode(file)->i_mapping &&
> @@ -621,8 +562,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
>  			gossip_debug(GOSSIP_INODE_DEBUG,
>  			    "flush_racache finished\n");
>  		}
> -		truncate_inode_pages(file_inode(file)->i_mapping,
> -				     0);
>  	}
>  	return 0;
>  }
> @@ -741,6 +680,40 @@ const struct file_operations orangefs_file_operations = {
>  	.fsync		= orangefs_fsync,
>  };
>  
> +static int orangefs_writepage(struct page *page,
> +    struct writeback_control *wbc)
> +{
> +	struct inode *inode = page->mapping->host;
> +	struct iov_iter iter;
> +	struct iovec iov;
> +	loff_t off;
> +	size_t len;
> +	ssize_t r;
> +	void *map;
> +
> +	off = page_offset(page);
> +	len = i_size_read(inode);
> +	if (off + PAGE_SIZE > len)
> +		len = len - off;
> +	else
> +		len = PAGE_SIZE;
> +
> +	map = kmap_atomic(page);
> +	iov.iov_base = map;
> +	iov.iov_len = len;
> +	iov_iter_init(&iter, WRITE, &iov, 1, len);
> +
> +	set_page_writeback(page);
> +

wait_for_direct_io can sleep, so you can't use kmap_atomic there.
Regular old kmap should be ok there though.

Also, you probably really don't want to use iovecs there, as those are
expected to deal in userland addresses and the kmap address won't be
one.

It may be cleaner to use a bio_vec there instead. You most likely
wouldn't need to kmap at all if you did that.

> +	r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter,
> +	    len, 0);
> +	kunmap_atomic(map);
> +

When writeback fails for some reason, you'll also want to call
mapping_set_error to help ensure that those errors get reported (unless
you're tracking them on your own somehow). I don't see where that's
being done in a cursory glance at these patches, but I could have missed
it.

> +	end_page_writeback(page);
> +	unlock_page(page);
> +	return 0;
> +}
> +
> static int orangefs_readpage(struct file *file, struct page *page)
>  {
>  	int ret;
> @@ -786,6 +759,17 @@ static int orangefs_readpage(struct file *file, struct page *page)
>  	return ret;
>  }
>  
> +static int orangefs_write_end(struct file *file,
> +    struct address_space *mapping, loff_t pos, unsigned len,
> +    unsigned copied, struct page *page, void *fsdata)
> +{
> +	int r;
> +	r = simple_write_end(file, mapping, pos, len, copied, page,
> +	    fsdata);
> +	mark_inode_dirty_sync(file_inode(file));
> +	return r;
> +}
> +
>  static void orangefs_invalidatepage(struct page *page,
>  				 unsigned int offset,
>  				 unsigned int length)
> @@ -815,17 +799,17 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
>  {
>  	struct file *file = iocb->ki_filp;
>  	loff_t pos = *(&iocb->ki_pos);
> -	/*
> -	 * This cannot happen until write_iter becomes
> -	 * generic_file_write_iter.
> -	 */
> -	BUG_ON(iov_iter_rw(iter) != READ);
> -	return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
> +	return do_readv_writev(iocb->ki_flags & IOCB_WRITE ?
> +	    ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter);
>  }
>  
>  /** ORANGEFS2 implementation of address space operations */
>  const struct address_space_operations orangefs_address_operations = {
> +	.writepage = orangefs_writepage,
>  	.readpage = orangefs_readpage,
> +	.set_page_dirty = __set_page_dirty_nobuffers,
> +	.write_begin = simple_write_begin,
> +	.write_end = orangefs_write_end,
>  	.invalidatepage = orangefs_invalidatepage,
>  	.releasepage = orangefs_releasepage,
>  	.direct_IO = orangefs_direct_IO,
Martin Brandenburg May 26, 2017, 6:09 p.m. UTC | #2
On Thu, May 25, 2017 at 12:09:49PM -0400, Jeff Layton wrote:
> On Mon, 2017-05-22 at 05:59 -0400, Martin Brandenburg wrote:
> > With this and the last commit, OrangeFS is capable of writing through
> > the page cache.  This should significantly increase performance of very
> > small writes, since writeback will not be done for every write call.
> > 
> > However it is not appropriate for use with multiple clients due to the
> > long writeback delay.
> > 
> > Signed-off-by: Martin Brandenburg <martin@omnibond.com>
> > ---
> >  fs/orangefs/file.c | 128 +++++++++++++++++++++++------------------------------
> >  1 file changed, 56 insertions(+), 72 deletions(-)
> > 
> > diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
> > index c03deea..3ab0a1f 100644
> > --- a/fs/orangefs/file.c
> > +++ b/fs/orangefs/file.c
> > @@ -448,69 +448,11 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
> >  	return generic_file_read_iter(iocb, iter);
> >  }
> >  
> > -static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
> > +static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
> > +    struct iov_iter *iter)
> >  {
> > -	struct file *file = iocb->ki_filp;
> > -	loff_t pos;
> > -	ssize_t rc;
> > -
> > -	BUG_ON(iocb->private);
> > -
> > -	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
> > -
> > -	inode_lock(file->f_mapping->host);
> > -
> > -	/* Make sure generic_write_checks sees an up to date inode size. */
> > -	if (file->f_flags & O_APPEND) {
> > -		rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
> > -		    STATX_SIZE);
> > -		if (rc == -ESTALE)
> > -			rc = -EIO;
> > -		if (rc) {
> > -			gossip_err("%s: orangefs_inode_getattr failed, "
> > -			    "rc:%zd:.\n", __func__, rc);
> > -			goto out;
> > -		}
> > -	}
> > -
> > -	if (file->f_pos > i_size_read(file->f_mapping->host))
> > -		orangefs_i_size_write(file->f_mapping->host, file->f_pos);
> > -
> > -	rc = generic_write_checks(iocb, iter);
> > -
> > -	if (rc <= 0) {
> > -		gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
> > -			   __func__, rc);
> > -		goto out;
> > -	}
> > -
> > -	/*
> > -	 * if we are appending, generic_write_checks would have updated
> > -	 * pos to the end of the file, so we will wait till now to set
> > -	 * pos...
> > -	 */
> > -	pos = *(&iocb->ki_pos);
> > -
> > -	rc = do_readv_writev(ORANGEFS_IO_WRITE,
> > -			     file,
> > -			     &pos,
> > -			     iter);
> > -	if (rc < 0) {
> > -		gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
> > -			   __func__, rc);
> > -		goto out;
> > -	}
> > -
> > -	iocb->ki_pos = pos;
> >  	orangefs_stats.writes++;
> > -
> > -	if (pos > i_size_read(file->f_mapping->host))
> > -		orangefs_i_size_write(file->f_mapping->host, pos);
> > -
> > -out:
> > -
> > -	inode_unlock(file->f_mapping->host);
> > -	return rc;
> > +	return generic_file_write_iter(iocb, iter);
> >  }
> >  
> >  /*
> > @@ -606,9 +548,8 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
> >  	orangefs_flush_inode(inode);
> >  
> >  	/*
> > -	 * remove all associated inode pages from the page cache and
> > -	 * readahead cache (if any); this forces an expensive refresh of
> > -	 * data for the next caller of mmap (or 'get_block' accesses)
> > +	 * remove all associated inode pages from the readahead cache
> > +	 * (if any)
> >  	 */
> >  	if (file_inode(file) &&
> >  	    file_inode(file)->i_mapping &&
> > @@ -621,8 +562,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
> >  			gossip_debug(GOSSIP_INODE_DEBUG,
> >  			    "flush_racache finished\n");
> >  		}
> > -		truncate_inode_pages(file_inode(file)->i_mapping,
> > -				     0);
> >  	}
> >  	return 0;
> >  }
> > @@ -741,6 +680,40 @@ const struct file_operations orangefs_file_operations = {
> >  	.fsync		= orangefs_fsync,
> >  };
> >  
> > +static int orangefs_writepage(struct page *page,
> > +    struct writeback_control *wbc)
> > +{
> > +	struct inode *inode = page->mapping->host;
> > +	struct iov_iter iter;
> > +	struct iovec iov;
> > +	loff_t off;
> > +	size_t len;
> > +	ssize_t r;
> > +	void *map;
> > +
> > +	off = page_offset(page);
> > +	len = i_size_read(inode);
> > +	if (off + PAGE_SIZE > len)
> > +		len = len - off;
> > +	else
> > +		len = PAGE_SIZE;
> > +
> > +	map = kmap_atomic(page);
> > +	iov.iov_base = map;
> > +	iov.iov_len = len;
> > +	iov_iter_init(&iter, WRITE, &iov, 1, len);
> > +
> > +	set_page_writeback(page);
> > +
> 
> wait_for_direct_io can sleep, so you can't use kmap_atomic there.
> Regular old kmap should be ok there though.
> 
> Also, you probably really don't want to use iovecs there, as those are
> expected to deal in userland addresses and the kmap address won't be
> one.
> 
> It may be cleaner to use a bio_vec there instead. You most likely
> wouldn't need to kmap at all if you did that.

I tried that since that's what readpage does.

	static int orangefs_writepage(struct page *page,
	    struct writeback_control *wbc)
	{
		struct inode *inode = page->mapping->host;
		struct iov_iter iter;
		struct bio_vec bv;
		loff_t off;
		size_t len;
		ssize_t r;

		off = page_offset(page);
		len = i_size_read(inode);
		if (off + PAGE_SIZE > len)
			len = len - off;
		else
			len = PAGE_SIZE;

		bv.bv_page = page;
		bv.bv_len = len;
		iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, len);

		set_page_writeback(page);

		r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter,
		   len, 0);
		printk("orangefs_writepage wait_for_direct_io returned %d\n", (int)r);

		end_page_writeback(page);
		unlock_page(page);
		return 0;
	}

But this produced:

	[  125.739396] BUG: unable to handle kernel paging request at ffff880303833802
	[  125.739429] IP: __memcpy+0x12/0x20
	[  125.739439] PGD 2ec7067 
	[  125.739441] PUD 0 

	[  125.739463] Oops: 0000 [#1] SMP
	[  125.739474] Modules linked in: nfsd auth_rpcgss oid_registry nfs_acl snd_hda_codec_realtek nfs snd_hda_codec_generic lockd grace fscache sunrpc adt7475 hwmon_vid nouveau video mxm_wmi wmi coretemp i2c_algo_bit kvm_intel drm_kms_helper ttm drm kvm snd_hda_intel snd_hda_codec irqbypass snd_hwdep evdev iTCO_wdt dcdbas iTCO_vendor_support pcspkr snd_hda_core serio_raw i2c_i801 snd_pcm snd_timer lpc_ich mfd_core i2c_core shpchp snd soundcore button acpi_cpufreq autofs4 ext4 crc16 jbd2 mbcache hid_generic usbhid hid sg sr_mod cdrom sd_mod usb_storage ata_generic ata_piix libata psmouse firewire_ohci scsi_mod r8169 firewire_core ehci_pci mii uhci_hcd ehci_hcd crc_itu_t usbcore usb_common
	[  125.739683] CPU: 1 PID: 111 Comm: kworker/u8:2 Not tainted 4.11.0+ #66
	[  125.739697] Hardware name: Dell Inc. Studio 540      /0M017G, BIOS 1.1.1 07/24/2009
	[  125.739718] Workqueue: writeback wb_workfn (flush-orangefs-1)
	[  125.739734] task: ffff88021d9bc100 task.stack: ffffc900016d8000
	[  125.739748] RIP: 0010:__memcpy+0x12/0x20
	[  125.739759] RSP: 0018:ffffc900016db800 EFLAGS: 00010246
	[  125.739773] RAX: ffff88020693d000 RBX: 0000000000001000 RCX: 0000000000000200
	[  125.739788] RDX: 0000000000000000 RSI: ffff880303833802 RDI: ffff88020693d000
	[  125.739803] RBP: ffffc900016db808 R08: 0000160000000000 R09: 00000000ffff8802
	[  125.739818] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000001000
	[  125.739834] R13: ffffc900016db990 R14: 0000000000000000 R15: ffff88020693e000
	[  125.739849] FS:  0000000000000000(0000) GS:ffff880227000000(0000) knlGS:0000000000000000
	[  125.739866] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
	[  125.739879] CR2: ffff880303833802 CR3: 0000000220d97000 CR4: 00000000000406e0
	[  125.739895] Call Trace:
	[  125.739906]  ? memcpy_from_page+0x46/0x80
	[  125.739918]  copy_from_iter+0x1f1/0x380
	[  125.739929]  ? get+0x55/0x2a0
	[  125.739942]  copy_page_from_iter+0x166/0x250
	[  125.739957]  orangefs_bufmap_copy_from_iovec+0x77/0xc0
	[  125.739971]  wait_for_direct_io+0x1cb/0x500
	[  125.739987]  orangefs_writepage+0x76/0xb0
	[  125.740002]  __writepage+0x16/0x40
	[  125.740014]  write_cache_pages+0x266/0x5c0
	[  125.740027]  ? sched_clock_local+0x17/0x90
	[  125.740040]  ? wb_position_ratio+0x1f0/0x1f0
	[  125.740060]  generic_writepages+0x52/0x70
	[  125.740074]  do_writepages+0x2b/0x30
	[  125.740084]  ? do_writepages+0x2b/0x30
	[  125.740096]  __writeback_single_inode+0x61/0x6d0
	[  125.740109]  ? _raw_spin_unlock+0x27/0x30
	[  125.740122]  writeback_sb_inodes+0x2e4/0x680
	[  125.740141]  __writeback_inodes_wb+0x8f/0xc0
	[  125.740155]  wb_writeback+0x32c/0x4f0
	[  125.740171]  wb_workfn+0x9c/0x560
	[  125.740181]  ? wb_workfn+0x9c/0x560
	[  125.740194]  ? process_one_work+0x164/0x650
	[  125.740210]  process_one_work+0x1e1/0x650
	[  125.740225]  worker_thread+0x69/0x4a0
	[  125.740240]  kthread+0x127/0x160
	[  125.740250]  ? process_one_work+0x650/0x650
	[  125.740262]  ? kthread_create_on_node+0x40/0x40
	[  125.740275]  ret_from_fork+0x31/0x40
	[  125.740291] Code: 75 05 e8 72 fb ff ff 48 8b 43 60 48 2b 43 50 88 43 4e 5b 5d f3 c3 90 90 90 66 66 90 66 90 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 <f3> 48 a5 89 d1 f3 a4 c3 66 0f 1f 44 00 00 48 89 f8 48 89 d1 f3 
	[  125.740413] RIP: __memcpy+0x12/0x20 RSP: ffffc900016db800
	[  125.740425] CR2: ffff880303833802
	[  125.740437] ---[ end trace 844fdaac4da7b68d ]---

> 
> > +	r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter,
> > +	    len, 0);
> > +	kunmap_atomic(map);
> > +
> 
> When writeback fails for some reason, you'll also want to call
> mapping_set_error to help ensure that those errors get reported (unless
> you're tracking them on your own somehow). I don't see where that's
> being done in a cursory glance at these patches, but I could have missed
> it.
> 
> > +	end_page_writeback(page);
> > +	unlock_page(page);
> > +	return 0;
> > +}
> > +
> > static int orangefs_readpage(struct file *file, struct page *page)
> >  {
> >  	int ret;
> > @@ -786,6 +759,17 @@ static int orangefs_readpage(struct file *file, struct page *page)
> >  	return ret;
> >  }
> >  
> > +static int orangefs_write_end(struct file *file,
> > +    struct address_space *mapping, loff_t pos, unsigned len,
> > +    unsigned copied, struct page *page, void *fsdata)
> > +{
> > +	int r;
> > +	r = simple_write_end(file, mapping, pos, len, copied, page,
> > +	    fsdata);
> > +	mark_inode_dirty_sync(file_inode(file));
> > +	return r;
> > +}
> > +
> >  static void orangefs_invalidatepage(struct page *page,
> >  				 unsigned int offset,
> >  				 unsigned int length)
> > @@ -815,17 +799,17 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
> >  {
> >  	struct file *file = iocb->ki_filp;
> >  	loff_t pos = *(&iocb->ki_pos);
> > -	/*
> > -	 * This cannot happen until write_iter becomes
> > -	 * generic_file_write_iter.
> > -	 */
> > -	BUG_ON(iov_iter_rw(iter) != READ);
> > -	return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
> > +	return do_readv_writev(iocb->ki_flags & IOCB_WRITE ?
> > +	    ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter);
> >  }
> >  
> >  /** ORANGEFS2 implementation of address space operations */
> >  const struct address_space_operations orangefs_address_operations = {
> > +	.writepage = orangefs_writepage,
> >  	.readpage = orangefs_readpage,
> > +	.set_page_dirty = __set_page_dirty_nobuffers,
> > +	.write_begin = simple_write_begin,
> > +	.write_end = orangefs_write_end,
> >  	.invalidatepage = orangefs_invalidatepage,
> >  	.releasepage = orangefs_releasepage,
> >  	.direct_IO = orangefs_direct_IO,
> 
> -- 
> Jeff Layton <jlayton@redhat.com>
Jeff Layton May 26, 2017, 6:48 p.m. UTC | #3
On Fri, 2017-05-26 at 14:09 -0400, martin@omnibond.com wrote:
> On Thu, May 25, 2017 at 12:09:49PM -0400, Jeff Layton wrote:
> > On Mon, 2017-05-22 at 05:59 -0400, Martin Brandenburg wrote:
> > > With this and the last commit, OrangeFS is capable of writing through
> > > the page cache.  This should significantly increase performance of very
> > > small writes, since writeback will not be done for every write call.
> > > 
> > > However it is not appropriate for use with multiple clients due to the
> > > long writeback delay.
> > > 
> > > Signed-off-by: Martin Brandenburg <martin@omnibond.com>
> > > ---
> > >  fs/orangefs/file.c | 128 +++++++++++++++++++++++------------------------------
> > >  1 file changed, 56 insertions(+), 72 deletions(-)
> > > 
> > > diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
> > > index c03deea..3ab0a1f 100644
> > > --- a/fs/orangefs/file.c
> > > +++ b/fs/orangefs/file.c
> > > @@ -448,69 +448,11 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
> > >  	return generic_file_read_iter(iocb, iter);
> > >  }
> > >  
> > > -static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
> > > +static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
> > > +    struct iov_iter *iter)
> > >  {
> > > -	struct file *file = iocb->ki_filp;
> > > -	loff_t pos;
> > > -	ssize_t rc;
> > > -
> > > -	BUG_ON(iocb->private);
> > > -
> > > -	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
> > > -
> > > -	inode_lock(file->f_mapping->host);
> > > -
> > > -	/* Make sure generic_write_checks sees an up to date inode size. */
> > > -	if (file->f_flags & O_APPEND) {
> > > -		rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
> > > -		    STATX_SIZE);
> > > -		if (rc == -ESTALE)
> > > -			rc = -EIO;
> > > -		if (rc) {
> > > -			gossip_err("%s: orangefs_inode_getattr failed, "
> > > -			    "rc:%zd:.\n", __func__, rc);
> > > -			goto out;
> > > -		}
> > > -	}
> > > -
> > > -	if (file->f_pos > i_size_read(file->f_mapping->host))
> > > -		orangefs_i_size_write(file->f_mapping->host, file->f_pos);
> > > -
> > > -	rc = generic_write_checks(iocb, iter);
> > > -
> > > -	if (rc <= 0) {
> > > -		gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
> > > -			   __func__, rc);
> > > -		goto out;
> > > -	}
> > > -
> > > -	/*
> > > -	 * if we are appending, generic_write_checks would have updated
> > > -	 * pos to the end of the file, so we will wait till now to set
> > > -	 * pos...
> > > -	 */
> > > -	pos = *(&iocb->ki_pos);
> > > -
> > > -	rc = do_readv_writev(ORANGEFS_IO_WRITE,
> > > -			     file,
> > > -			     &pos,
> > > -			     iter);
> > > -	if (rc < 0) {
> > > -		gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
> > > -			   __func__, rc);
> > > -		goto out;
> > > -	}
> > > -
> > > -	iocb->ki_pos = pos;
> > >  	orangefs_stats.writes++;
> > > -
> > > -	if (pos > i_size_read(file->f_mapping->host))
> > > -		orangefs_i_size_write(file->f_mapping->host, pos);
> > > -
> > > -out:
> > > -
> > > -	inode_unlock(file->f_mapping->host);
> > > -	return rc;
> > > +	return generic_file_write_iter(iocb, iter);
> > >  }
> > >  
> > >  /*
> > > @@ -606,9 +548,8 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
> > >  	orangefs_flush_inode(inode);
> > >  
> > >  	/*
> > > -	 * remove all associated inode pages from the page cache and
> > > -	 * readahead cache (if any); this forces an expensive refresh of
> > > -	 * data for the next caller of mmap (or 'get_block' accesses)
> > > +	 * remove all associated inode pages from the readahead cache
> > > +	 * (if any)
> > >  	 */
> > >  	if (file_inode(file) &&
> > >  	    file_inode(file)->i_mapping &&
> > > @@ -621,8 +562,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
> > >  			gossip_debug(GOSSIP_INODE_DEBUG,
> > >  			    "flush_racache finished\n");
> > >  		}
> > > -		truncate_inode_pages(file_inode(file)->i_mapping,
> > > -				     0);
> > >  	}
> > >  	return 0;
> > >  }
> > > @@ -741,6 +680,40 @@ const struct file_operations orangefs_file_operations = {
> > >  	.fsync		= orangefs_fsync,
> > >  };
> > >  
> > > +static int orangefs_writepage(struct page *page,
> > > +    struct writeback_control *wbc)
> > > +{
> > > +	struct inode *inode = page->mapping->host;
> > > +	struct iov_iter iter;
> > > +	struct iovec iov;
> > > +	loff_t off;
> > > +	size_t len;
> > > +	ssize_t r;
> > > +	void *map;
> > > +
> > > +	off = page_offset(page);
> > > +	len = i_size_read(inode);
> > > +	if (off + PAGE_SIZE > len)
> > > +		len = len - off;
> > > +	else
> > > +		len = PAGE_SIZE;
> > > +
> > > +	map = kmap_atomic(page);
> > > +	iov.iov_base = map;
> > > +	iov.iov_len = len;
> > > +	iov_iter_init(&iter, WRITE, &iov, 1, len);
> > > +
> > > +	set_page_writeback(page);
> > > +
> > 
> > wait_for_direct_io can sleep, so you can't use kmap_atomic there.
> > Regular old kmap should be ok there though.
> > 
> > Also, you probably really don't want to use iovecs there, as those are
> > expected to deal in userland addresses and the kmap address won't be
> > one.
> > 
> > It may be cleaner to use a bio_vec there instead. You most likely
> > wouldn't need to kmap at all if you did that.
> 
> I tried that since that's what readpage does.
> 
> 	static int orangefs_writepage(struct page *page,
> 	    struct writeback_control *wbc)
> 	{
> 		struct inode *inode = page->mapping->host;
> 		struct iov_iter iter;
> 		struct bio_vec bv;
> 		loff_t off;
> 		size_t len;
> 		ssize_t r;
> 
> 		off = page_offset(page);
> 		len = i_size_read(inode);
> 		if (off + PAGE_SIZE > len)
> 			len = len - off;
> 		else
> 			len = PAGE_SIZE;
> 
> 		bv.bv_page = page;
> 		bv.bv_len = len;

You didn't set bv.bv_offset there.

> 		iov_iter_bvec(&iter, ITER_BVEC | WRITE, &bv, 1, len);
> 
> 		set_page_writeback(page);
> 
> 		r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter,
> 		   len, 0);
> 		printk("orangefs_writepage wait_for_direct_io returned %d\n", (int)r);
> 
> 		end_page_writeback(page);
> 		unlock_page(page);
> 		return 0;
> 	}
> 
> But this produced:
> 
> 	[  125.739396] BUG: unable to handle kernel paging request at ffff880303833802
> 	[  125.739429] IP: __memcpy+0x12/0x20
> 	[  125.739439] PGD 2ec7067 
> 	[  125.739441] PUD 0 
> 
> 	[  125.739463] Oops: 0000 [#1] SMP
> 	[  125.739474] Modules linked in: nfsd auth_rpcgss oid_registry nfs_acl snd_hda_codec_realtek nfs snd_hda_codec_generic lockd grace fscache sunrpc adt7475 hwmon_vid nouveau video mxm_wmi wmi coretemp i2c_algo_bit kvm_intel drm_kms_helper ttm drm kvm snd_hda_intel snd_hda_codec irqbypass snd_hwdep evdev iTCO_wdt dcdbas iTCO_vendor_support pcspkr snd_hda_core serio_raw i2c_i801 snd_pcm snd_timer lpc_ich mfd_core i2c_core shpchp snd soundcore button acpi_cpufreq autofs4 ext4 crc16 jbd2 mbcache hid_generic usbhid hid sg sr_mod cdrom sd_mod usb_storage ata_generic ata_piix libata psmouse firewire_ohci scsi_mod r8169 firewire_core ehci_pci mii uhci_hcd ehci_hcd crc_itu_t usbcore usb_common
> 	[  125.739683] CPU: 1 PID: 111 Comm: kworker/u8:2 Not tainted 4.11.0+ #66
> 	[  125.739697] Hardware name: Dell Inc. Studio 540      /0M017G, BIOS 1.1.1 07/24/2009
> 	[  125.739718] Workqueue: writeback wb_workfn (flush-orangefs-1)
> 	[  125.739734] task: ffff88021d9bc100 task.stack: ffffc900016d8000
> 	[  125.739748] RIP: 0010:__memcpy+0x12/0x20
> 	[  125.739759] RSP: 0018:ffffc900016db800 EFLAGS: 00010246
> 	[  125.739773] RAX: ffff88020693d000 RBX: 0000000000001000 RCX: 0000000000000200
> 	[  125.739788] RDX: 0000000000000000 RSI: ffff880303833802 RDI: ffff88020693d000
> 	[  125.739803] RBP: ffffc900016db808 R08: 0000160000000000 R09: 00000000ffff8802
> 	[  125.739818] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000001000
> 	[  125.739834] R13: ffffc900016db990 R14: 0000000000000000 R15: ffff88020693e000
> 	[  125.739849] FS:  0000000000000000(0000) GS:ffff880227000000(0000) knlGS:0000000000000000
> 	[  125.739866] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> 	[  125.739879] CR2: ffff880303833802 CR3: 0000000220d97000 CR4: 00000000000406e0
> 	[  125.739895] Call Trace:
> 	[  125.739906]  ? memcpy_from_page+0x46/0x80
> 	[  125.739918]  copy_from_iter+0x1f1/0x380
> 	[  125.739929]  ? get+0x55/0x2a0
> 	[  125.739942]  copy_page_from_iter+0x166/0x250
> 	[  125.739957]  orangefs_bufmap_copy_from_iovec+0x77/0xc0
> 	[  125.739971]  wait_for_direct_io+0x1cb/0x500
> 	[  125.739987]  orangefs_writepage+0x76/0xb0
> 	[  125.740002]  __writepage+0x16/0x40
> 	[  125.740014]  write_cache_pages+0x266/0x5c0
> 	[  125.740027]  ? sched_clock_local+0x17/0x90
> 	[  125.740040]  ? wb_position_ratio+0x1f0/0x1f0
> 	[  125.740060]  generic_writepages+0x52/0x70
> 	[  125.740074]  do_writepages+0x2b/0x30
> 	[  125.740084]  ? do_writepages+0x2b/0x30
> 	[  125.740096]  __writeback_single_inode+0x61/0x6d0
> 	[  125.740109]  ? _raw_spin_unlock+0x27/0x30
> 	[  125.740122]  writeback_sb_inodes+0x2e4/0x680
> 	[  125.740141]  __writeback_inodes_wb+0x8f/0xc0
> 	[  125.740155]  wb_writeback+0x32c/0x4f0
> 	[  125.740171]  wb_workfn+0x9c/0x560
> 	[  125.740181]  ? wb_workfn+0x9c/0x560
> 	[  125.740194]  ? process_one_work+0x164/0x650
> 	[  125.740210]  process_one_work+0x1e1/0x650
> 	[  125.740225]  worker_thread+0x69/0x4a0
> 	[  125.740240]  kthread+0x127/0x160
> 	[  125.740250]  ? process_one_work+0x650/0x650
> 	[  125.740262]  ? kthread_create_on_node+0x40/0x40
> 	[  125.740275]  ret_from_fork+0x31/0x40
> 	[  125.740291] Code: 75 05 e8 72 fb ff ff 48 8b 43 60 48 2b 43 50 88 43 4e 5b 5d f3 c3 90 90 90 66 66 90 66 90 48 89 f8 48 89 d1 48 c1 e9 03 83 e2 07 <f3> 48 a5 89 d1 f3 a4 c3 66 0f 1f 44 00 00 48 89 f8 48 89 d1 f3 
> 	[  125.740413] RIP: __memcpy+0x12/0x20 RSP: ffffc900016db800
> 	[  125.740425] CR2: ffff880303833802
> 	[  125.740437] ---[ end trace 844fdaac4da7b68d ]---
> 
> > 
> > > +	r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter,
> > > +	    len, 0);
> > > +	kunmap_atomic(map);
> > > +
> > 
> > When writeback fails for some reason, you'll also want to call
> > mapping_set_error to help ensure that those errors get reported (unless
> > you're tracking them on your own somehow). I don't see where that's
> > being done in a cursory glance at these patches, but I could have missed
> > it.
> > 
> > > +	end_page_writeback(page);
> > > +	unlock_page(page);
> > > +	return 0;
> > > +}
> > > +
> > > static int orangefs_readpage(struct file *file, struct page *page)
> > >  {
> > >  	int ret;
> > > @@ -786,6 +759,17 @@ static int orangefs_readpage(struct file *file, struct page *page)
> > >  	return ret;
> > >  }
> > >  
> > > +static int orangefs_write_end(struct file *file,
> > > +    struct address_space *mapping, loff_t pos, unsigned len,
> > > +    unsigned copied, struct page *page, void *fsdata)
> > > +{
> > > +	int r;
> > > +	r = simple_write_end(file, mapping, pos, len, copied, page,
> > > +	    fsdata);
> > > +	mark_inode_dirty_sync(file_inode(file));
> > > +	return r;
> > > +}
> > > +
> > >  static void orangefs_invalidatepage(struct page *page,
> > >  				 unsigned int offset,
> > >  				 unsigned int length)
> > > @@ -815,17 +799,17 @@ static ssize_t orangefs_direct_IO(struct kiocb *iocb,
> > >  {
> > >  	struct file *file = iocb->ki_filp;
> > >  	loff_t pos = *(&iocb->ki_pos);
> > > -	/*
> > > -	 * This cannot happen until write_iter becomes
> > > -	 * generic_file_write_iter.
> > > -	 */
> > > -	BUG_ON(iov_iter_rw(iter) != READ);
> > > -	return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
> > > +	return do_readv_writev(iocb->ki_flags & IOCB_WRITE ?
> > > +	    ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter);
> > >  }
> > >  
> > >  /** ORANGEFS2 implementation of address space operations */
> > >  const struct address_space_operations orangefs_address_operations = {
> > > +	.writepage = orangefs_writepage,
> > >  	.readpage = orangefs_readpage,
> > > +	.set_page_dirty = __set_page_dirty_nobuffers,
> > > +	.write_begin = simple_write_begin,
> > > +	.write_end = orangefs_write_end,
> > >  	.invalidatepage = orangefs_invalidatepage,
> > >  	.releasepage = orangefs_releasepage,
> > >  	.direct_IO = orangefs_direct_IO,
> > 
> > -- 
> > Jeff Layton <jlayton@redhat.com>
diff mbox

Patch

diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index c03deea..3ab0a1f 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -448,69 +448,11 @@  static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
 	return generic_file_read_iter(iocb, iter);
 }
 
-static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+static ssize_t orangefs_file_write_iter(struct kiocb *iocb,
+    struct iov_iter *iter)
 {
-	struct file *file = iocb->ki_filp;
-	loff_t pos;
-	ssize_t rc;
-
-	BUG_ON(iocb->private);
-
-	gossip_debug(GOSSIP_FILE_DEBUG, "orangefs_file_write_iter\n");
-
-	inode_lock(file->f_mapping->host);
-
-	/* Make sure generic_write_checks sees an up to date inode size. */
-	if (file->f_flags & O_APPEND) {
-		rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1,
-		    STATX_SIZE);
-		if (rc == -ESTALE)
-			rc = -EIO;
-		if (rc) {
-			gossip_err("%s: orangefs_inode_getattr failed, "
-			    "rc:%zd:.\n", __func__, rc);
-			goto out;
-		}
-	}
-
-	if (file->f_pos > i_size_read(file->f_mapping->host))
-		orangefs_i_size_write(file->f_mapping->host, file->f_pos);
-
-	rc = generic_write_checks(iocb, iter);
-
-	if (rc <= 0) {
-		gossip_err("%s: generic_write_checks failed, rc:%zd:.\n",
-			   __func__, rc);
-		goto out;
-	}
-
-	/*
-	 * if we are appending, generic_write_checks would have updated
-	 * pos to the end of the file, so we will wait till now to set
-	 * pos...
-	 */
-	pos = *(&iocb->ki_pos);
-
-	rc = do_readv_writev(ORANGEFS_IO_WRITE,
-			     file,
-			     &pos,
-			     iter);
-	if (rc < 0) {
-		gossip_err("%s: do_readv_writev failed, rc:%zd:.\n",
-			   __func__, rc);
-		goto out;
-	}
-
-	iocb->ki_pos = pos;
 	orangefs_stats.writes++;
-
-	if (pos > i_size_read(file->f_mapping->host))
-		orangefs_i_size_write(file->f_mapping->host, pos);
-
-out:
-
-	inode_unlock(file->f_mapping->host);
-	return rc;
+	return generic_file_write_iter(iocb, iter);
 }
 
 /*
@@ -606,9 +548,8 @@  static int orangefs_file_release(struct inode *inode, struct file *file)
 	orangefs_flush_inode(inode);
 
 	/*
-	 * remove all associated inode pages from the page cache and
-	 * readahead cache (if any); this forces an expensive refresh of
-	 * data for the next caller of mmap (or 'get_block' accesses)
+	 * remove all associated inode pages from the readahead cache
+	 * (if any)
 	 */
 	if (file_inode(file) &&
 	    file_inode(file)->i_mapping &&
@@ -621,8 +562,6 @@  static int orangefs_file_release(struct inode *inode, struct file *file)
 			gossip_debug(GOSSIP_INODE_DEBUG,
 			    "flush_racache finished\n");
 		}
-		truncate_inode_pages(file_inode(file)->i_mapping,
-				     0);
 	}
 	return 0;
 }
@@ -741,6 +680,40 @@  const struct file_operations orangefs_file_operations = {
 	.fsync		= orangefs_fsync,
 };
 
+static int orangefs_writepage(struct page *page,
+    struct writeback_control *wbc)
+{
+	struct inode *inode = page->mapping->host;
+	struct iov_iter iter;
+	struct iovec iov;
+	loff_t off;
+	size_t len;
+	ssize_t r;
+	void *map;
+
+	off = page_offset(page);
+	len = i_size_read(inode);
+	if (off + PAGE_SIZE > len)
+		len = len - off;
+	else
+		len = PAGE_SIZE;
+
+	map = kmap_atomic(page);
+	iov.iov_base = map;
+	iov.iov_len = len;
+	iov_iter_init(&iter, WRITE, &iov, 1, len);
+
+	set_page_writeback(page);
+
+	r = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter,
+	    len, 0);
+	kunmap_atomic(map);
+
+	end_page_writeback(page);
+	unlock_page(page);
+	return 0;
+}
+
 static int orangefs_readpage(struct file *file, struct page *page)
 {
 	int ret;
@@ -786,6 +759,17 @@  static int orangefs_readpage(struct file *file, struct page *page)
 	return ret;
 }
 
+static int orangefs_write_end(struct file *file,
+    struct address_space *mapping, loff_t pos, unsigned len,
+    unsigned copied, struct page *page, void *fsdata)
+{
+	int r;
+	r = simple_write_end(file, mapping, pos, len, copied, page,
+	    fsdata);
+	mark_inode_dirty_sync(file_inode(file));
+	return r;
+}
+
 static void orangefs_invalidatepage(struct page *page,
 				 unsigned int offset,
 				 unsigned int length)
@@ -815,17 +799,17 @@  static ssize_t orangefs_direct_IO(struct kiocb *iocb,
 {
 	struct file *file = iocb->ki_filp;
 	loff_t pos = *(&iocb->ki_pos);
-	/*
-	 * This cannot happen until write_iter becomes
-	 * generic_file_write_iter.
-	 */
-	BUG_ON(iov_iter_rw(iter) != READ);
-	return do_readv_writev(ORANGEFS_IO_READ, file, &pos, iter);
+	return do_readv_writev(iocb->ki_flags & IOCB_WRITE ?
+	    ORANGEFS_IO_WRITE : ORANGEFS_IO_READ, file, &pos, iter);
 }
 
 /** ORANGEFS2 implementation of address space operations */
 const struct address_space_operations orangefs_address_operations = {
+	.writepage = orangefs_writepage,
 	.readpage = orangefs_readpage,
+	.set_page_dirty = __set_page_dirty_nobuffers,
+	.write_begin = simple_write_begin,
+	.write_end = orangefs_write_end,
 	.invalidatepage = orangefs_invalidatepage,
 	.releasepage = orangefs_releasepage,
 	.direct_IO = orangefs_direct_IO,