Message ID | 1362120384-5188-7-git-send-email-zheng.z.yan@intel.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Feb 28, 2013 at 10:46 PM, Yan, Zheng <zheng.z.yan@intel.com> wrote: > From: "Yan, Zheng" <zheng.z.yan@intel.com> > > ceph_aio_write() has an optimization that marks CEPH_CAP_FILE_WR > cap dirty before data is copied to page cache and inode size is > updated. The optimization avoids slow cap revocation caused by > balance_dirty_pages(), but introduces inode size update race. If > ceph_check_caps() flushes the dirty cap before the inode size is > updated, MDS can miss the new inode size. So just remove the > optimization. > > Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com> > --- > fs/ceph/file.c | 42 +++++++++++++++++------------------------- > 1 file changed, 17 insertions(+), 25 deletions(-) > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index a949805..28ef273 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -724,9 +724,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, > if (ceph_snap(inode) != CEPH_NOSNAP) > return -EROFS; > > + sb_start_write(inode->i_sb); > retry_snap: > - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) > - return -ENOSPC; > + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { > + ret = -ENOSPC; > + goto out; > + } > __ceph_do_pending_vmtruncate(inode); > dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", > inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, > @@ -750,29 +753,10 @@ retry_snap: > ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, > &iocb->ki_pos); > } else { > - /* > - * buffered write; drop Fw early to avoid slow > - * revocation if we get stuck on balance_dirty_pages > - */ > - int dirty; > - > - spin_lock(&ci->i_ceph_lock); > - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); > - spin_unlock(&ci->i_ceph_lock); > - ceph_put_cap_refs(ci, got); > - > - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); > - if ((ret >= 0 || ret == -EIOCBQUEUED) && > - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) > - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { > - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); > - if (err < 0) > - ret = err; > - } > - > - if (dirty) > - __mark_inode_dirty(inode, dirty); > - goto out; > + mutex_lock(&inode->i_mutex); > + ret = __generic_file_aio_write(iocb, iov, nr_segs, > + &iocb->ki_pos); > + mutex_unlock(&inode->i_mutex); Hmm, you're here passing in a different value than the removed generic_file_aio_write() call did — &iocb->ki_pos instead of pos. Everything else is using the pos parameter so I rather expect that should still be used here? Also a quick skim of the interfaces makes me think that the two versions aren't interchangeable — __generic_file_aio_write() also handles O_SYNC in addition to grabbing i_mutex. Why'd you switch them? (I haven't checked deep enough to see if the unlocked version is correct or not, but it does say that's what most filesystems should use.) -Greg > } > > if (ret >= 0) { > @@ -790,12 +774,20 @@ out_put: > ceph_cap_string(got)); > ceph_put_cap_refs(ci, got); > > + if (ret >= 0 && > + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || > + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { > + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); > + if (err < 0) > + ret = err; > + } > out: > if (ret == -EOLDSNAPC) { > dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", > inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); > goto retry_snap; > } > + sb_end_write(inode->i_sb); > > return ret; > } > -- > 1.7.11.7 > > -- > To unsubscribe from this list: send the line "unsubscribe ceph-devel" in > the body of a message to majordomo@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 03/05/2013 02:26 AM, Gregory Farnum wrote: > On Thu, Feb 28, 2013 at 10:46 PM, Yan, Zheng <zheng.z.yan@intel.com> wrote: >> From: "Yan, Zheng" <zheng.z.yan@intel.com> >> >> ceph_aio_write() has an optimization that marks CEPH_CAP_FILE_WR >> cap dirty before data is copied to page cache and inode size is >> updated. The optimization avoids slow cap revocation caused by >> balance_dirty_pages(), but introduces inode size update race. If >> ceph_check_caps() flushes the dirty cap before the inode size is >> updated, MDS can miss the new inode size. So just remove the >> optimization. >> >> Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com> >> --- >> fs/ceph/file.c | 42 +++++++++++++++++------------------------- >> 1 file changed, 17 insertions(+), 25 deletions(-) >> >> diff --git a/fs/ceph/file.c b/fs/ceph/file.c >> index a949805..28ef273 100644 >> --- a/fs/ceph/file.c >> +++ b/fs/ceph/file.c >> @@ -724,9 +724,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, >> if (ceph_snap(inode) != CEPH_NOSNAP) >> return -EROFS; >> >> + sb_start_write(inode->i_sb); >> retry_snap: >> - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) >> - return -ENOSPC; >> + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { >> + ret = -ENOSPC; >> + goto out; >> + } >> __ceph_do_pending_vmtruncate(inode); >> dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", >> inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, >> @@ -750,29 +753,10 @@ retry_snap: >> ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, >> &iocb->ki_pos); >> } else { >> - /* >> - * buffered write; drop Fw early to avoid slow >> - * revocation if we get stuck on balance_dirty_pages >> - */ >> - int dirty; >> - >> - spin_lock(&ci->i_ceph_lock); >> - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); >> - spin_unlock(&ci->i_ceph_lock); >> - ceph_put_cap_refs(ci, got); >> - >> - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); >> - if ((ret >= 0 || ret == -EIOCBQUEUED) && >> - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) >> - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { >> - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); >> - if (err < 0) >> - ret = err; >> - } >> - >> - if (dirty) >> - __mark_inode_dirty(inode, dirty); >> - goto out; >> + mutex_lock(&inode->i_mutex); >> + ret = __generic_file_aio_write(iocb, iov, nr_segs, >> + &iocb->ki_pos); >> + mutex_unlock(&inode->i_mutex); > > Hmm, you're here passing in a different value than the removed > generic_file_aio_write() call did — &iocb->ki_pos instead of pos. > Everything else is using the pos parameter so I rather expect that > should still be used here? They always have the same value, see the BUG_ON in generic_file_aio_write() > Also a quick skim of the interfaces makes me think that the two > versions aren't interchangeable — __generic_file_aio_write() also > handles O_SYNC in addition to grabbing i_mutex. Why'd you switch them? ceph has its own code that handles O_SYNC case. I want to make sb_start_write() covers ceph_sync_write(), that's the reason I use __generic_file_aio_write() here. Regards Yan, Zheng > (I haven't checked deep enough to see if the unlocked version is > correct or not, but it does say that's what most filesystems should > use.) > -Greg > > >> } >> >> if (ret >= 0) { >> @@ -790,12 +774,20 @@ out_put: >> ceph_cap_string(got)); >> ceph_put_cap_refs(ci, got); >> >> + if (ret >= 0 && >> + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || >> + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { >> + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); >> + if (err < 0) >> + ret = err; >> + } >> out: >> if (ret == -EOLDSNAPC) { >> dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", >> inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); >> goto retry_snap; >> } >> + sb_end_write(inode->i_sb); >> >> return ret; >> } >> -- >> 1.7.11.7 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in >> the body of a message to majordomo@vger.kernel.org >> More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Monday, March 4, 2013 at 5:57 PM, Yan, Zheng wrote: > On 03/05/2013 02:26 AM, Gregory Farnum wrote: > > On Thu, Feb 28, 2013 at 10:46 PM, Yan, Zheng <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)> wrote: > > > From: "Yan, Zheng" <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)> > > > > > > ceph_aio_write() has an optimization that marks CEPH_CAP_FILE_WR > > > cap dirty before data is copied to page cache and inode size is > > > updated. The optimization avoids slow cap revocation caused by > > > balance_dirty_pages(), but introduces inode size update race. If > > > ceph_check_caps() flushes the dirty cap before the inode size is > > > updated, MDS can miss the new inode size. So just remove the > > > optimization. > > > > > > Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com (mailto:zheng.z.yan@intel.com)> > > > --- > > > fs/ceph/file.c | 42 +++++++++++++++++------------------------- > > > 1 file changed, 17 insertions(+), 25 deletions(-) > > > > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > > > index a949805..28ef273 100644 > > > --- a/fs/ceph/file.c > > > +++ b/fs/ceph/file.c > > > @@ -724,9 +724,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, > > > if (ceph_snap(inode) != CEPH_NOSNAP) > > > return -EROFS; > > > > > > + sb_start_write(inode->i_sb); > > > retry_snap: > > > - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) > > > - return -ENOSPC; > > > + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { > > > + ret = -ENOSPC; > > > + goto out; > > > + } > > > __ceph_do_pending_vmtruncate(inode); > > > dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", > > > inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, > > > @@ -750,29 +753,10 @@ retry_snap: > > > ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, > > > &iocb->ki_pos); > > > } else { > > > - /* > > > - * buffered write; drop Fw early to avoid slow > > > - * revocation if we get stuck on balance_dirty_pages > > > - */ > > > - int dirty; > > > - > > > - spin_lock(&ci->i_ceph_lock); > > > - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); > > > - spin_unlock(&ci->i_ceph_lock); > > > - ceph_put_cap_refs(ci, got); > > > - > > > - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); > > > - if ((ret >= 0 || ret == -EIOCBQUEUED) && > > > - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) > > > - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { > > > - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); > > > - if (err < 0) > > > - ret = err; > > > - } > > > - > > > - if (dirty) > > > - __mark_inode_dirty(inode, dirty); > > > - goto out; > > > + mutex_lock(&inode->i_mutex); > > > + ret = __generic_file_aio_write(iocb, iov, nr_segs, > > > + &iocb->ki_pos); > > > + mutex_unlock(&inode->i_mutex); > > > > > > > > Hmm, you're here passing in a different value than the removed > > generic_file_aio_write() call did — &iocb->ki_pos instead of pos. > > Everything else is using the pos parameter so I rather expect that > > should still be used here? > > > > They always have the same value, see the BUG_ON in generic_file_aio_write() > > > Also a quick skim of the interfaces makes me think that the two > > versions aren't interchangeable — __generic_file_aio_write() also > > handles O_SYNC in addition to grabbing i_mutex. Why'd you switch them? > > > > ceph has its own code that handles O_SYNC case. I want to make sb_start_write() > covers ceph_sync_write(), that's the reason I use __generic_file_aio_write() here. > > Regards > Yan, Zheng > Ah, yep — sounds good! -Greg -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index a949805..28ef273 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -724,9 +724,12 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov, if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; + sb_start_write(inode->i_sb); retry_snap: - if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) - return -ENOSPC; + if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL)) { + ret = -ENOSPC; + goto out; + } __ceph_do_pending_vmtruncate(inode); dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len, @@ -750,29 +753,10 @@ retry_snap: ret = ceph_sync_write(file, iov->iov_base, iov->iov_len, &iocb->ki_pos); } else { - /* - * buffered write; drop Fw early to avoid slow - * revocation if we get stuck on balance_dirty_pages - */ - int dirty; - - spin_lock(&ci->i_ceph_lock); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); - spin_unlock(&ci->i_ceph_lock); - ceph_put_cap_refs(ci, got); - - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - if ((ret >= 0 || ret == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) - || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { - err = vfs_fsync_range(file, pos, pos + ret - 1, 1); - if (err < 0) - ret = err; - } - - if (dirty) - __mark_inode_dirty(inode, dirty); - goto out; + mutex_lock(&inode->i_mutex); + ret = __generic_file_aio_write(iocb, iov, nr_segs, + &iocb->ki_pos); + mutex_unlock(&inode->i_mutex); } if (ret >= 0) { @@ -790,12 +774,20 @@ out_put: ceph_cap_string(got)); ceph_put_cap_refs(ci, got); + if (ret >= 0 && + ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host) || + ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) { + err = vfs_fsync_range(file, pos, pos + ret - 1, 1); + if (err < 0) + ret = err; + } out: if (ret == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len); goto retry_snap; } + sb_end_write(inode->i_sb); return ret; }