Message ID | 201309121325129235088@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Reviewed-by: Yan, Zheng <zheng.z.yan@intel.com> On 09/12/2013 01:25 PM, majianpeng wrote: > For readv/preadv sync-operatoin, ceph only do the first iov. > It don't think other iovs.Now implement this. > > V4: > modify one bug. > V3: > modify some bug. > V2: > -add generic_segment_checks > -using struct iov_iter replace cloning the iovs. > -return previous successfully copied if ceph_copy_page_vector_to_user > met error. > > > Signed-off-by: Jianpeng Ma <majianpeng@gmail.com> > Reviewed-by: Yan, Zheng <zheng.z.yan@intel.com> > --- > fs/ceph/file.c | 157 ++++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 106 insertions(+), 51 deletions(-) > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index 3de8982..bc7fa52 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -408,51 +408,94 @@ more: > * > * If the read spans object boundary, just do multiple reads. > */ > -static ssize_t ceph_sync_read(struct file *file, char __user *data, > - unsigned len, loff_t *poff, int *checkeof) > +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, > + int *checkeof) > { > + struct file *file = iocb->ki_filp; > struct inode *inode = file_inode(file); > struct page **pages; > - u64 off = *poff; > + u64 off = iocb->ki_pos; > int num_pages, ret; > > - dout("sync_read on file %p %llu~%u %s\n", file, off, len, > + dout("sync_read on file %p %llu~%u %s\n", file, off, > + (unsigned)iocb->ki_left, > (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); > - > - if (file->f_flags & O_DIRECT) { > - num_pages = calc_pages_for((unsigned long)data, len); > - pages = ceph_get_direct_page_vector(data, num_pages, true); > - } else { > - num_pages = calc_pages_for(off, len); > - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); > - } > - if (IS_ERR(pages)) > - return PTR_ERR(pages); > - > /* > * flush any page cache pages in this range. this > * will make concurrent normal and sync io slow, > * but it will at least behave sensibly when they are > * in sequence. 
> */ > - ret = filemap_write_and_wait(inode->i_mapping); > + ret = filemap_write_and_wait_range(inode->i_mapping, off, > + off + iocb->ki_left); > if (ret < 0) > - goto done; > - > - ret = striped_read(inode, off, len, pages, num_pages, checkeof, > - file->f_flags & O_DIRECT, > - (unsigned long)data & ~PAGE_MASK); > + return ret; > > - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) > - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); > - if (ret >= 0) > - *poff = off + ret; > + if (file->f_flags & O_DIRECT) { > + while (iov_iter_count(i)) { > + void __user *data = i->iov[0].iov_base + i->iov_offset; > + size_t len = i->iov[0].iov_len - i->iov_offset; > + > + num_pages = calc_pages_for((unsigned long)data, len); > + pages = ceph_get_direct_page_vector(data, > + num_pages, true); > + if (IS_ERR(pages)) > + return PTR_ERR(pages); > + > + ret = striped_read(inode, off, len, > + pages, num_pages, checkeof, > + 1, (unsigned long)data & ~PAGE_MASK); > + ceph_put_page_vector(pages, num_pages, true); > + > + if (ret <= 0) > + break; > + off += ret; > + iov_iter_advance(i, ret); > + if (ret < len) > + break; > + } > + } else { > + size_t len = iocb->ki_left; > > -done: > - if (file->f_flags & O_DIRECT) > - ceph_put_page_vector(pages, num_pages, true); > - else > + num_pages = calc_pages_for(off, len); > + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); > + if (IS_ERR(pages)) > + return PTR_ERR(pages); > + ret = striped_read(inode, off, len, pages, > + num_pages, checkeof, 0, 0); > + if (ret > 0) { > + int l, k = 0; > + size_t left = len = ret; > + > + while (left) { > + void __user *data = i->iov[0].iov_base > + + i->iov_offset; > + l = min(i->iov[0].iov_len - i->iov_offset, > + left); > + > + ret = ceph_copy_page_vector_to_user(&pages[k], > + data, off, > + l); > + if (ret > 0) { > + iov_iter_advance(i, ret); > + left -= ret; > + off += ret; > + k = calc_pages_for(iocb->ki_pos, > + len - left + 1) - 1; > + BUG_ON(k >= num_pages && left); > + } else > + 
break; > + } > + } > ceph_release_page_vector(pages, num_pages); > + } > + > + if (off > iocb->ki_pos) { > + ret = off - iocb->ki_pos; > + iocb->ki_pos = off; > + iocb->ki_left -= ret; > + } > + > dout("sync_read result %d\n", ret); > return ret; > } > @@ -647,55 +690,67 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, > { > struct file *filp = iocb->ki_filp; > struct ceph_file_info *fi = filp->private_data; > - loff_t *ppos = &iocb->ki_pos; > - size_t len = iov->iov_len; > + size_t len = 0; > struct inode *inode = file_inode(filp); > struct ceph_inode_info *ci = ceph_inode(inode); > - void __user *base = iov->iov_base; > ssize_t ret; > int want, got = 0; > int checkeof = 0, read = 0; > > dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", > inode, ceph_vinop(inode), pos, (unsigned)len, inode); > -again: > + > + ret = generic_segment_checks(iov, &nr_segs, &len, VERIFY_WRITE); > + if (ret) > + return ret; > + > if (fi->fmode & CEPH_FILE_MODE_LAZY) > want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; > else > want = CEPH_CAP_FILE_CACHE; > ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); > if (ret < 0) > - goto out; > + return ret; > + > dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", > inode, ceph_vinop(inode), pos, (unsigned)len, > ceph_cap_string(got)); > > if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || > (iocb->ki_filp->f_flags & O_DIRECT) || > - (fi->flags & CEPH_F_SYNC)) > + (fi->flags & CEPH_F_SYNC)) { > + struct iov_iter i; > + > + iocb->ki_left = len; > + iov_iter_init(&i, iov, nr_segs, len, 0); > +again: > /* hmm, this isn't really async... */ > - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); > - else > + ret = ceph_sync_read(iocb, &i, &checkeof); > + > + if (checkeof && ret >= 0) { > + int statret = ceph_do_getattr(inode, > + CEPH_STAT_CAP_SIZE); > + > + /* hit EOF or hole? 
*/ > + if (statret == 0 && iocb->ki_pos < inode->i_size && > + iocb->ki_left) { > + dout("sync_read hit hole, ppos %lld < size %lld" > + ", reading more\n", iocb->ki_pos, > + inode->i_size); > + > + read += ret; > + checkeof = 0; > + goto again; > + } > + } > + > + } else > ret = generic_file_aio_read(iocb, iov, nr_segs, pos); > > -out: > dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", > inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); > ceph_put_cap_refs(ci, got); > > - if (checkeof && ret >= 0) { > - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); > - > - /* hit EOF or hole? */ > - if (statret == 0 && *ppos < inode->i_size) { > - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); > - read += ret; > - base += ret; > - len -= ret; > - checkeof = 0; > - goto again; > - } > - } > if (ret >= 0) > ret += read; > > -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Thu, Sep 12, 2013 at 1:25 PM, majianpeng <majianpeng@gmail.com> wrote: > For readv/preadv sync-operatoin, ceph only do the first iov. > It don't think other iovs.Now implement this. > > V4: > modify one bug. > V3: > modify some bug. > V2: > -add generic_segment_checks > -using struct iov_iter replace cloning the iovs. > -return previous successfully copied if ceph_copy_page_vector_to_user > met error. > > > Signed-off-by: Jianpeng Ma <majianpeng@gmail.com> > Reviewed-by: Yan, Zheng <zheng.z.yan@intel.com> > --- > fs/ceph/file.c | 157 ++++++++++++++++++++++++++++++++++++++------------------- > 1 file changed, 106 insertions(+), 51 deletions(-) > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c > index 3de8982..bc7fa52 100644 > --- a/fs/ceph/file.c > +++ b/fs/ceph/file.c > @@ -408,51 +408,94 @@ more: > * > * If the read spans object boundary, just do multiple reads. > */ > -static ssize_t ceph_sync_read(struct file *file, char __user *data, > - unsigned len, loff_t *poff, int *checkeof) > +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, > + int *checkeof) > { > + struct file *file = iocb->ki_filp; > struct inode *inode = file_inode(file); > struct page **pages; > - u64 off = *poff; > + u64 off = iocb->ki_pos; > int num_pages, ret; > > - dout("sync_read on file %p %llu~%u %s\n", file, off, len, > + dout("sync_read on file %p %llu~%u %s\n", file, off, > + (unsigned)iocb->ki_left, > (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); > - > - if (file->f_flags & O_DIRECT) { > - num_pages = calc_pages_for((unsigned long)data, len); > - pages = ceph_get_direct_page_vector(data, num_pages, true); > - } else { > - num_pages = calc_pages_for(off, len); > - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); > - } > - if (IS_ERR(pages)) > - return PTR_ERR(pages); > - > /* > * flush any page cache pages in this range. this > * will make concurrent normal and sync io slow, > * but it will at least behave sensibly when they are > * in sequence. 
> */ > - ret = filemap_write_and_wait(inode->i_mapping); > + ret = filemap_write_and_wait_range(inode->i_mapping, off, > + off + iocb->ki_left); > if (ret < 0) > - goto done; > - > - ret = striped_read(inode, off, len, pages, num_pages, checkeof, > - file->f_flags & O_DIRECT, > - (unsigned long)data & ~PAGE_MASK); > + return ret; > > - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) > - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); > - if (ret >= 0) > - *poff = off + ret; > + if (file->f_flags & O_DIRECT) { > + while (iov_iter_count(i)) { > + void __user *data = i->iov[0].iov_base + i->iov_offset; > + size_t len = i->iov[0].iov_len - i->iov_offset; > + > + num_pages = calc_pages_for((unsigned long)data, len); > + pages = ceph_get_direct_page_vector(data, > + num_pages, true); > + if (IS_ERR(pages)) > + return PTR_ERR(pages); > + > + ret = striped_read(inode, off, len, > + pages, num_pages, checkeof, > + 1, (unsigned long)data & ~PAGE_MASK); > + ceph_put_page_vector(pages, num_pages, true); > + > + if (ret <= 0) > + break; > + off += ret; > + iov_iter_advance(i, ret); > + if (ret < len) > + break; > + } > + } else { > + size_t len = iocb->ki_left; > > -done: > - if (file->f_flags & O_DIRECT) > - ceph_put_page_vector(pages, num_pages, true); > - else > + num_pages = calc_pages_for(off, len); > + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); > + if (IS_ERR(pages)) > + return PTR_ERR(pages); > + ret = striped_read(inode, off, len, pages, > + num_pages, checkeof, 0, 0); > + if (ret > 0) { > + int l, k = 0; > + size_t left = len = ret; > + > + while (left) { > + void __user *data = i->iov[0].iov_base > + + i->iov_offset; > + l = min(i->iov[0].iov_len - i->iov_offset, > + left); > + > + ret = ceph_copy_page_vector_to_user(&pages[k], > + data, off, > + l); > + if (ret > 0) { > + iov_iter_advance(i, ret); > + left -= ret; > + off += ret; > + k = calc_pages_for(iocb->ki_pos, > + len - left + 1) - 1; > + BUG_ON(k >= num_pages && left); > + } else > + 
break; > + } > + } > ceph_release_page_vector(pages, num_pages); > + } > + > + if (off > iocb->ki_pos) { > + ret = off - iocb->ki_pos; > + iocb->ki_pos = off; > + iocb->ki_left -= ret; > + } > + > dout("sync_read result %d\n", ret); > return ret; > } > @@ -647,55 +690,67 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, > { > struct file *filp = iocb->ki_filp; > struct ceph_file_info *fi = filp->private_data; > - loff_t *ppos = &iocb->ki_pos; > - size_t len = iov->iov_len; > + size_t len = 0; > struct inode *inode = file_inode(filp); > struct ceph_inode_info *ci = ceph_inode(inode); > - void __user *base = iov->iov_base; > ssize_t ret; > int want, got = 0; > int checkeof = 0, read = 0; > > dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", > inode, ceph_vinop(inode), pos, (unsigned)len, inode); > -again: > + > + ret = generic_segment_checks(iov, &nr_segs, &len, VERIFY_WRITE); > + if (ret) > + return ret; > + > if (fi->fmode & CEPH_FILE_MODE_LAZY) > want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; > else > want = CEPH_CAP_FILE_CACHE; > ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); > if (ret < 0) > - goto out; > + return ret; > + > dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", > inode, ceph_vinop(inode), pos, (unsigned)len, > ceph_cap_string(got)); > > if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || > (iocb->ki_filp->f_flags & O_DIRECT) || > - (fi->flags & CEPH_F_SYNC)) > + (fi->flags & CEPH_F_SYNC)) { > + struct iov_iter i; > + > + iocb->ki_left = len; > + iov_iter_init(&i, iov, nr_segs, len, 0); > +again: > /* hmm, this isn't really async... */ > - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); > - else > + ret = ceph_sync_read(iocb, &i, &checkeof); > + > + if (checkeof && ret >= 0) { > + int statret = ceph_do_getattr(inode, > + CEPH_STAT_CAP_SIZE); It's wrong to move the getattr here, because calling getattr while holding the Fr cap can cause a hang. 
Regards Yan, Zheng > + > + /* hit EOF or hole? */ > + if (statret == 0 && iocb->ki_pos < inode->i_size && > + iocb->ki_left) { > + dout("sync_read hit hole, ppos %lld < size %lld" > + ", reading more\n", iocb->ki_pos, > + inode->i_size); > + > + read += ret; > + checkeof = 0; > + goto again; > + } > + } > + > + } else > ret = generic_file_aio_read(iocb, iov, nr_segs, pos); > > -out: > dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", > inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); > ceph_put_cap_refs(ci, got); > > - if (checkeof && ret >= 0) { > - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); > - > - /* hit EOF or hole? */ > - if (statret == 0 && *ppos < inode->i_size) { > - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); > - read += ret; > - base += ret; > - len -= ret; > - checkeof = 0; > - goto again; > - } > - } > if (ret >= 0) > ret += read; > > -- > 1.8.1.2 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
>On Thu, Sep 12, 2013 at 1:25 PM, majianpeng <majianpeng@gmail.com> wrote: >> For readv/preadv sync-operatoin, ceph only do the first iov. >> It don't think other iovs.Now implement this. >> >> V4: >> modify one bug. >> V3: >> modify some bug. >> V2: >> -add generic_segment_checks >> -using struct iov_iter replace cloning the iovs. >> -return previous successfully copied if ceph_copy_page_vector_to_user >> met error. >> >> >> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com> >> Reviewed-by: Yan, Zheng <zheng.z.yan@intel.com> >> --- >> fs/ceph/file.c | 157 ++++++++++++++++++++++++++++++++++++++------------------- >> 1 file changed, 106 insertions(+), 51 deletions(-) >> >> diff --git a/fs/ceph/file.c b/fs/ceph/file.c >> index 3de8982..bc7fa52 100644 >> --- a/fs/ceph/file.c >> +++ b/fs/ceph/file.c >> @@ -408,51 +408,94 @@ more: >> * >> * If the read spans object boundary, just do multiple reads. >> */ >> -static ssize_t ceph_sync_read(struct file *file, char __user *data, >> - unsigned len, loff_t *poff, int *checkeof) >> +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, >> + int *checkeof) >> { >> + struct file *file = iocb->ki_filp; >> struct inode *inode = file_inode(file); >> struct page **pages; >> - u64 off = *poff; >> + u64 off = iocb->ki_pos; >> int num_pages, ret; >> >> - dout("sync_read on file %p %llu~%u %s\n", file, off, len, >> + dout("sync_read on file %p %llu~%u %s\n", file, off, >> + (unsigned)iocb->ki_left, >> (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); >> - >> - if (file->f_flags & O_DIRECT) { >> - num_pages = calc_pages_for((unsigned long)data, len); >> - pages = ceph_get_direct_page_vector(data, num_pages, true); >> - } else { >> - num_pages = calc_pages_for(off, len); >> - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); >> - } >> - if (IS_ERR(pages)) >> - return PTR_ERR(pages); >> - >> /* >> * flush any page cache pages in this range. this >> * will make concurrent normal and sync io slow, >> * but it will at least behave sensibly when they are >> * in sequence. >> */ >> - ret = filemap_write_and_wait(inode->i_mapping); >> + ret = filemap_write_and_wait_range(inode->i_mapping, off, >> + off + iocb->ki_left); >> if (ret < 0) >> - goto done; >> - >> - ret = striped_read(inode, off, len, pages, num_pages, checkeof, >> - file->f_flags & O_DIRECT, >> - (unsigned long)data & ~PAGE_MASK); >> + return ret; >> >> - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) >> - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); >> - if (ret >= 0) >> - *poff = off + ret; >> + if (file->f_flags & O_DIRECT) { >> + while (iov_iter_count(i)) { >> + void __user *data = i->iov[0].iov_base + i->iov_offset; >> + size_t len = i->iov[0].iov_len - i->iov_offset; >> + >> + num_pages = calc_pages_for((unsigned long)data, len); >> + pages = ceph_get_direct_page_vector(data, >> + num_pages, true); >> + if (IS_ERR(pages)) >> + return PTR_ERR(pages); >> + >> + ret = striped_read(inode, off, len, >> + pages, num_pages, checkeof, >> + 1, (unsigned long)data & ~PAGE_MASK); >> + ceph_put_page_vector(pages, num_pages, true); >> + >> + if (ret <= 0) >> + break; >> + off += ret; >> + iov_iter_advance(i, ret); >> + if (ret < len) >> + break; >> + } >> + } else { >> + size_t len = iocb->ki_left; >> >> -done: >> - if (file->f_flags & O_DIRECT) >> - ceph_put_page_vector(pages, num_pages, true); >> - else >> + num_pages = calc_pages_for(off, len); >> + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); >> + if (IS_ERR(pages)) >> + return PTR_ERR(pages); >> + ret = striped_read(inode, off, len, pages, >> + num_pages, checkeof, 0, 0); >> + if (ret > 0) { >> + int l, k = 0; >> + size_t left = len = ret; >> + >> + while (left) { >> + void __user *data = i->iov[0].iov_base >> + + i->iov_offset; >> + l = min(i->iov[0].iov_len - i->iov_offset, >> + left); >> + >> + ret = ceph_copy_page_vector_to_user(&pages[k], >> + data, off, >> + l); >> + if (ret > 0) { >> + iov_iter_advance(i, ret); >> + left -= ret; >> + off += ret; >> + k = calc_pages_for(iocb->ki_pos, >> + len - left + 1) - 1; >> + BUG_ON(k >= num_pages && left); >> + } else >> + break; >> + } >> + } >> ceph_release_page_vector(pages, num_pages); >> + } >> + >> + if (off > iocb->ki_pos) { >> + ret = off - iocb->ki_pos; >> + iocb->ki_pos = off; >> + iocb->ki_left -= ret; >> + } >> + >> dout("sync_read result %d\n", ret); >> return ret; >> } >> @@ -647,55 +690,67 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, >> { >> struct file *filp = iocb->ki_filp; >> struct ceph_file_info *fi = filp->private_data; >> - loff_t *ppos = &iocb->ki_pos; >> - size_t len = iov->iov_len; >> + size_t len = 0; >> struct inode *inode = file_inode(filp); >> struct ceph_inode_info *ci = ceph_inode(inode); >> - void __user *base = iov->iov_base; >> ssize_t ret; >> int want, got = 0; >> int checkeof = 0, read = 0; >> >> dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", >> inode, ceph_vinop(inode), pos, (unsigned)len, inode); >> -again: >> + >> + ret = generic_segment_checks(iov, &nr_segs, &len, VERIFY_WRITE); >> + if (ret) >> + return ret; >> + >> if (fi->fmode & CEPH_FILE_MODE_LAZY) >> want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; >> else >> want = CEPH_CAP_FILE_CACHE; >> ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); >> if (ret < 0) >> - goto out; >> + return ret; >> + >> dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", >> inode, ceph_vinop(inode), pos, (unsigned)len, >> ceph_cap_string(got)); >> >> if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || >> (iocb->ki_filp->f_flags & O_DIRECT) || >> - (fi->flags & CEPH_F_SYNC)) >> + (fi->flags & CEPH_F_SYNC)) { >> + struct iov_iter i; >> + >> + iocb->ki_left = len; >> + iov_iter_init(&i, iov, nr_segs, len, 0); >> +again: >> /* hmm, this isn't really async... */ >> - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); >> - else >> + ret = ceph_sync_read(iocb, &i, &checkeof); >> + >> + if (checkeof && ret >= 0) { >> + int statret = ceph_do_getattr(inode, >> + CEPH_STAT_CAP_SIZE); > >It's wrong to move getattr to here. because getattr while holding Fr >cap can cause hang. > >Regards >Yan, Zheng > Hi, Can you explain in detail? Thanks! Jianpeng Ma -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On 09/22/2013 11:00 AM, majianpeng wrote: >> On Thu, Sep 12, 2013 at 1:25 PM, majianpeng <majianpeng@gmail.com> wrote: >>> For readv/preadv sync-operatoin, ceph only do the first iov. >>> It don't think other iovs.Now implement this. >>> >>> V4: >>> modify one bug. >>> V3: >>> modify some bug. >>> V2: >>> -add generic_segment_checks >>> -using struct iov_iter replace cloning the iovs. >>> -return previous successfully copied if ceph_copy_page_vector_to_user >>> met error. >>> >>> >>> Signed-off-by: Jianpeng Ma <majianpeng@gmail.com> >>> Reviewed-by: Yan, Zheng <zheng.z.yan@intel.com> >>> --- >>> fs/ceph/file.c | 157 ++++++++++++++++++++++++++++++++++++++------------------- >>> 1 file changed, 106 insertions(+), 51 deletions(-) >>> >>> diff --git a/fs/ceph/file.c b/fs/ceph/file.c >>> index 3de8982..bc7fa52 100644 >>> --- a/fs/ceph/file.c >>> +++ b/fs/ceph/file.c >>> @@ -408,51 +408,94 @@ more: >>> * >>> * If the read spans object boundary, just do multiple reads. >>> */ >>> -static ssize_t ceph_sync_read(struct file *file, char __user *data, >>> - unsigned len, loff_t *poff, int *checkeof) >>> +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, >>> + int *checkeof) >>> { >>> + struct file *file = iocb->ki_filp; >>> struct inode *inode = file_inode(file); >>> struct page **pages; >>> - u64 off = *poff; >>> + u64 off = iocb->ki_pos; >>> int num_pages, ret; >>> >>> - dout("sync_read on file %p %llu~%u %s\n", file, off, len, >>> + dout("sync_read on file %p %llu~%u %s\n", file, off, >>> + (unsigned)iocb->ki_left, >>> (file->f_flags & O_DIRECT) ? 
"O_DIRECT" : ""); >>> - >>> - if (file->f_flags & O_DIRECT) { >>> - num_pages = calc_pages_for((unsigned long)data, len); >>> - pages = ceph_get_direct_page_vector(data, num_pages, true); >>> - } else { >>> - num_pages = calc_pages_for(off, len); >>> - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); >>> - } >>> - if (IS_ERR(pages)) >>> - return PTR_ERR(pages); >>> - >>> /* >>> * flush any page cache pages in this range. this >>> * will make concurrent normal and sync io slow, >>> * but it will at least behave sensibly when they are >>> * in sequence. >>> */ >>> - ret = filemap_write_and_wait(inode->i_mapping); >>> + ret = filemap_write_and_wait_range(inode->i_mapping, off, >>> + off + iocb->ki_left); >>> if (ret < 0) >>> - goto done; >>> - >>> - ret = striped_read(inode, off, len, pages, num_pages, checkeof, >>> - file->f_flags & O_DIRECT, >>> - (unsigned long)data & ~PAGE_MASK); >>> + return ret; >>> >>> - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) >>> - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); >>> - if (ret >= 0) >>> - *poff = off + ret; >>> + if (file->f_flags & O_DIRECT) { >>> + while (iov_iter_count(i)) { >>> + void __user *data = i->iov[0].iov_base + i->iov_offset; >>> + size_t len = i->iov[0].iov_len - i->iov_offset; >>> + >>> + num_pages = calc_pages_for((unsigned long)data, len); >>> + pages = ceph_get_direct_page_vector(data, >>> + num_pages, true); >>> + if (IS_ERR(pages)) >>> + return PTR_ERR(pages); >>> + >>> + ret = striped_read(inode, off, len, >>> + pages, num_pages, checkeof, >>> + 1, (unsigned long)data & ~PAGE_MASK); >>> + ceph_put_page_vector(pages, num_pages, true); >>> + >>> + if (ret <= 0) >>> + break; >>> + off += ret; >>> + iov_iter_advance(i, ret); >>> + if (ret < len) >>> + break; >>> + } >>> + } else { >>> + size_t len = iocb->ki_left; >>> >>> -done: >>> - if (file->f_flags & O_DIRECT) >>> - ceph_put_page_vector(pages, num_pages, true); >>> - else >>> + num_pages = calc_pages_for(off, len); >>> + pages = 
ceph_alloc_page_vector(num_pages, GFP_NOFS); >>> + if (IS_ERR(pages)) >>> + return PTR_ERR(pages); >>> + ret = striped_read(inode, off, len, pages, >>> + num_pages, checkeof, 0, 0); >>> + if (ret > 0) { >>> + int l, k = 0; >>> + size_t left = len = ret; >>> + >>> + while (left) { >>> + void __user *data = i->iov[0].iov_base >>> + + i->iov_offset; >>> + l = min(i->iov[0].iov_len - i->iov_offset, >>> + left); >>> + >>> + ret = ceph_copy_page_vector_to_user(&pages[k], >>> + data, off, >>> + l); >>> + if (ret > 0) { >>> + iov_iter_advance(i, ret); >>> + left -= ret; >>> + off += ret; >>> + k = calc_pages_for(iocb->ki_pos, >>> + len - left + 1) - 1; >>> + BUG_ON(k >= num_pages && left); >>> + } else >>> + break; >>> + } >>> + } >>> ceph_release_page_vector(pages, num_pages); >>> + } >>> + >>> + if (off > iocb->ki_pos) { >>> + ret = off - iocb->ki_pos; >>> + iocb->ki_pos = off; >>> + iocb->ki_left -= ret; >>> + } >>> + >>> dout("sync_read result %d\n", ret); >>> return ret; >>> } >>> @@ -647,55 +690,67 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, >>> { >>> struct file *filp = iocb->ki_filp; >>> struct ceph_file_info *fi = filp->private_data; >>> - loff_t *ppos = &iocb->ki_pos; >>> - size_t len = iov->iov_len; >>> + size_t len = 0; >>> struct inode *inode = file_inode(filp); >>> struct ceph_inode_info *ci = ceph_inode(inode); >>> - void __user *base = iov->iov_base; >>> ssize_t ret; >>> int want, got = 0; >>> int checkeof = 0, read = 0; >>> >>> dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", >>> inode, ceph_vinop(inode), pos, (unsigned)len, inode); >>> -again: >>> + >>> + ret = generic_segment_checks(iov, &nr_segs, &len, VERIFY_WRITE); >>> + if (ret) >>> + return ret; >>> + >>> if (fi->fmode & CEPH_FILE_MODE_LAZY) >>> want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; >>> else >>> want = CEPH_CAP_FILE_CACHE; >>> ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); >>> if (ret < 0) >>> - goto out; >>> + return ret; >>> 
+ >>> dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", >>> inode, ceph_vinop(inode), pos, (unsigned)len, >>> ceph_cap_string(got)); >>> >>> if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || >>> (iocb->ki_filp->f_flags & O_DIRECT) || >>> - (fi->flags & CEPH_F_SYNC)) >>> + (fi->flags & CEPH_F_SYNC)) { >>> + struct iov_iter i; >>> + >>> + iocb->ki_left = len; >>> + iov_iter_init(&i, iov, nr_segs, len, 0); >>> +again: >>> /* hmm, this isn't really async... */ >>> - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); >>> - else >>> + ret = ceph_sync_read(iocb, &i, &checkeof); >>> + >>> + if (checkeof && ret >= 0) { >>> + int statret = ceph_do_getattr(inode, >>> + CEPH_STAT_CAP_SIZE); >> >> It's wrong to move getattr here, because getattr while holding the Fr >> cap can cause a hang. >> >> Regards >> Yan, Zheng >> > Hi, > Can you explain in detail? getattr needs to "read lock" the inode's filelock, but the lock can be in an unstable state. The getattr request waits for the lock's state to become stable, while the lock waits for the client to release the Fr cap. Your patches are already in the master branch of ceph-client; please send an incremental patch to fix the issue. Regards Yan, Zheng. > > Thanks! > Jianpeng Ma > -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3de8982..bc7fa52 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -408,51 +408,94 @@ more: * * If the read spans object boundary, just do multiple reads. */ -static ssize_t ceph_sync_read(struct file *file, char __user *data, - unsigned len, loff_t *poff, int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, + int *checkeof) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct page **pages; - u64 off = *poff; + u64 off = iocb->ki_pos; int num_pages, ret; - dout("sync_read on file %p %llu~%u %s\n", file, off, len, + dout("sync_read on file %p %llu~%u %s\n", file, off, + (unsigned)iocb->ki_left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); - - if (file->f_flags & O_DIRECT) { - num_pages = calc_pages_for((unsigned long)data, len); - pages = ceph_get_direct_page_vector(data, num_pages, true); - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); - } - if (IS_ERR(pages)) - return PTR_ERR(pages); - /* * flush any page cache pages in this range. this * will make concurrent normal and sync io slow, * but it will at least behave sensibly when they are * in sequence. 
*/ - ret = filemap_write_and_wait(inode->i_mapping); + ret = filemap_write_and_wait_range(inode->i_mapping, off, + off + iocb->ki_left); if (ret < 0) - goto done; - - ret = striped_read(inode, off, len, pages, num_pages, checkeof, - file->f_flags & O_DIRECT, - (unsigned long)data & ~PAGE_MASK); + return ret; - if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) - ret = ceph_copy_page_vector_to_user(pages, data, off, ret); - if (ret >= 0) - *poff = off + ret; + if (file->f_flags & O_DIRECT) { + while (iov_iter_count(i)) { + void __user *data = i->iov[0].iov_base + i->iov_offset; + size_t len = i->iov[0].iov_len - i->iov_offset; + + num_pages = calc_pages_for((unsigned long)data, len); + pages = ceph_get_direct_page_vector(data, + num_pages, true); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = striped_read(inode, off, len, + pages, num_pages, checkeof, + 1, (unsigned long)data & ~PAGE_MASK); + ceph_put_page_vector(pages, num_pages, true); + + if (ret <= 0) + break; + off += ret; + iov_iter_advance(i, ret); + if (ret < len) + break; + } + } else { + size_t len = iocb->ki_left; -done: - if (file->f_flags & O_DIRECT) - ceph_put_page_vector(pages, num_pages, true); - else + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof, 0, 0); + if (ret > 0) { + int l, k = 0; + size_t left = len = ret; + + while (left) { + void __user *data = i->iov[0].iov_base + + i->iov_offset; + l = min(i->iov[0].iov_len - i->iov_offset, + left); + + ret = ceph_copy_page_vector_to_user(&pages[k], + data, off, + l); + if (ret > 0) { + iov_iter_advance(i, ret); + left -= ret; + off += ret; + k = calc_pages_for(iocb->ki_pos, + len - left + 1) - 1; + BUG_ON(k >= num_pages && left); + } else + break; + } + } ceph_release_page_vector(pages, num_pages); + } + + if (off > iocb->ki_pos) { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + 
iocb->ki_left -= ret; + } + dout("sync_read result %d\n", ret); return ret; } @@ -647,55 +690,67 @@ static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - loff_t *ppos = &iocb->ki_pos; - size_t len = iov->iov_len; + size_t len = 0; struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - void __user *base = iov->iov_base; ssize_t ret; int want, got = 0; int checkeof = 0, read = 0; dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), pos, (unsigned)len, inode); -again: + + ret = generic_segment_checks(iov, &nr_segs, &len, VERIFY_WRITE); + if (ret) + return ret; + if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, &got, -1); if (ret < 0) - goto out; + return ret; + dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), pos, (unsigned)len, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_filp->f_flags & O_DIRECT) || - (fi->flags & CEPH_F_SYNC)) + (fi->flags & CEPH_F_SYNC)) { + struct iov_iter i; + + iocb->ki_left = len; + iov_iter_init(&i, iov, nr_segs, len, 0); +again: /* hmm, this isn't really async... */ - ret = ceph_sync_read(filp, base, len, ppos, &checkeof); - else + ret = ceph_sync_read(iocb, &i, &checkeof); + + if (checkeof && ret >= 0) { + int statret = ceph_do_getattr(inode, + CEPH_STAT_CAP_SIZE); + + /* hit EOF or hole? 
*/ + if (statret == 0 && iocb->ki_pos < inode->i_size && + iocb->ki_left) { + dout("sync_read hit hole, ppos %lld < size %lld" + ", reading more\n", iocb->ki_pos, + inode->i_size); + + read += ret; + checkeof = 0; + goto again; + } + } + + } else ret = generic_file_aio_read(iocb, iov, nr_segs, pos); -out: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); ceph_put_cap_refs(ci, got); - if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); - - /* hit EOF or hole? */ - if (statret == 0 && *ppos < inode->i_size) { - dout("aio_read sync_read hit hole, ppos %lld < size %lld, reading more\n", *ppos, inode->i_size); - read += ret; - base += ret; - len -= ret; - checkeof = 0; - goto again; - } - } if (ret >= 0) ret += read;