Message ID | 20200619155036.GZ8681@bombadil.infradead.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [RFC] Bypass filesystems for reading cached pages | expand |
On 6/19/20 8:50 AM, Matthew Wilcox wrote: > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > The advantage of this patch is that we can avoid taking any filesystem > lock, as long as the pages being accessed are in the cache (and we don't > need to readahead any pages into the cache). We also avoid an indirect > function call in these cases. > > I'm sure reusing the name call_read_iter() is the wrong way to go about > this, but renaming all the callers would make this a larger patch. > I'm happy to do it if something like this stands a chance of being > accepted. > > Compared to Andreas' patch, I removed the -ECANCELED return value. > We can happily return 0 from generic_file_buffered_read() and it's less > code to handle that. I bypass the attempt to read from the page cache > for O_DIRECT reads, and for inodes which have no cached pages. Hopefully > this will avoid calling generic_file_buffered_read() for drivers which > implement read_iter() (although I haven't audited them all to check that > > This could go horribly wrong if filesystems rely on doing work in their > ->read_iter implementation (eg checking i_size after acquiring their > lock) instead of keeping the page cache uptodate. On the other hand, > the ->map_pages() method is already called without locks, so filesystems > should already be prepared for this. > > Arguably we could do something similar for writes. I'm a little more > scared of that patch since filesystems are more likely to want to do > things to keep their fies in sync for writes. I did a testing with NVMeOF target file backend with buffered I/O enabled with your patch and setting the IOCB_CACHED for each I/O ored '|' with IOCB_NOWAIT calling call_read_iter_cached() [1]. The name was changed from call_read_iter() -> call_read_iter_cached() [2]). For the file system I've used XFS and device was null_blk with memory backed so entire file was cached into the DRAM. 
Following are the performance numbers :- IOPS/Bandwidth :- default-page-cache: read: IOPS=1389k, BW=5424MiB/s (5688MB/s) default-page-cache: read: IOPS=1381k, BW=5395MiB/s (5657MB/s) default-page-cache: read: IOPS=1391k, BW=5432MiB/s (5696MB/s) iocb-cached-page-cache: read: IOPS=1403k, BW=5481MiB/s (5747MB/s) iocb-cached-page-cache: read: IOPS=1393k, BW=5439MiB/s (5704MB/s) iocb-cached-page-cache: read: IOPS=1399k, BW=5465MiB/s (5731MB/s) Submission lat :- default-page-cache: slat (usec): min=2, max=1076, avg= 3.71, default-page-cache: slat (usec): min=2, max=489, avg= 3.72, default-page-cache: slat (usec): min=2, max=1078, avg= 3.70, iocb-cached-page-cache: slat (usec): min=2, max=1731, avg= 3.70, iocb-cached-page-cache: slat (usec): min=2, max=2115, avg= 3.69, iocb-cached-page-cache: slat (usec): min=2, max=3055, avg= 3.70, CPU :- default-page-cache: cpu : usr=10.36%, sys=49.29%, ctx=5207722, default-page-cache: cpu : usr=10.49%, sys=49.15%, ctx=5179714, default-page-cache: cpu : usr=10.56%, sys=49.22%, ctx=5215474, iocb-cached-page-cache: cpu : usr=10.26%, sys=49.53%, ctx=5262214, iocb-cached-page-cache: cpu : usr=10.43%, sys=49.35%, ctx=5222433, iocb-cached-page-cache: cpu : usr=10.47%, sys=49.59%, ctx=5247344, Regards, Chaitanya [1] diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c index 0abbefd9925e..02fa272399b6 100644 --- a/drivers/nvme/target/io-cmd-file.c +++ b/drivers/nvme/target/io-cmd-file.c @@ -120,6 +120,9 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos, iocb->ki_filp = req->ns->file; iocb->ki_flags = ki_flags | iocb_flags(req->ns->file); + if (rw == READ) + return call_read_iter_cached(req->ns->file, iocb, &iter); + return call_iter(iocb, &iter); } @@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) if (req->ns->buffered_io) { if (likely(!req->f.mpool_alloc) && - nvmet_file_execute_io(req, IOCB_NOWAIT)) + nvmet_file_execute_io(req, + IOCB_NOWAIT |IOCB_CACHED)) 
return; nvmet_file_submit_buffered_io(req); [2] diff --git a/fs/read_write.c b/fs/read_write.c index bbfa9b12b15e..d4bf2581ff0b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -401,6 +401,41 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t read_write == READ ? MAY_READ : MAY_WRITE); } +ssize_t call_read_iter_cached(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) +{ + ssize_t written, ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) + goto uncached; + if (!file->f_mapping->nrpages) + goto uncached; + + iocb->ki_flags |= IOCB_CACHED; + ret = generic_file_buffered_read(iocb, iter, 0); + iocb->ki_flags &= ~IOCB_CACHED; + + if (likely(!iov_iter_count(iter))) + return ret; + + if (ret == -EAGAIN) { + if (iocb->ki_flags & IOCB_NOWAIT) + return ret; + ret = 0; + } else if (ret < 0) { + return ret; + } + +uncached: + written = ret; + + ret = file->f_op->read_iter(iocb, iter); + if (ret > 0) + written += ret; + + return written ? written : ret; +} + static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c4ab4dc1cd7..3fc8b00cd140 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,7 @@ enum rw_hint { #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) +#define IOCB_CACHED (1 << 8) struct kiocb { struct file *ki_filp; @@ -1901,6 +1902,8 @@ static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio, return file->f_op->read_iter(kio, iter); } +ssize_t call_read_iter_cached(struct file *, struct kiocb *, struct iov_iter *); + static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio, struct iov_iter *iter) { diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb..4ee97941a1f2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2028,7 +2028,7 @@ ssize_t generic_file_buffered_read(struct 
kiocb *iocb, page = find_get_page(mapping, index); if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) goto would_block; page_cache_sync_readahead(mapping, ra, filp, @@ -2038,12 +2038,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, goto no_cached_page; } if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_CACHED) { + put_page(page); + goto out; + } page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); } if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_NOWAIT) { + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) { put_page(page); goto would_block; }
On Fri, Jun 19, 2020 at 07:06:19PM +0000, Chaitanya Kulkarni wrote: > On 6/19/20 8:50 AM, Matthew Wilcox wrote: > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > > The advantage of this patch is that we can avoid taking any filesystem > > lock, as long as the pages being accessed are in the cache (and we don't > > need to readahead any pages into the cache). We also avoid an indirect > > function call in these cases. > > I did a testing with NVMeOF target file backend with buffered I/O > enabled with your patch and setting the IOCB_CACHED for each I/O ored > '|' with IOCB_NOWAIT calling call_read_iter_cached() [1]. > > The name was changed from call_read_iter() -> call_read_iter_cached() [2]). > > For the file system I've used XFS and device was null_blk with memory > backed so entire file was cached into the DRAM. Thanks for testing! Can you elaborate a little more on what the test does? Are there many threads or tasks? What is the I/O path? XFS on an NVMEoF device, talking over loopback to localhost with nullblk as the server? The nullblk device will have all the data in its pagecache, but each XFS file will have an empty pagecache initially. Then it'll be populated by the test, so does the I/O pattern revisit previously accessed data at all? > Following are the performance numbers :- > > IOPS/Bandwidth :- > > default-page-cache: read: IOPS=1389k, BW=5424MiB/s (5688MB/s) > default-page-cache: read: IOPS=1381k, BW=5395MiB/s (5657MB/s) > default-page-cache: read: IOPS=1391k, BW=5432MiB/s (5696MB/s) > iocb-cached-page-cache: read: IOPS=1403k, BW=5481MiB/s (5747MB/s) > iocb-cached-page-cache: read: IOPS=1393k, BW=5439MiB/s (5704MB/s) > iocb-cached-page-cache: read: IOPS=1399k, BW=5465MiB/s (5731MB/s) That doesn't look bad at all ... about 0.7% increase in IOPS. 
> Submission lat :- > > default-page-cache: slat (usec): min=2, max=1076, avg= 3.71, > default-page-cache: slat (usec): min=2, max=489, avg= 3.72, > default-page-cache: slat (usec): min=2, max=1078, avg= 3.70, > iocb-cached-page-cache: slat (usec): min=2, max=1731, avg= 3.70, > iocb-cached-page-cache: slat (usec): min=2, max=2115, avg= 3.69, > iocb-cached-page-cache: slat (usec): min=2, max=3055, avg= 3.70, Average latency unchanged, max latency up a little ... makes sense, since we'll do a little more work in the worst case. > @@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) > > if (req->ns->buffered_io) { > if (likely(!req->f.mpool_alloc) && > - nvmet_file_execute_io(req, IOCB_NOWAIT)) > + nvmet_file_execute_io(req, > + IOCB_NOWAIT |IOCB_CACHED)) > return; > nvmet_file_submit_buffered_io(req); You'll need a fallback path here, right? IOCB_CACHED can get part-way through doing a request, and then need to be finished off after taking the mutex.
Matthew, On 6/19/20 1:12 PM, Matthew Wilcox wrote: > On Fri, Jun 19, 2020 at 07:06:19PM +0000, Chaitanya Kulkarni wrote: >> On 6/19/20 8:50 AM, Matthew Wilcox wrote: >>> This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. >>> The advantage of this patch is that we can avoid taking any filesystem >>> lock, as long as the pages being accessed are in the cache (and we don't >>> need to readahead any pages into the cache). We also avoid an indirect >>> function call in these cases. >> >> I did a testing with NVMeOF target file backend with buffered I/O >> enabled with your patch and setting the IOCB_CACHED for each I/O ored >> '|' with IOCB_NOWAIT calling call_read_iter_cached() [1]. >> >> The name was changed from call_read_iter() -> call_read_iter_cached() [2]). >> >> For the file system I've used XFS and device was null_blk with memory >> backed so entire file was cached into the DRAM. > > Thanks for testing! Can you elaborate a little more on what the test does? > Are there many threads or tasks? What is the I/O path? XFS on an NVMEoF > device, talking over loopback to localhost with nullblk as the server? > > The nullblk device will have all the data in its pagecache, but each XFS > file will have an empty pagecache initially. Then it'll be populated by > the test, so does the I/O pattern revisit previously accessed data at all? > Will share details shortly. >> Following are the performance numbers :- >> >> IOPS/Bandwidth :- >> >> default-page-cache: read: IOPS=1389k, BW=5424MiB/s (5688MB/s) >> default-page-cache: read: IOPS=1381k, BW=5395MiB/s (5657MB/s) >> default-page-cache: read: IOPS=1391k, BW=5432MiB/s (5696MB/s) >> iocb-cached-page-cache: read: IOPS=1403k, BW=5481MiB/s (5747MB/s) >> iocb-cached-page-cache: read: IOPS=1393k, BW=5439MiB/s (5704MB/s) >> iocb-cached-page-cache: read: IOPS=1399k, BW=5465MiB/s (5731MB/s) > > That doesn't look bad at all ... about 0.7% increase in IOPS. 
> >> Submission lat :- >> >> default-page-cache: slat (usec): min=2, max=1076, avg= 3.71, >> default-page-cache: slat (usec): min=2, max=489, avg= 3.72, >> default-page-cache: slat (usec): min=2, max=1078, avg= 3.70, >> iocb-cached-page-cache: slat (usec): min=2, max=1731, avg= 3.70, >> iocb-cached-page-cache: slat (usec): min=2, max=2115, avg= 3.69, >> iocb-cached-page-cache: slat (usec): min=2, max=3055, avg= 3.70, > > Average latency unchanged, max latency up a little ... makes sense, > since we'll do a little more work in the worst case. > >> @@ -264,7 +267,8 @@ static void nvmet_file_execute_rw(struct nvmet_req *req) >> >> if (req->ns->buffered_io) { >> if (likely(!req->f.mpool_alloc) && >> - nvmet_file_execute_io(req, IOCB_NOWAIT)) >> + nvmet_file_execute_io(req, >> + IOCB_NOWAIT |IOCB_CACHED)) >> return; >> nvmet_file_submit_buffered_io(req); > > You'll need a fallback path here, right? IOCB_CACHED can get part-way > through doing a request, and then need to be finished off after taking > the mutex. > > That is why the fallback needs to be handled inside call_read_iter_cached() itself; otherwise callers will have to duplicate that code all over the kernel. Correct me if I'm wrong. For now I'm populating all the data in the cache for fio runs so the 2nd and 3rd runs are coming from the cache. Also, looking at the code now I don't think callers need to set the IOCB_CACHED flag themselves as long as they call call_read_iter_cached(), since it takes care of adding this flag — which is the correct way of abstracting the code, rather than having every caller duplicate this flag all over the kernel. Am I missing something ? I'll run the test again and share more details.
On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote: > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > The advantage of this patch is that we can avoid taking any filesystem > lock, as long as the pages being accessed are in the cache (and we don't > need to readahead any pages into the cache). We also avoid an indirect > function call in these cases. > > I'm sure reusing the name call_read_iter() is the wrong way to go about > this, but renaming all the callers would make this a larger patch. > I'm happy to do it if something like this stands a chance of being > accepted. > > Compared to Andreas' patch, I removed the -ECANCELED return value. > We can happily return 0 from generic_file_buffered_read() and it's less > code to handle that. I bypass the attempt to read from the page cache > for O_DIRECT reads, and for inodes which have no cached pages. Hopefully > this will avoid calling generic_file_buffered_read() for drivers which > implement read_iter() (although I haven't audited them all to check that > > This could go horribly wrong if filesystems rely on doing work in their > ->read_iter implementation (eg checking i_size after acquiring their > lock) instead of keeping the page cache uptodate. On the other hand, > the ->map_pages() method is already called without locks, so filesystems > should already be prepared for this. > XFS is taking i_rwsem lock in read_iter() for a surprising reason: https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/ In that post I claim that ocfs2 and cifs also do some work in read_iter(). I didn't go back to check what, but it sounds like cache coherence among nodes. So filesystems will need to opt-in to this behavior. I wonder if we should make this behavior also opt-in by userspace, for example, RWF_OPPORTUNISTIC_CACHED. 
Because if I am not mistaken, even though this change has a potential to improve many workloads, it may also degrade some workloads in cases where readahead is not properly tuned. Imagine reading a large file and getting only a few pages worth of data read on every syscall. Or did I misunderstand your patch's behavior in that case? Another upside of a user opt-in flag - it can be used to mitigate the objection of XFS developers against changing the "atomic write vs. read" behavior. New flag - no commitment to an XFS specific behavior that nobody knows if any application out there relies on. Thanks, Amir.
On Sat, Jun 20, 2020 at 09:19:37AM +0300, Amir Goldstein wrote: > On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote: > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > > The advantage of this patch is that we can avoid taking any filesystem > > lock, as long as the pages being accessed are in the cache (and we don't > > need to readahead any pages into the cache). We also avoid an indirect > > function call in these cases. > > XFS is taking i_rwsem lock in read_iter() for a surprising reason: > https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/ > In that post I claim that ocfs2 and cifs also do some work in read_iter(). > I didn't go back to check what, but it sounds like cache coherence among > nodes. That's out of date. Here's POSIX-2017: https://pubs.opengroup.org/onlinepubs/9699919799/functions/read.html "I/O is intended to be atomic to ordinary files and pipes and FIFOs. Atomic means that all the bytes from a single operation that started out together end up together, without interleaving from other I/O operations. It is a known attribute of terminals that this is not honored, and terminals are explicitly (and implicitly permanently) excepted, making the behavior unspecified. The behavior for other device types is also left unspecified, but the wording is intended to imply that future standards might choose to specify atomicity (or not)." That _doesn't_ say "a read cannot observe a write in progress". It says "Two writes cannot interleave". Indeed, further down in that section, it says: "Earlier versions of this standard allowed two very different behaviors with regard to the handling of interrupts. In order to minimize the resulting confusion, it was decided that POSIX.1-2017 should support only one of these behaviors. 
Historical practice on AT&T-derived systems was to have read() and write() return -1 and set errno to [EINTR] when interrupted after some, but not all, of the data requested had been transferred. However, the US Department of Commerce FIPS 151-1 and FIPS 151-2 require the historical BSD behavior, in which read() and write() return the number of bytes actually transferred before the interrupt. If -1 is returned when any data is transferred, it is difficult to recover from the error on a seekable device and impossible on a non-seekable device. Most new implementations support this behavior. The behavior required by POSIX.1-2017 is to return the number of bytes transferred." That explicitly allows for a write to be interrupted by a signal and later resumed, allowing a read to observe a half-complete write. > Because if I am not mistaken, even though this change has a potential > to improve many workloads, it may also degrade some workloads in cases > where case readahead is not properly tuned. Imagine reading a large file > and getting only a few pages worth of data read on every syscall. > Or did I misunderstand your patch's behavior in that case? I think you did. If the IOCB_CACHED read hits a readahead page, it returns early. Then call_read_iter() notices the read is not yet complete, and calls ->read_iter() to finish the read. So it's two calls to generic_file_buffered_read() rather than one, but it's still one syscall.
[CC: Dave Chinner, Jan Kara, xfs] On Sat, Jun 20, 2020 at 10:15 PM Matthew Wilcox <willy@infradead.org> wrote: > > On Sat, Jun 20, 2020 at 09:19:37AM +0300, Amir Goldstein wrote: > > On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote: > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > > > The advantage of this patch is that we can avoid taking any filesystem > > > lock, as long as the pages being accessed are in the cache (and we don't > > > need to readahead any pages into the cache). We also avoid an indirect > > > function call in these cases. > > > > XFS is taking i_rwsem lock in read_iter() for a surprising reason: > > https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/ > > In that post I claim that ocfs2 and cifs also do some work in read_iter(). > > I didn't go back to check what, but it sounds like cache coherence among > > nodes. > > That's out of date. Here's POSIX-2017: > > https://pubs.opengroup.org/onlinepubs/9699919799/functions/read.html > > "I/O is intended to be atomic to ordinary files and pipes and > FIFOs. Atomic means that all the bytes from a single operation that > started out together end up together, without interleaving from other > I/O operations. It is a known attribute of terminals that this is not > honored, and terminals are explicitly (and implicitly permanently) > excepted, making the behavior unspecified. The behavior for other > device types is also left unspecified, but the wording is intended to > imply that future standards might choose to specify atomicity (or not)." > > That _doesn't_ say "a read cannot observe a write in progress". It says > "Two writes cannot interleave". Indeed, further down in that section, it says: > > "Earlier versions of this standard allowed two very different behaviors > with regard to the handling of interrupts. 
In order to minimize the > resulting confusion, it was decided that POSIX.1-2017 should support > only one of these behaviors. Historical practice on AT&T-derived systems > was to have read() and write() return -1 and set errno to [EINTR] when > interrupted after some, but not all, of the data requested had been > transferred. However, the US Department of Commerce FIPS 151-1 and FIPS > 151-2 require the historical BSD behavior, in which read() and write() > return the number of bytes actually transferred before the interrupt. If > -1 is returned when any data is transferred, it is difficult to recover > from the error on a seekable device and impossible on a non-seekable > device. Most new implementations support this behavior. The behavior > required by POSIX.1-2017 is to return the number of bytes transferred." > > That explicitly allows for a write to be interrupted by a signal and > later resumed, allowing a read to observe a half-complete write. > Tell that to Dave Chinner (cc). I too, find it surprising that XFS developers choose to "not regress" a behavior that is XFS specific and there is no proof or even clues of any application that could rely on such behavior. While the price that is being paid by all real world applications that do mixed random rw workload is very much real and very much significant. The original discussion on the original post quickly steered towards the behavior change of rwsem [1], which you Matthew also participated in. The reason for taking the rwsem lock in the first place was never seriously challenged. I posted a followup patch that fixes the performance issue without breaking the "atomic rw" behavior [2] by calling generic_file_read_iter() once without i_rwsem to pre-populate the page cache. Dave had some technical concerns about this patch, regarding racing with truncate_pagecache_range(), which later led to a fix by Jan Kara to solve a readahead(2) vs. hole punch race [3]. 
At the time, Jan Kara wrote [3]: "...other filesystems need similar protections but e.g. in case of ext4 it isn't so simple without seriously regressing mixed rw workload performance so I'm pushing just xfs fix at this moment which is simple." And w.r.t solving the race without taking i_rwsem: "...So I have an idea how it could be solved: Change calling convention for ->readpage() so that it gets called without page locked and take i_mmap_sem there (and in ->readpages()) to protect from the race..." My question to both Jan and Matthew is - does the new aops ->readahead() API make things any better in that regard? Will it make it easier for us to address the readahead vs. hole punch race without having to take i_rwsem before readahead()? Thanks, Amir. [1] https://lore.kernel.org/linux-xfs/20190325154731.GT1183@magnolia/ [2] https://lore.kernel.org/linux-xfs/20190404165737.30889-1-amir73il@gmail.com/ [3] https://lore.kernel.org/linux-xfs/20200120165830.GB28285@quack2.suse.cz/
On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote: > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > The advantage of this patch is that we can avoid taking any filesystem > lock, as long as the pages being accessed are in the cache (and we don't > need to readahead any pages into the cache). We also avoid an indirect > function call in these cases. What does this micro-optimisation actually gain us except for more complexity in the IO path? i.e. if a filesystem lock has such massive overhead that it slows down the cached readahead path in production workloads, then that's something the filesystem needs to address, not unconditionally bypass the filesystem before the IO gets anywhere near it. > This could go horribly wrong if filesystems rely on doing work in their > ->read_iter implementation (eg checking i_size after acquiring their > lock) instead of keeping the page cache uptodate. On the other hand, > the ->map_pages() method is already called without locks, so filesystems > should already be prepared for this. Oh, gawd, we have *yet another* unlocked page cache read path that can race with invalidations during fallocate() operations? /me goes and looks at filemap_map_pages() Yup, filemap_map_pages() is only safe against invalidations beyond EOF (i.e. truncate) and can still race with invalidations within EOF. So, yes, I'm right in that this path is not safe to run without filesystem locking to serialise the IO against fallocate()... Darrick, it looks like we need to wrap filemap_map_pages() with the XFS_MMAPLOCK_SHARED like we do for all the other page fault paths that can call into the IO path. > Arguably we could do something similar for writes. I'm a little more > scared of that patch since filesystems are more likely to want to do > things to keep their files in sync for writes. Please, no. 
We can have uptodate cached pages over holes, unwritten extents, shared extents, etc but they all require filesystem level serialisation and space/block allocation work *before* we copy data into the page. i.e. if allocation/space reservation fails, we need to abort before changing data. Cheers, Dave.
On Sat, Jun 20, 2020 at 12:15:21PM -0700, Matthew Wilcox wrote: > On Sat, Jun 20, 2020 at 09:19:37AM +0300, Amir Goldstein wrote: > > On Fri, Jun 19, 2020 at 6:52 PM Matthew Wilcox <willy@infradead.org> wrote: > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > > > The advantage of this patch is that we can avoid taking any filesystem > > > lock, as long as the pages being accessed are in the cache (and we don't > > > need to readahead any pages into the cache). We also avoid an indirect > > > function call in these cases. > > > > XFS is taking i_rwsem lock in read_iter() for a surprising reason: > > https://lore.kernel.org/linux-xfs/CAOQ4uxjpqDQP2AKA8Hrt4jDC65cTo4QdYDOKFE-C3cLxBBa6pQ@mail.gmail.com/ > > In that post I claim that ocfs2 and cifs also do some work in read_iter(). > > I didn't go back to check what, but it sounds like cache coherence among > > nodes. > > That's out of date. Here's POSIX-2017: > > https://pubs.opengroup.org/onlinepubs/9699919799/functions/read.html > > "I/O is intended to be atomic to ordinary files and pipes and > FIFOs. Atomic means that all the bytes from a single operation that > started out together end up together, without interleaving from other > I/O operations. It is a known attribute of terminals that this is not > honored, and terminals are explicitly (and implicitly permanently) > excepted, making the behavior unspecified. The behavior for other > device types is also left unspecified, but the wording is intended to > imply that future standards might choose to specify atomicity (or not)." > > That _doesn't_ say "a read cannot observe a write in progress". It says > "Two writes cannot interleave". Indeed, further down in that section, it says: Nope, it says "... without interleaving from other I/O operations". That means read() needs to be atomic w.r.t truncate, hole punching, extent zeroing, etc, not just other write() syscalls. 
Really, though, I'm not going to get drawn into a language lawyering argument here. We've discussed this before, and it's pretty clear the language supports both arguments in one way or another. And that means we are not going to change behaviour that XFS has provided for 27 years now. Last time this came up, I said: "XFS was designed with the intent that buffered writes are atomic w.r.t. to all other file accesses." Christoph said: "Downgrading these long standing guarantees is simply not an option" Darrick: "I don't like the idea of adding a O_BROKENLOCKINGPONIES flag" Nothing has changed since this was last discussed. Well, except for the fact that since then I've seen the source code to some 20+ year old enterprise applications that have been ported to Linux and that has made me even more certain that we need to maintain XFS's existing behaviour.... Cheers, Dave.
On Mon, Jun 22, 2020 at 2:32 AM Dave Chinner <david@fromorbit.com> wrote: > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote: > > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > > The advantage of this patch is that we can avoid taking any filesystem > > lock, as long as the pages being accessed are in the cache (and we don't > > need to readahead any pages into the cache). We also avoid an indirect > > function call in these cases. > > What does this micro-optimisation actually gain us except for more > complexity in the IO path? > > i.e. if a filesystem lock has such massive overhead that it slows > down the cached readahead path in production workloads, then that's > something the filesystem needs to address, not unconditionally > bypass the filesystem before the IO gets anywhere near it. I'm fine with not moving that functionality into the VFS. The problem I have in gfs2 is that taking glocks is really expensive. Part of that overhead is accidental, but we definitely won't be able to fix it in the short term. So something like the IOCB_CACHED flag that prevents generic_file_read_iter from issuing readahead I/O would save the day for us. Does that idea stand a chance? We could theoretically also create a copy of generic_file_buffered_read in gfs2 and strip out the I/O parts, but that would be very messy. Thanks, Andreas
On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote: > I'm fine with not moving that functionality into the VFS. The problem > I have in gfs2 is that taking glocks is really expensive. Part of that > overhead is accidental, but we definitely won't be able to fix it in > the short term. So something like the IOCB_CACHED flag that prevents > generic_file_read_iter from issuing readahead I/O would save the day > for us. Does that idea stand a chance? For the short-term fix, is switching to a trylock in gfs2_readahead() acceptable? diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 72c9560f4467..6ccd478c81ff 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -600,7 +600,7 @@ static void gfs2_readahead(struct readahead_control *rac) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_TRY, &gh); if (gfs2_glock_nq(&gh)) goto out_uninit; if (!gfs2_is_stuffed(ip))
On Mon, Jun 22, 2020 at 10:32:15AM +1000, Dave Chinner wrote: > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote: > > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > > The advantage of this patch is that we can avoid taking any filesystem > > lock, as long as the pages being accessed are in the cache (and we don't > > need to readahead any pages into the cache). We also avoid an indirect > > function call in these cases. > > What does this micro-optimisation actually gain us except for more > complexity in the IO path? > > i.e. if a filesystem lock has such massive overhead that it slows > down the cached readahead path in production workloads, then that's > something the filesystem needs to address, not unconditionally > bypass the filesystem before the IO gets anywhere near it. You're been talking about adding a range lock to XFS for a while now. I remain quite sceptical that range locks are a good idea; they have not worked out well as a replacement for the mmap_sem, although the workload for the mmap_sem is quite different and they may yet show promise for the XFS iolock. There are production workloads that do not work well on top of a single file on an XFS filesystem. For example, using an XFS file in a host as the backing store for a guest block device. People tend to work around that kind of performance bug rather than report it. Do you agree that the guarantees that XFS currently supplies regarding locked operation will be maintained if the I/O is contained within a single page and the mutex is not taken? ie add this check to the original patch: if (iocb->ki_pos / PAGE_SIZE != (iocb->ki_pos + iov_iter_count(iter) - 1) / PAGE_SIZE) goto uncached; I think that gets me almost everything I want. Small I/Os are going to notice the pain of the mutex more than large I/Os.
On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote: > On Mon, Jun 22, 2020 at 2:32 AM Dave Chinner <david@fromorbit.com> wrote: > > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote: > > > > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > > > The advantage of this patch is that we can avoid taking any filesystem > > > lock, as long as the pages being accessed are in the cache (and we don't > > > need to readahead any pages into the cache). We also avoid an indirect > > > function call in these cases. > > > > What does this micro-optimisation actually gain us except for more > > complexity in the IO path? > > > > i.e. if a filesystem lock has such massive overhead that it slows > > down the cached readahead path in production workloads, then that's > > something the filesystem needs to address, not unconditionally > > bypass the filesystem before the IO gets anywhere near it. > > I'm fine with not moving that functionality into the VFS. The problem > I have in gfs2 is that taking glocks is really expensive. Part of that > overhead is accidental, but we definitely won't be able to fix it in > the short term. So something like the IOCB_CACHED flag that prevents > generic_file_read_iter from issuing readahead I/O would save the day > for us. Does that idea stand a chance? I have no problem with a "NOREADAHEAD" flag being passed to generic_file_read_iter(). It's not a "already cached" flag though, it's a "don't start any IO" directive, just like the NOWAIT flag is a "don't block on locks or IO in progress" directive and not an "already cached" flag. Readahead is something we should be doing, unless a filesystem has a very good reason not to, such as the gfs2 locking case here... Cheers, Dave.
On Mon, Jun 22, 2020 at 08:18:57PM +0100, Matthew Wilcox wrote: > On Mon, Jun 22, 2020 at 10:32:15AM +1000, Dave Chinner wrote: > > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote: > > > > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > > > The advantage of this patch is that we can avoid taking any filesystem > > > lock, as long as the pages being accessed are in the cache (and we don't > > > need to readahead any pages into the cache). We also avoid an indirect > > > function call in these cases. > > > > What does this micro-optimisation actually gain us except for more > > complexity in the IO path? > > > > i.e. if a filesystem lock has such massive overhead that it slows > > down the cached readahead path in production workloads, then that's > > something the filesystem needs to address, not unconditionally > > bypass the filesystem before the IO gets anywhere near it. > > You've been talking about adding a range lock to XFS for a while now. I don't see what that has to do with this patch. > I remain quite sceptical that range locks are a good idea; they have not > worked out well as a replacement for the mmap_sem, although the workload > for the mmap_sem is quite different and they may yet show promise for > the XFS iolock. <shrug> That was a really poor implementation of a range lock. It had no concurrency to speak of, because the tracking tree required a spinlock to be taken for every lock or unlock the range lock performed. Hence it had an expensive critical section that could not scale past the number of ops a single CPU could perform on that tree. IOWs, it topped out at about 150k lock cycles a second with 2-3 concurrent AIO+DIO threads, and only went slower as the number of concurrent IO submitters went up. 
So, yeah, if you are going to talk about range locks, you need to forget about what was tried on the mmap_sem because nobody actually scalability tested the lock implementation by itself and it turned out to be total crap.... > There are production workloads that do not work well on top of a single > file on an XFS filesystem. For example, using an XFS file in a host as > the backing store for a guest block device. People tend to work around > that kind of performance bug rather than report it. *cough* AIO+DIO *cough* You may not like that answer, but anyone who cares about IO performance, especially single file IO performance, is using AIO+DIO. Buffered IO for VM image files in production environments tends to be the exception, not the norm, because caching is done in the guest by the guest page cache. Double caching IO data is generally considered a waste of resources that could otherwise be sold to customers. > Do you agree that the guarantees that XFS currently supplies regarding > locked operation will be maintained if the I/O is contained within a > single page and the mutex is not taken? Not at first glance because block size < page size configurations exist and hence filesystems might be punching out extents from a sub-page range.... > ie add this check to the original > patch: > > if (iocb->ki_pos / PAGE_SIZE != > (iocb->ki_pos + iov_iter_count(iter) - 1) / PAGE_SIZE) > goto uncached; > > I think that gets me almost everything I want. Small I/Os are going to > notice the pain of the mutex more than large I/Os. Exactly what are you trying to optimise, Willy? You haven't explained to anyone what workload needs these micro-optimisations, and without understanding why you want to cut the filesystems out of the readahead path, I can't suggest alternative solutions... Cheers, Dave.
On Tue, Jun 23, 2020 at 2:52 AM Dave Chinner <david@fromorbit.com> wrote: > On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote: > > On Mon, Jun 22, 2020 at 2:32 AM Dave Chinner <david@fromorbit.com> wrote: > > > On Fri, Jun 19, 2020 at 08:50:36AM -0700, Matthew Wilcox wrote: > > > > > > > > This patch lifts the IOCB_CACHED idea expressed by Andreas to the VFS. > > > > The advantage of this patch is that we can avoid taking any filesystem > > > > lock, as long as the pages being accessed are in the cache (and we don't > > > > need to readahead any pages into the cache). We also avoid an indirect > > > > function call in these cases. > > > > > > What does this micro-optimisation actually gain us except for more > > > complexity in the IO path? > > > > > > i.e. if a filesystem lock has such massive overhead that it slows > > > down the cached readahead path in production workloads, then that's > > > something the filesystem needs to address, not unconditionally > > > bypass the filesystem before the IO gets anywhere near it. > > > > I'm fine with not moving that functionality into the VFS. The problem > > I have in gfs2 is that taking glocks is really expensive. Part of that > > overhead is accidental, but we definitely won't be able to fix it in > > the short term. So something like the IOCB_CACHED flag that prevents > > generic_file_read_iter from issuing readahead I/O would save the day > > for us. Does that idea stand a chance? > > I have no problem with a "NOREADAHEAD" flag being passed to > generic_file_read_iter(). It's not a "already cached" flag though, > it's a "don't start any IO" directive, just like the NOWAIT flag is > a "don't block on locks or IO in progress" directive and not an > "already cached" flag. Readahead is something we should be doing, > unless a filesystem has a very good reason not to, such as the gfs2 > locking case here... The requests coming in can have the IOCB_NOWAIT flag set or cleared. 
The idea was to have an additional flag that implies IOCB_NOWAIT so that you can do: iocb->ki_flags |= IOCB_NOIO; generic_file_read_iter() if ("failed because of IOCB_NOIO") { if ("failed because of IOCB_NOWAIT") return -EAGAIN; iocb->ki_flags &= ~IOCB_NOIO; "locking" generic_file_read_iter() "unlocking" } without having to save iocb->ki_flags. The alternative would be: int flags = iocb->ki_flags; iocb->ki_flags |= IOCB_NOIO | IOCB_NOWAIT; ret = generic_file_read_iter() if ("failed because of IOCB_NOIO or IOCB_NOWAIT") { if ("failed because of IOCB_NOWAIT" && (flags & IOCB_NOWAIT)) return -EAGAIN; iocb->ki_flags &= ~IOCB_NOIO; "locking" generic_file_read_iter() "unlocking" } Andreas
On Mon, Jun 22, 2020 at 8:13 PM Matthew Wilcox <willy@infradead.org> wrote: > On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote: > > I'm fine with not moving that functionality into the VFS. The problem > > I have in gfs2 is that taking glocks is really expensive. Part of that > > overhead is accidental, but we definitely won't be able to fix it in > > the short term. So something like the IOCB_CACHED flag that prevents > > generic_file_read_iter from issuing readahead I/O would save the day > > for us. Does that idea stand a chance? > > For the short-term fix, is switching to a trylock in gfs2_readahead() > acceptable? Well, it's the only thing we can do for now, right? > diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c > index 72c9560f4467..6ccd478c81ff 100644 > --- a/fs/gfs2/aops.c > +++ b/fs/gfs2/aops.c > @@ -600,7 +600,7 @@ static void gfs2_readahead(struct readahead_control *rac) > struct gfs2_inode *ip = GFS2_I(inode); > struct gfs2_holder gh; > > - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); > + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_TRY, &gh); > if (gfs2_glock_nq(&gh)) > goto out_uninit; > if (!gfs2_is_stuffed(ip)) Thanks, Andreas
On Wed, Jun 24, 2020 at 2:35 PM Andreas Gruenbacher <agruenba@redhat.com> wrote: > On Mon, Jun 22, 2020 at 8:13 PM Matthew Wilcox <willy@infradead.org> wrote: > > On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote: > > > I'm fine with not moving that functionality into the VFS. The problem > > > I have in gfs2 is that taking glocks is really expensive. Part of that > > > overhead is accidental, but we definitely won't be able to fix it in > > > the short term. So something like the IOCB_CACHED flag that prevents > > > generic_file_read_iter from issuing readahead I/O would save the day > > > for us. Does that idea stand a chance? > > > > For the short-term fix, is switching to a trylock in gfs2_readahead() > > acceptable? > > Well, it's the only thing we can do for now, right? It turns out that gfs2 can still deadlock with a trylock in gfs2_readahead, just differently: in this instance, gfs2_glock_nq will call inode_dio_wait. When there is pending direct I/O, we'll end up waiting for iomap_dio_complete, which will call invalidate_inode_pages2_range, which will try to lock the pages already locked for gfs2_readahead. This late in the 5.8 release cycle, I'd like to propose converting gfs2 back to use mpage_readpages. This requires reinstating mpage_readpages, but it's otherwise relatively trivial. We can then introduce an IOCB_CACHED or equivalent flag, fix the locking order in gfs2, convert gfs2 to mpage_readahead, and finally remove mpage_readpages in 5.9. I'll post a patch queue that does this for comment. Thanks, Andreas
On Thu, Jul 02, 2020 at 05:16:43PM +0200, Andreas Gruenbacher wrote: > On Wed, Jun 24, 2020 at 2:35 PM Andreas Gruenbacher <agruenba@redhat.com> wrote: > > On Mon, Jun 22, 2020 at 8:13 PM Matthew Wilcox <willy@infradead.org> wrote: > > > On Mon, Jun 22, 2020 at 04:35:05PM +0200, Andreas Gruenbacher wrote: > > > > I'm fine with not moving that functionality into the VFS. The problem > > > > I have in gfs2 is that taking glocks is really expensive. Part of that > > > > overhead is accidental, but we definitely won't be able to fix it in > > > > the short term. So something like the IOCB_CACHED flag that prevents > > > > generic_file_read_iter from issuing readahead I/O would save the day > > > > for us. Does that idea stand a chance? > > > > > > For the short-term fix, is switching to a trylock in gfs2_readahead() > > > acceptable? > > > > Well, it's the only thing we can do for now, right? > > It turns out that gfs2 can still deadlock with a trylock in > gfs2_readahead, just differently: in this instance, gfs2_glock_nq will > call inode_dio_wait. When there is pending direct I/O, we'll end up > waiting for iomap_dio_complete, which will call > invalidate_inode_pages2_range, which will try to lock the pages > already locked for gfs2_readahead. That seems like a bug in trylock. If I trylock something I'm not expecting it to wait; i'm expecting it to fail because it would have to wait. ie something like this: diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c84887769b5a..97ca8f5ed22b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -470,7 +470,10 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } - if (gh->gh_state != LM_ST_DEFERRED) + if (gh->gh_flags & LM_FLAG_TRY) { + if (atomic_read(&ip->i_inode.i_dio_count)) + return -EWOULDBLOCK; + } else if (gh->gh_state != LM_ST_DEFERRED) inode_dio_wait(&ip->i_inode); if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && ... 
but probably not exactly that because I didn't try to figure out the calling conventions or whether I should set some state in the gfs2_holder. > This late in the 5.8 release cycle, I'd like to propose converting > gfs2 back to use mpage_readpages. This requires reinstating > mpage_readpages, but it's otherwise relatively trivial. > We can then introduce an IOCB_CACHED or equivalent flag, fix the > locking order in gfs2, convert gfs2 to mpage_readahead, and finally > remove mage_readpages in 5.9. I would rather not do that.
diff --git a/fs/read_write.c b/fs/read_write.c index bbfa9b12b15e..7b899538d3c0 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -401,6 +401,41 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t read_write == READ ? MAY_READ : MAY_WRITE); } +ssize_t call_read_iter(struct file *file, struct kiocb *iocb, + struct iov_iter *iter) +{ + ssize_t written, ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) + goto uncached; + if (!file->f_mapping->nrpages) + goto uncached; + + iocb->ki_flags |= IOCB_CACHED; + ret = generic_file_buffered_read(iocb, iter, 0); + iocb->ki_flags &= ~IOCB_CACHED; + + if (likely(!iov_iter_count(iter))) + return ret; + + if (ret == -EAGAIN) { + if (iocb->ki_flags & IOCB_NOWAIT) + return ret; + ret = 0; + } else if (ret < 0) { + return ret; + } + +uncached: + written = ret; + + ret = file->f_op->read_iter(iocb, iter); + if (ret > 0) + written += ret; + + return written ? written : ret; +} + static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6c4ab4dc1cd7..0985773feffd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -315,6 +315,7 @@ enum rw_hint { #define IOCB_SYNC (1 << 5) #define IOCB_WRITE (1 << 6) #define IOCB_NOWAIT (1 << 7) +#define IOCB_CACHED (1 << 8) struct kiocb { struct file *ki_filp; @@ -1895,11 +1896,7 @@ struct inode_operations { int (*set_acl)(struct inode *, struct posix_acl *, int); } ____cacheline_aligned; -static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio, - struct iov_iter *iter) -{ - return file->f_op->read_iter(kio, iter); -} +ssize_t call_read_iter(struct file *, struct kiocb *, struct iov_iter *); static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio, struct iov_iter *iter) diff --git a/mm/filemap.c b/mm/filemap.c index f0ae9a6308cb..4ee97941a1f2 100644 --- a/mm/filemap.c +++ 
b/mm/filemap.c @@ -2028,7 +2028,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, page = find_get_page(mapping, index); if (!page) { - if (iocb->ki_flags & IOCB_NOWAIT) + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) goto would_block; page_cache_sync_readahead(mapping, ra, filp, @@ -2038,12 +2038,16 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, goto no_cached_page; } if (PageReadahead(page)) { + if (iocb->ki_flags & IOCB_CACHED) { + put_page(page); + goto out; + } page_cache_async_readahead(mapping, ra, filp, page, index, last_index - index); } if (!PageUptodate(page)) { - if (iocb->ki_flags & IOCB_NOWAIT) { + if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_CACHED)) { put_page(page); goto would_block; }