@@ -257,8 +257,11 @@ int invalidate_inode_page(struct page *page)
struct address_space *mapping = page_mapping(page);
if (!mapping)
return 0;
- if (PageDirty(page) || PageWriteback(page))
+ if (PageDirty(page) || PageWriteback(page)) {
+ trace_printk("ino 0x%lx page %p, offset 0x%lx\n",
+ mapping->host->i_ino, page, page->index * PAGE_SIZE);
return 0;
+ }
if (page_mapped(page))
return 0;
return invalidate_complete_page(mapping, page);
And that alone, without even enabling tracepoints, made the
corruption go completely away. So I suspect a page state race
condition and look at POSIX_FADV_DONTNEED, which fio is issuing
before running its verification reads. First thing that does:
if (!inode_write_congested(mapping->host))
__filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
It starts async writeback of the dirty pages. There's 256MB of dirty
pages on these inodes, and iomap tracing indicates the entire 256MB
immediately runs through the trace_iomap_writepage() tracepoint.
i.e. every page goes Dirty -> Writeback and is submitted for async
IO.
Then the POSIX_FADV_DONTNEED code goes and runs
invalidate_mapping_pages(), which ends up try-locking each page and
then running invalidate_inode_page() on the page, which is where the
trace debug I put in on pages under writeback gets hit. So if
changing the invalidation code for pages under writeback makes the
problem go away, then stopping invalidate_mapping_pages() from
racing with page writeback should make the problem go away, too.
This does indeed make the corruption go away:
@@ -109,9 +109,8 @@ int generic_fadvise(struct file *file, loff_t offset,
loff_t len, int advice)
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
if (!inode_write_congested(mapping->host))
- __filemap_fdatawrite_range(mapping, offset, endbyte,
- WB_SYNC_NONE);
+ filemap_write_and_wait_range(mapping, offset, endbyte);
/*