@@ -1134,6 +1134,146 @@ xfs_buffered_write_delalloc_punch(
end_fsb - start_fsb);
}
+/*
+ * Scan the data range passed to us for dirty page cache folios. If we find a
+ * dirty folio, punch out the preceding range and update the offset from which
+ * the next punch will start.
+ *
+ * We can punch out clean folios because they either contain data that has
+ * been written back - in which case the delalloc punch over that range is a
+ * no-op - or they have been instantiated by read faults, in which case they
+ * contain zeroes and we can remove the delalloc backing range and any new
+ * writes to those folios will do the normal hole filling operation...
+ *
+ * This makes the logic simple: we only need to keep the delalloc extents
+ * over the dirty ranges of the page cache.
+ */
+static int
+xfs_buffered_write_delalloc_scan(
+ struct inode *inode,
+ loff_t *punch_start_byte,
+ loff_t start_byte,
+ loff_t end_byte)
+{
+ loff_t offset = start_byte;
+
+ while (offset < end_byte) {
+ struct folio *folio;
+
+ /* grab the locked folio at this offset, if one is cached */
+ folio = filemap_lock_folio(inode->i_mapping, offset >> PAGE_SHIFT);
+ if (!folio) {
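+ /* no folio here, advance to the start of the next page and keep scanning */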
+ offset = ALIGN_DOWN(offset, PAGE_SIZE) + PAGE_SIZE;
+ continue;
+ }
+
+ /* if dirty, punch up to offset */
+ if (folio_test_dirty(folio)) {
+ if (offset > *punch_start_byte) {
+ int error;
+
+ error = xfs_buffered_write_delalloc_punch(inode,
+ *punch_start_byte, offset);
+ if (error) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return error;
+ }
+ }
+
+ /*
+ * Make sure the next punch start is correctly bound to
+ * the end of this data range, not the end of the folio.
+ */
+ *punch_start_byte = min_t(loff_t, end_byte,
+ (loff_t)folio_next_index(folio) << PAGE_SHIFT);
+ }
+
+ /* move offset to start of next folio in range */
+ offset = (loff_t)folio_next_index(folio) << PAGE_SHIFT;
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ return 0;
+}
+
+/*
+ * Punch out all the delalloc blocks in the range given except for those that
+ * have dirty data still pending in the page cache - those are going to be
+ * written and so must still retain the delalloc backing for writeback.
+ *
+ * As we are scanning the page cache for data, we don't need to reinvent the
+ * wheel - mapping_seek_hole_data() does exactly what we need to identify the
+ * start and end of data ranges correctly even for sub-folio block sizes. This
+ * byte range based iteration is especially convenient because it means we
+ * don't have to care about variable size folios, nor where the start or end
+ * of the data range lies within a folio, whether both lie within the same
+ * folio, or even whether there are multiple discontiguous data ranges within
+ * the folio.
+ */
+static int
+xfs_buffered_write_delalloc_release(
+ struct inode *inode,
+ loff_t start_byte,
+ loff_t end_byte)
+{
+ loff_t punch_start_byte = start_byte;
+ int error = 0;
+
+ /*
+ * Lock the mapping to avoid races with page faults re-instantiating
+ * folios and dirtying them via ->page_mkwrite whilst we walk the
+ * cache and perform delalloc extent removal. Failing to do this can
+ * leave dirty pages with no space reservation in the cache.
+ */
+ filemap_invalidate_lock(inode->i_mapping);
+ while (start_byte < end_byte) {
+ loff_t data_end;
+
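+ /* seek to the start of the next range of data in the page cache */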
+ start_byte = mapping_seek_hole_data(inode->i_mapping,
+ start_byte, end_byte, SEEK_DATA);
+ /*
+ * If there is no more data to scan, all that is left is to
+ * punch out the remaining range.
+ */
+ if (start_byte == -ENXIO || start_byte == end_byte)
+ break;
+ if (start_byte < 0) {
+ error = start_byte;
+ goto out_unlock;
+ }
+ ASSERT(start_byte >= punch_start_byte);
+ ASSERT(start_byte < end_byte);
+
+ /*
+ * We find the end of this contiguous cached data range by
+ * seeking from start_byte to the beginning of the next hole.
+ */
+ data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
+ end_byte, SEEK_HOLE);
+ if (data_end < 0) {
+ error = data_end;
+ goto out_unlock;
+ }
+ ASSERT(data_end > start_byte);
+ ASSERT(data_end <= end_byte);
+
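+ /*
+ * Walk the folios over this data range, retaining the delalloc
+ * backing under dirty folios and punching out the clean ranges
+ * between them.
+ */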
+ error = xfs_buffered_write_delalloc_scan(inode,
+ &punch_start_byte, start_byte, data_end);
+ if (error)
+ goto out_unlock;
+
+ /* The next data search starts at the end of this one. */
+ start_byte = data_end;
+ }
+
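+ /*
+ * Everything past the last dirty data we retained (or the whole
+ * range, if nothing in it was dirty) still has its delalloc
+ * backing to be punched out.
+ */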
+ if (punch_start_byte < end_byte)
+ error = xfs_buffered_write_delalloc_punch(inode,
+ punch_start_byte, end_byte);
+out_unlock:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return error;
+}
+
static int
xfs_buffered_write_iomap_end(
struct inode *inode,
@@ -1179,16 +1319,7 @@ xfs_buffered_write_iomap_end(
if (start_byte >= end_byte)
return 0;
- /*
- * Lock the mapping to avoid races with page faults re-instantiating
- * folios and dirtying them via ->page_mkwrite between the page cache
- * truncation and the delalloc extent removal. Failing to do this can
- * leave dirty pages with no space reservation in the cache.
- */
- filemap_invalidate_lock(inode->i_mapping);
- truncate_pagecache_range(inode, start_byte, end_byte - 1);
- error = xfs_buffered_write_delalloc_punch(inode, start_byte, end_byte);
- filemap_invalidate_unlock(inode->i_mapping);
+ error = xfs_buffered_write_delalloc_release(inode, start_byte, end_byte);
if (error && !xfs_is_shutdown(mp)) {
xfs_alert(mp, "%s: unable to clean up ino 0x%llx",
__func__, XFS_I(inode)->i_ino);
@@ -2925,6 +2925,7 @@ loff_t mapping_seek_hole_data(struct address_space *mapping, loff_t start,
return end;
return start;
}
+EXPORT_SYMBOL(mapping_seek_hole_data);
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)