@@ -159,8 +159,10 @@ xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
+ array.o \
bitmap.o \
repair.o \
+ xfile.o \
)
endif
endif
new file mode 100644
@@ -0,0 +1,647 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/array.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+
+/*
+ * XFS Fixed-Size Big Memory Array
+ * ===============================
+ * The file-backed memory array uses a memfd "file" to store large numbers of
+ * fixed-size records in memory that can be paged out. This puts less stress
+ * on the memory reclaim algorithms because memfd file pages are not pinned and
+ * can be paged out; however, array access is less direct than would be in a
+ * regular memory array. Access to the array is performed via indexed get and
+ * put methods, and an append method is provided for convenience. Array
+ * elements can be set to all zeroes, which means that the entry is NULL and
+ * will be skipped during iteration.
+ */
+
+#define XFBMA_MAX_TEMP (2)
+
+/*
+ * Pointer to temp space. Because we can't access the memfd data directly, we
+ * allocate a small amount of memory on the end of the xfbma to buffer array
+ * items when we need space to store values temporarily.
+ */
+static inline void *
+xfbma_temp(
+ struct xfbma *array,
+ unsigned int nr)
+{
+ ASSERT(nr < XFBMA_MAX_TEMP);
+
+ return ((char *)(array + 1)) + (nr * array->obj_size);
+}
+
+/* Initialize a big memory array. */
+struct xfbma *
+xfbma_init(
+ size_t obj_size)
+{
+ struct xfbma *array;
+ struct file *filp;
+ int error;
+
+ filp = xfile_create("big array");
+ if (!filp)
+ return ERR_PTR(-ENOMEM);
+ if (IS_ERR(filp))
+ return ERR_CAST(filp);
+
+ error = -ENOMEM;
+ array = kmem_alloc(sizeof(struct xfbma) + (XFBMA_MAX_TEMP * obj_size),
+ KM_NOFS | KM_MAYFAIL);
+ if (!array)
+ goto out_filp;
+
+ array->filp = filp;
+ array->obj_size = obj_size;
+ array->nr = 0;
+ return array;
+out_filp:
+ fput(filp);
+ return ERR_PTR(error);
+}
+
+void
+xfbma_destroy(
+ struct xfbma *array)
+{
+ xfile_destroy(array->filp);
+ kmem_free(array);
+}
+
+/* Compute offset of array element. */
+static inline loff_t
+xfbma_offset(
+ struct xfbma *array,
+ uint64_t nr)
+{
+ if (nr >= array->nr)
+ return -1;
+ return nr * array->obj_size;
+}
+
+/* Get an element from the array. */
+int
+xfbma_get(
+ struct xfbma *array,
+ uint64_t nr,
+ void *ptr)
+{
+ loff_t pos = xfbma_offset(array, nr);
+
+ if (pos < 0) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+
+ return xfile_io(array->filp, XFILE_IO_READ, &pos, ptr, array->obj_size);
+}
+
+/* Put an element in the array. */
+int
+xfbma_set(
+ struct xfbma *array,
+ uint64_t nr,
+ void *ptr)
+{
+ loff_t pos = xfbma_offset(array, nr);
+
+ if (pos < 0) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+
+ return xfile_io(array->filp, XFILE_IO_WRITE, &pos, ptr,
+ array->obj_size);
+}
+
+/* Is this array element NULL? */
+bool
+xfbma_is_null(
+ struct xfbma *array,
+ void *ptr)
+{
+ return !memchr_inv(ptr, 0, array->obj_size);
+}
+
+/* Put an element anywhere in the array that isn't NULL. */
+int
+xfbma_insert_anywhere(
+ struct xfbma *array,
+ void *ptr)
+{
+ void *temp = xfbma_temp(array, 0);
+ uint64_t i;
+ int error;
+
+ /* Find a null slot to put it in. */
+ for (i = 0; i < array->nr; i++) {
+ error = xfbma_get(array, i, temp);
+ if (error || !xfbma_is_null(array, temp))
+ continue;
+ return xfbma_set(array, i, ptr);
+ }
+
+ /* No null slots, just dump it on the end. */
+ return xfbma_append(array, ptr);
+}
+
+/* NULL an element in the array. */
+int
+xfbma_nullify(
+ struct xfbma *array,
+ uint64_t nr)
+{
+ void *temp = xfbma_temp(array, 0);
+ loff_t pos = xfbma_offset(array, nr);
+
+ if (pos < 0) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+
+ memset(temp, 0, array->obj_size);
+ return xfile_io(array->filp, XFILE_IO_WRITE, &pos, temp,
+ array->obj_size);
+}
+
+/* Append an element to the array. */
+int
+xfbma_append(
+ struct xfbma *array,
+ void *ptr)
+{
+ loff_t pos = array->obj_size * array->nr;
+ int error;
+
+ if (pos < 0) {
+ ASSERT(0);
+ return -ENODATA;
+ }
+
+ error = xfile_io(array->filp, XFILE_IO_WRITE, &pos, ptr,
+ array->obj_size);
+ if (error)
+ return error;
+ array->nr++;
+ return 0;
+}
+
+/*
+ * Iterate every element in this array, freeing each element as we go.
+ * Array elements will be nulled out.
+ */
+int
+xfbma_iter_del(
+ struct xfbma *array,
+ xfbma_iter_fn iter_fn,
+ void *priv)
+{
+ void *temp = xfbma_temp(array, 0);
+ pgoff_t oldpagenr = 0;
+ uint64_t max_bytes;
+ uint64_t i;
+ loff_t pos;
+ int error = 0;
+
+ max_bytes = array->nr * array->obj_size;
+ for (pos = 0, i = 0; pos < max_bytes; i++) {
+ pgoff_t pagenr;
+
+ error = xfile_io(array->filp, XFILE_IO_READ, &pos, temp,
+ array->obj_size);
+ if (error)
+ break;
+ if (xfbma_is_null(array, temp))
+ goto next;
+ error = iter_fn(temp, priv);
+ if (error)
+ break;
+next:
+ /* Release the previous page if possible. */
+ pagenr = pos >> PAGE_SHIFT;
+ if (pagenr != oldpagenr)
+ xfile_discard(array->filp, oldpagenr << PAGE_SHIFT,
+ pos - 1);
+ oldpagenr = pagenr;
+ }
+
+ return error;
+}
+
+/* Return length of array. */
+uint64_t
+xfbma_length(
+ struct xfbma *array)
+{
+ return array->nr;
+}
+
+/*
+ * Select the median value from a[lo], a[mid], and a[hi]. Put the median in
+ * a[lo], the lowest in a[lo], and the highest in a[hi]. Using the median of
+ * the three reduces the chances that we pick the worst case pivot value, since
+ * it's likely that our array values are nearly sorted.
+ */
+STATIC int
+xfbma_qsort_pivot(
+ struct xfbma *array,
+ xfbma_cmp_fn cmp_fn,
+ uint64_t lo,
+ uint64_t mid,
+ uint64_t hi)
+{
+ void *a = xfbma_temp(array, 0);
+ void *b = xfbma_temp(array, 1);
+ int error;
+
+ /* if a[mid] < a[lo], swap a[mid] and a[lo]. */
+ error = xfbma_get(array, mid, a);
+ if (error)
+ return error;
+ error = xfbma_get(array, lo, b);
+ if (error)
+ return error;
+ if (cmp_fn(a, b) < 0) {
+ error = xfbma_set(array, lo, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, mid, b);
+ if (error)
+ return error;
+ }
+
+ /* if a[hi] < a[mid], swap a[mid] and a[hi]. */
+ error = xfbma_get(array, hi, a);
+ if (error)
+ return error;
+ error = xfbma_get(array, mid, b);
+ if (error)
+ return error;
+ if (cmp_fn(a, b) < 0) {
+ error = xfbma_set(array, mid, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, hi, b);
+ if (error)
+ return error;
+ } else {
+ goto move_front;
+ }
+
+ /* if a[mid] < a[lo], swap a[mid] and a[lo]. */
+ error = xfbma_get(array, mid, a);
+ if (error)
+ return error;
+ error = xfbma_get(array, lo, b);
+ if (error)
+ return error;
+ if (cmp_fn(a, b) < 0) {
+ error = xfbma_set(array, lo, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, mid, b);
+ if (error)
+ return error;
+ }
+move_front:
+ /* move our selected pivot to a[lo] */
+ error = xfbma_get(array, lo, b);
+ if (error)
+ return error;
+ error = xfbma_get(array, mid, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, mid, b);
+ if (error)
+ return error;
+ return xfbma_set(array, lo, a);
+}
+
+/*
+ * Perform an insertion sort on a subset of the array.
+ * Though insertion sort is an O(n^2) algorithm, for small set sizes it's
+ * faster than quicksort's stack machine, so we let it take over for that.
+ */
+STATIC int
+xfbma_isort(
+ struct xfbma *array,
+ xfbma_cmp_fn cmp_fn,
+ uint64_t start,
+ uint64_t end)
+{
+ void *a = xfbma_temp(array, 0);
+ void *b = xfbma_temp(array, 1);
+ uint64_t tmp;
+ uint64_t i;
+ uint64_t run;
+ int error;
+
+ /*
+ * Move the smallest element in a[start..end] to a[start]. This
+ * simplifies the loop control logic below.
+ */
+ tmp = start;
+ error = xfbma_get(array, tmp, b);
+ if (error)
+ return error;
+ for (run = start + 1; run <= end; run++) {
+ /* if a[run] < a[tmp], tmp = run */
+ error = xfbma_get(array, run, a);
+ if (error)
+ return error;
+ if (cmp_fn(a, b) < 0) {
+ tmp = run;
+ memcpy(b, a, array->obj_size);
+ }
+ }
+
+ /*
+ * The smallest element is a[tmp]; swap with a[start] if tmp != start.
+ * Recall that a[tmp] is already in *b.
+ */
+ if (tmp != start) {
+ error = xfbma_get(array, start, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, tmp, a);
+ if (error)
+ return error;
+ error = xfbma_set(array, start, b);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Perform an insertion sort on a[start+1..end]. We already made sure
+ * that the smallest value in the original range is now in a[start],
+ * so the inner loop should never underflow.
+ *
+ * For each a[start+2..end], make sure it's in the correct position
+ * with respect to the elements that came before it.
+ */
+ for (run = start + 2; run <= end; run++) {
+ error = xfbma_get(array, run, a);
+ if (error)
+ return error;
+
+ /*
+ * Find the correct place for a[run] by walking leftwards
+ * towards the start of the range until a[tmp] is no longer
+ * greater than a[run].
+ */
+ tmp = run - 1;
+ error = xfbma_get(array, tmp, b);
+ if (error)
+ return error;
+ while (cmp_fn(a, b) < 0) {
+ tmp--;
+ error = xfbma_get(array, tmp, b);
+ if (error)
+ return error;
+ }
+ tmp++;
+
+ /*
+ * If tmp != run, then a[tmp..run-1] are all less than a[run],
+ * so right barrel roll a[tmp..run] to get this range in
+ * sorted order.
+ */
+ if (tmp == run)
+ continue;
+
+ for (i = run; i >= tmp; i--) {
+ error = xfbma_get(array, i - 1, b);
+ if (error)
+ return error;
+ error = xfbma_set(array, i, b);
+ if (error)
+ return error;
+ }
+ error = xfbma_set(array, tmp, a);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Sort the array elements via quicksort. This implementation incorporates
+ * four optimizations discussed in Sedgewick:
+ *
+ * 1. Use an explicit stack of array indicies to store the next array
+ * partition to sort. This helps us to avoid recursion in the call stack,
+ * which is particularly expensive in the kernel.
+ *
+ * 2. Choose the pivot element using a median-of-three decision tree. This
+ * reduces the probability of selecting a bad pivot value which causes
+ * worst case behavior (i.e. partition sizes of 1). Chance are fairly good
+ * that the list is nearly sorted, so this is important.
+ *
+ * 3. The smaller of the two sub-partitions is pushed onto the stack to start
+ * the next level of recursion, and the larger sub-partition replaces the
+ * current stack frame. This guarantees that we won't need more than
+ * log2(nr) stack space.
+ *
+ * 4. Use insertion sort for small sets since since insertion sort is faster
+ * for small, mostly sorted array segments. In the author's experience,
+ * substituting insertion sort for arrays smaller than 4 elements yields
+ * a ~10% reduction in runtime.
+ */
+
+/*
+ * Due to the use of signed indices, we can only support up to 2^63 records.
+ * Files can only grow to 2^63 bytes, so this is not much of a limitation.
+ */
+#define QSORT_MAX_RECS (1ULL << 63)
+
+/*
+ * For array subsets smaller than 4 elements, it's slightly faster to use
+ * insertion sort than quicksort's stack machine.
+ */
+#define ISORT_THRESHOLD (4)
+int
+xfbma_sort(
+ struct xfbma *array,
+ xfbma_cmp_fn cmp_fn)
+{
+ int64_t *stack;
+ int64_t *beg;
+ int64_t *end;
+ void *pivot = xfbma_temp(array, 0);
+ void *temp = xfbma_temp(array, 1);
+ int64_t lo, mid, hi;
+ const int max_stack_depth = ilog2(array->nr) + 1;
+ int stack_depth = 0;
+ int max_stack_used = 0;
+ int error = 0;
+
+ if (array->nr == 0)
+ return 0;
+ if (array->nr >= QSORT_MAX_RECS)
+ return -E2BIG;
+ if (array->nr <= ISORT_THRESHOLD)
+ return xfbma_isort(array, cmp_fn, 0, array->nr - 1);
+
+ /* Allocate our pointer stacks for sorting. */
+ stack = kmem_alloc(sizeof(int64_t) * 2 * max_stack_depth,
+ KM_NOFS | KM_MAYFAIL);
+ if (!stack)
+ return -ENOMEM;
+ beg = stack;
+ end = &stack[max_stack_depth];
+
+ beg[0] = 0;
+ end[0] = array->nr;
+ while (stack_depth >= 0) {
+ lo = beg[stack_depth];
+ hi = end[stack_depth] - 1;
+
+ /* Nothing left in this partition to sort; pop stack. */
+ if (lo >= hi) {
+ stack_depth--;
+ continue;
+ }
+
+ /* Small enough for insertion sort? */
+ if (hi - lo <= ISORT_THRESHOLD) {
+ error = xfbma_isort(array, cmp_fn, lo, hi);
+ if (error)
+ goto out_free;
+ stack_depth--;
+ continue;
+ }
+
+ /* Pick a pivot, move it to a[lo] and stash it. */
+ mid = lo + ((hi - lo) / 2);
+ error = xfbma_qsort_pivot(array, cmp_fn, lo, mid, hi);
+ if (error)
+ goto out_free;
+
+ error = xfbma_get(array, lo, pivot);
+ if (error)
+ goto out_free;
+
+ /*
+ * Rearrange a[lo..hi] such that everything smaller than the
+ * pivot is on the left side of the range and everything larger
+ * than the pivot is on the right side of the range.
+ */
+ while (lo < hi) {
+ /*
+ * Decrement hi until it finds an a[hi] less than the
+ * pivot value.
+ */
+ error = xfbma_get(array, hi, temp);
+ if (error)
+ goto out_free;
+ while (cmp_fn(temp, pivot) >= 0 && lo < hi) {
+ hi--;
+ error = xfbma_get(array, hi, temp);
+ if (error)
+ goto out_free;
+ }
+
+ /* Copy that item (a[hi]) to a[lo]. */
+ if (lo < hi) {
+ error = xfbma_set(array, lo++, temp);
+ if (error)
+ goto out_free;
+ }
+
+ /*
+ * Increment lo until it finds an a[lo] greater than
+ * the pivot value.
+ */
+ error = xfbma_get(array, lo, temp);
+ if (error)
+ goto out_free;
+ while (cmp_fn(temp, pivot) <= 0 && lo < hi) {
+ lo++;
+ error = xfbma_get(array, lo, temp);
+ if (error)
+ goto out_free;
+ }
+
+ /* Copy that item (a[lo]) to a[hi]. */
+ if (lo < hi) {
+ error = xfbma_set(array, hi--, temp);
+ if (error)
+ goto out_free;
+ }
+ }
+
+ /*
+ * Put our pivot value in the correct place at a[lo]. All
+ * values between a[beg[i]] and a[lo - 1] should be less than
+ * the pivot; and all values between a[lo + 1] and a[end[i]-1]
+ * should be greater than the pivot.
+ */
+ error = xfbma_set(array, lo, pivot);
+ if (error)
+ goto out_free;
+
+ /*
+ * Set up the pointers for the next iteration. We push onto
+ * the stack all of the unsorted values between a[lo + 1] and
+ * a[end[i]], and we tweak the current stack frame to point to
+ * the unsorted values between a[beg[i]] and a[lo] so that
+ * those values will be sorted when we pop the stack.
+ */
+ beg[stack_depth + 1] = lo + 1;
+ end[stack_depth + 1] = end[stack_depth];
+ end[stack_depth++] = lo;
+
+ /* Check our stack usage. */
+ max_stack_used = max(max_stack_used, stack_depth);
+ if (stack_depth >= max_stack_depth) {
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ goto out_free;
+ }
+
+ /*
+ * Always start with the smaller of the two partitions to keep
+ * the amount of recursion in check.
+ */
+ if (end[stack_depth] - beg[stack_depth] >
+ end[stack_depth - 1] - beg[stack_depth - 1]) {
+ swap(beg[stack_depth], beg[stack_depth - 1]);
+ swap(end[stack_depth], end[stack_depth - 1]);
+ }
+ }
+
+out_free:
+ kfree(stack);
+ trace_xfbma_sort_stats(array->nr, max_stack_depth, max_stack_used,
+ error);
+ return error;
+}
+
+/* Convenience method to step through an array until we find a non-null item. */
+int
+xfbma_get_data(
+ struct xfbma *array,
+ uint64_t *nr,
+ void *rec)
+{
+ int error;
+
+ do {
+ error = xfbma_get(array, (*nr)++, rec);
+ } while (error == 0 && xfbma_is_null(array, rec));
+
+ return error;
+}
new file mode 100644
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_ARRAY_H__
+#define __XFS_SCRUB_ARRAY_H__
+
+struct xfbma {
+ struct file *filp;
+ size_t obj_size;
+ uint64_t nr;
+};
+
+struct xfbma *xfbma_init(size_t obj_size);
+void xfbma_destroy(struct xfbma *array);
+int xfbma_get(struct xfbma *array, uint64_t nr, void *ptr);
+int xfbma_set(struct xfbma *array, uint64_t nr, void *ptr);
+int xfbma_insert_anywhere(struct xfbma *array, void *ptr);
+bool xfbma_is_null(struct xfbma *array, void *ptr);
+int xfbma_nullify(struct xfbma *array, uint64_t nr);
+int xfbma_append(struct xfbma *array, void *ptr);
+uint64_t xfbma_length(struct xfbma *array);
+int xfbma_get_data(struct xfbma *array, uint64_t *nr, void *rec);
+
+/*
+ * Return codes for the array iterator function are 0 to continue iterating,
+ * and non-zero to stop iterating. Any non-zero value will be passed up to the
+ * iteration caller. The special value -ECANCELED can be used to stop
+ * iteration, because the iterator never generates that error code on its own.
+ */
+typedef int (*xfbma_iter_fn)(const void *item, void *priv);
+
+int xfbma_iter_del(struct xfbma *array, xfbma_iter_fn iter_fn, void *priv);
+
+typedef int (*xfbma_cmp_fn)(const void *a, const void *b);
+
+int xfbma_sort(struct xfbma *array, xfbma_cmp_fn cmp_fn);
+
+#define foreach_xfbma_item(array, i, rec) \
+ for ((i) = 0; (i) < xfbma_length((array)); (i)++) \
+ if (xfbma_get((array), (i), &(rec)) == 0 && \
+ !xfbma_is_null((array), &(rec)))
+
+#endif /* __XFS_SCRUB_ARRAY_H__ */
@@ -940,6 +940,29 @@ DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_blocks);
DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block);
+TRACE_EVENT(xfbma_sort_stats,
+ TP_PROTO(uint64_t nr, unsigned int max_stack_depth,
+ unsigned int max_stack_used, int error),
+ TP_ARGS(nr, max_stack_depth, max_stack_used, error),
+ TP_STRUCT__entry(
+ __field(uint64_t, nr)
+ __field(unsigned int, max_stack_depth)
+ __field(unsigned int, max_stack_used)
+ __field(int, error)
+ ),
+ TP_fast_assign(
+ __entry->nr = nr;
+ __entry->max_stack_depth = max_stack_depth;
+ __entry->max_stack_used = max_stack_used;
+ __entry->error = error;
+ ),
+ TP_printk("nr %llu max_depth %u max_used %u error %d",
+ __entry->nr,
+ __entry->max_stack_depth,
+ __entry->max_stack_used,
+ __entry->error)
+);
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
new file mode 100644
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "scrub/array.h"
+#include "scrub/scrub.h"
+#include "scrub/trace.h"
+#include "scrub/xfile.h"
+#include <linux/shmem_fs.h>
+
+/*
+ * Create a memfd to our specifications and return a file pointer. The file
+ * is not installed in the file description table (because userspace has no
+ * business accessing our internal data), which means that the caller /must/
+ * fput the file when finished.
+ */
+struct file *
+xfile_create(
+ const char *description)
+{
+ struct file *filp;
+
+ filp = shmem_file_setup(description, 0, 0);
+ if (IS_ERR_OR_NULL(filp))
+ return filp;
+
+ filp->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME;
+ filp->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
+ return filp;
+}
+
+void
+xfile_destroy(
+ struct file *filp)
+{
+ fput(filp);
+}
+
+/*
+ * Perform a read or write IO to the file backing the array. We can defer
+ * the work to a workqueue if the caller so desires, either to reduce stack
+ * usage or because the xfs is frozen and we want to avoid deadlocking on the
+ * page fault that might be about to happen.
+ */
+int
+xfile_io(
+ struct file *filp,
+ unsigned int cmd_flags,
+ loff_t *pos,
+ void *ptr,
+ size_t count)
+{
+ ssize_t ret;
+ unsigned int pflags = memalloc_nofs_save();
+
+ if ((cmd_flags & XFILE_IO_MASK) == XFILE_IO_READ)
+ ret = kernel_read(filp, ptr, count, pos);
+ else
+ ret = kernel_write(filp, ptr, count, pos);
+ memalloc_nofs_restore(pflags);
+
+ /*
+ * Since we're treating this file as "memory", any IO error should be
+ * treated as a failure to find any memory.
+ */
+ return ret == count ? 0 : -ENOMEM;
+}
+
+/* Discard pages backing a range of the file. */
+void
+xfile_discard(
+ struct file *filp,
+ loff_t start,
+ loff_t end)
+{
+ shmem_truncate_range(file_inode(filp), start, end);
+}
new file mode 100644
@@ -0,0 +1,21 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2019 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef __XFS_SCRUB_XFILE_H__
+#define __XFS_SCRUB_XFILE_H__
+
+struct file *xfile_create(const char *description);
+void xfile_destroy(struct file *filp);
+
+/* read or write? */
+#define XFILE_IO_READ (0)
+#define XFILE_IO_WRITE (1)
+#define XFILE_IO_MASK (1 << 0)
+int xfile_io(struct file *filp, unsigned int cmd_flags, loff_t *pos,
+ void *ptr, size_t count);
+
+void xfile_discard(struct file *filp, loff_t start, loff_t end);
+
+#endif /* __XFS_SCRUB_XFILE_H__ */