@@ -717,6 +717,8 @@ enum {
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
/* Caller is in the atomic contex, find extent if it has been cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
+/* Caller wants strictly aligned allocation */
+#define EXT4_GET_BLOCKS_ALIGNED 0x1000
/*
* The bit position of these flags must not overlap with any of the
@@ -3683,6 +3685,8 @@ extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
+extern int ext4_map_blocks_aligned(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags);
extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode,
int num,
struct ext4_ext_path *path);
@@ -4091,6 +4091,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
int err = 0, depth, ret;
unsigned int allocated = 0, offset = 0;
unsigned int allocated_clusters = 0;
+ unsigned int orig_mlen = map->m_len;
struct ext4_allocation_request ar;
ext4_lblk_t cluster_offset;
@@ -4282,6 +4283,19 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
ar.flags |= EXT4_MB_DELALLOC_RESERVED;
if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
ar.flags |= EXT4_MB_USE_RESERVED;
+ if (flags & EXT4_GET_BLOCKS_ALIGNED) {
+ /*
+ * During aligned allocation we dont want to map a length smaller
+ * than the originally requested length since we use this len to
+ * determine alignment and changing it can misalign the blocks.
+ */
+ if (ar.len != orig_mlen) {
+ ext4_warning(inode->i_sb,
+ "Tried to modify requested len of aligned allocation.");
+ goto out;
+ }
+ ar.flags |= EXT4_MB_ALIGNED_ALLOC;
+ }
newblock = ext4_mb_new_blocks(handle, &ar, &err);
if (!newblock)
goto out;
@@ -430,6 +430,48 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
.end_io = ext4_dio_write_end_io,
};
+/*
+ * Check loff_t because the iov_iter_count() used in blkdev was size_t
+ */
+static bool ext4_dio_atomic_write_checks(struct kiocb *iocb,
+ struct iov_iter *from)
+{
+ struct inode *inode = iocb->ki_filp->f_inode;
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ size_t len = iov_iter_count(from);
+ loff_t pos = iocb->ki_pos;
+ u8 blkbits = inode->i_blkbits;
+
+ /*
+ * Currently aligned alloc, which is needed for atomic IO, is only
+ * supported with extent based files and non bigalloc file systems
+ */
+ if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
+ ext4_warning(inode->i_sb,
+ "Atomic write not supported with bigalloc");
+ return false;
+ }
+ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
+ ext4_warning(inode->i_sb,
+ "Atomic write not supported on non-extent files");
+ return false;
+ }
+ if (len & ((1 << blkbits) - 1))
+ /* len should be blocksize aligned */
+ return false;
+ else if (pos % len)
+ /* pos should be naturally aligned to len */
+ return false;
+ else if (!is_power_of_2(len >> blkbits))
+ /*
+ * len in blocks should be power of 2 for mballoc to ensure
+ * alignment
+ */
+ return false;
+
+ return blkdev_atomic_write_valid(bdev, pos, len);
+}
+
/*
* The intention here is to start with shared lock acquired then see if any
* condition requires an exclusive inode lock. If yes, then we restart the
@@ -458,12 +500,19 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
size_t count;
ssize_t ret;
bool overwrite, unaligned_io;
+ bool atomic_write = (iocb->ki_flags & IOCB_ATOMIC);
restart:
ret = ext4_generic_write_checks(iocb, from);
if (ret <= 0)
goto out;
+ if (atomic_write && !ext4_dio_atomic_write_checks(iocb, from)) {
+ ext4_warning(inode->i_sb, "Atomic write checks failed.");
+ ret = -EIO;
+ goto out;
+ }
+
offset = iocb->ki_pos;
count = ret;
@@ -453,6 +453,77 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+/*
+ * This function checks if the map returned by ext4_map_blocks satisfies aligned
+ * allocation requirements. This should be used as the entry point for aligned
+ * allocations
+ */
+static bool ext4_map_check_alignment(struct ext4_map_blocks *map,
+ unsigned int orig_mlen,
+ ext4_lblk_t orig_mlblk,
+ int flags)
+{
+ if (flags & EXT4_GET_BLOCKS_CREATE) {
+ /*
+ * A create lookup must be mapped to satisfy alignment
+ * requirements
+ */
+ if (!(map->m_flags & EXT4_MAP_MAPPED))
+ return false;
+ } else {
+ /*
+ * For create=0, if we find a hole, this hole should be big
+ * enough to accommodate our aligned extent later
+ */
+ if (!(map->m_flags & EXT4_MAP_MAPPED) &&
+ (!(map->m_flags & EXT4_MAP_UNWRITTEN))) {
+ if (map->m_len < orig_mlen)
+ return false;
+ if (map->m_lblk != orig_mlblk)
+ /* Ideally shouldn't happen */
+ return false;
+ return true;
+ }
+ }
+
+ /*
+ * For all the remaining cases, to satisfy alignment, the extent should
+ * be exactly as big as requests and be at the right physical block
+ * alignment
+ */
+ if (map->m_len != orig_mlen)
+ return false;
+ if (map->m_lblk != orig_mlblk)
+ return false;
+ if (!map->m_len || map->m_pblk % map->m_len)
+ return false;
+
+ return true;
+}
+
+int ext4_map_blocks_aligned(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int flags)
+{
+ int ret;
+ unsigned int orig_mlen = map->m_len;
+ ext4_lblk_t orig_mlblk = map->m_lblk;
+
+ if (flags & EXT4_GET_BLOCKS_CREATE)
+ flags |= EXT4_GET_BLOCKS_ALIGNED;
+
+ ret = ext4_map_blocks(handle, inode, map, flags);
+
+ if (ret >= 0 &&
+ !ext4_map_check_alignment(map, orig_mlen, orig_mlblk, flags)) {
+ ext4_warning(
+ inode->i_sb,
+ "Returned extent couldn't satisfy alignment requirements");
+ ret = -EIO;
+ }
+
+ return ret;
+}
+
/*
* The ext4_map_blocks() function tries to look up the requested blocks,
* and returns if the blocks are already mapped.
@@ -474,6 +545,12 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
* indicate the length of a hole starting at map->m_lblk.
*
* It returns the error in case of allocation failure.
+ *
+ * Note for aligned allocations: While most of the alignment related checks are
+ * done by higher level functions, we do have some optimizations here. When
+ * trying to *create* a new aligned extent if at any point we are sure that the
+ * extent won't be as big as the full length, we exit early instead of going for
+ * the allocation and failing later.
*/
int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags)
@@ -481,6 +558,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct extent_status es;
int retval;
int ret = 0;
+ unsigned int orig_mlen = map->m_len;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -583,6 +661,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
return retval;
+ /* For aligned allocation, we must not change original alignment */
+ if (retval < 0 && (flags & EXT4_GET_BLOCKS_ALIGNED) &&
+ map->m_len != orig_mlen) {
+ return retval;
+ }
+
/*
* Returns if the blocks have already allocated
*
@@ -3307,7 +3391,10 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (flags & IOMAP_ATOMIC_WRITE)
+ ret = ext4_map_blocks_aligned(handle, inode, map, m_flags);
+ else
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
/*
* We cannot fill holes in indirect tree based inodes as that could
@@ -3353,7 +3440,11 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
* especially in multi-threaded overwrite requests.
*/
if (offset + length <= i_size_read(inode)) {
- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ if (flags & IOMAP_ATOMIC_WRITE)
+ ret = ext4_map_blocks_aligned(NULL, inode, &map, 0);
+ else
+ ret = ext4_map_blocks(NULL, inode, &map, 0);
+
if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
goto out;
}
@@ -3372,6 +3463,15 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
*/
map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+ /*
+ * Ensure the found extent meets the alignment requirements for aligned
+ * allocation
+ */
+ if ((flags & IOMAP_ATOMIC_WRITE) &&
+ ((map.m_pblk << blkbits) % length ||
+ (map.m_len << blkbits) != length))
+ return -EIO;
+
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
@@ -50,6 +50,7 @@ struct partial_cluster;
{ EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, "CONVERT_UNWRITTEN" }, \
{ EXT4_GET_BLOCKS_ZERO, "ZERO" }, \
{ EXT4_GET_BLOCKS_IO_SUBMIT, "IO_SUBMIT" }, \
+ { EXT4_GET_BLOCKS_ALIGNED, "ALIGNED" }, \
{ EXT4_EX_NOCACHE, "EX_NOCACHE" })
/*
If the direct IO write is meant to be atomic, ext4 will now try to allocate aligned physical blocks so that atomic write request can be satisfied. This patch also makes the ext4_map_blocks() family of function alignment aware and defines ext4_map_blocks_aligned() function that can allow users to ask for aligned blocks and has checks to ensure the returned extent actually follows the alignment requirements. As usual, alignment requirement is always determined by the length and the offset should be naturally aligned to this len. Although aligned mapping usually makes sense with EXT4_GET_BLOCKS_CREATE we can call ext4_map_blocks_aligned() without that flag aswell. This can be useful to check: 1. If an aligned extent is already present and can be reused. 2. If a pre-existing extent at the location can't satisfy the alignment in which case and aligned write of given len and offset won't be possible. 3. If there is a hole, is it big enough that a subsequent map blocks would be able to allocate the required aligned extent of off and len. Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com> --- fs/ext4/ext4.h | 4 ++ fs/ext4/extents.c | 14 +++++ fs/ext4/file.c | 49 +++++++++++++++++ fs/ext4/inode.c | 104 +++++++++++++++++++++++++++++++++++- include/trace/events/ext4.h | 1 + 5 files changed, 170 insertions(+), 2 deletions(-)