new file mode 100644
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "types.h"
+
+/* The maximum logical space is 4 petabytes, which is 1 terablock. */
+const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS = 1024ULL * 1024 * 1024 * 1024;
+
+/* The maximum physical space is 256 terabytes, which is 64 gigablocks. */
+const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS = 1024ULL * 1024 * 1024 * 64;
+
+/* unit test minimum */
+const block_count_t MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2;
new file mode 100644
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_CONSTANTS_H
+#define VDO_CONSTANTS_H
+
+#include <linux/blkdev.h>
+
+#include "types.h"
+
+enum {
+ /*
+ * The maximum number of contiguous PBNs which will go to a single bio submission queue,
+ * assuming there is more than one queue.
+ */
+ VDO_BIO_ROTATION_INTERVAL_LIMIT = 1024,
+
+ /** The number of entries on a block map page */
+ VDO_BLOCK_MAP_ENTRIES_PER_PAGE = 812,
+
+ /** The origin of the flat portion of the block map */
+ VDO_BLOCK_MAP_FLAT_PAGE_ORIGIN = 1,
+
+ /*
+ * The height of a block map tree. Assuming a root count of 60 and 812 entries per page,
+ * this is big enough to represent almost 95 PB of logical space.
+ */
+ VDO_BLOCK_MAP_TREE_HEIGHT = 5,
+
+ /** The default number of bio submission queues. */
+ DEFAULT_VDO_BIO_SUBMIT_QUEUE_COUNT = 4,
+
+ /** The number of contiguous PBNs to be submitted to a single bio queue. */
+ DEFAULT_VDO_BIO_SUBMIT_QUEUE_ROTATE_INTERVAL = 64,
+
+ /** The number of trees in the arboreal block map */
+ DEFAULT_VDO_BLOCK_MAP_TREE_ROOT_COUNT = 60,
+
+ /** The default size of the recovery journal, in blocks */
+ DEFAULT_VDO_RECOVERY_JOURNAL_SIZE = 32 * 1024,
+
+ /** The default size of each slab journal, in blocks */
+ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224,
+
+ /*
+ * The initial size of lbn_operations and pbn_operations, which is based upon the expected
+ * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely
+ * that the maps would need to be resized.
+ */
+ VDO_LOCK_MAP_CAPACITY = 10000,
+
+ /** The maximum number of logical zones */
+ MAX_VDO_LOGICAL_ZONES = 60,
+
+ /** The maximum number of physical zones */
+ MAX_VDO_PHYSICAL_ZONES = 16,
+
+ /** The base-2 logarithm of the maximum blocks in one slab */
+ MAX_VDO_SLAB_BITS = 23,
+
+ /** The maximum number of slabs the slab depot supports */
+ MAX_VDO_SLABS = 8192,
+
+ /*
+ * The maximum number of block map pages to load simultaneously during recovery or rebuild.
+ */
+ MAXIMUM_SIMULTANEOUS_VDO_BLOCK_MAP_RESTORATION_READS = 1024,
+
+ /** The maximum number of entries in the slab summary */
+ MAXIMUM_VDO_SLAB_SUMMARY_ENTRIES = MAX_VDO_SLABS * MAX_VDO_PHYSICAL_ZONES,
+
+ /** The maximum number of total threads in a VDO thread configuration. */
+ MAXIMUM_VDO_THREADS = 100,
+
+ /** The maximum number of VIOs in the system at once */
+ MAXIMUM_VDO_USER_VIOS = 2048,
+
+ /** The only physical block size supported by VDO */
+ VDO_BLOCK_SIZE = 4096,
+
+ /** The number of sectors per block */
+ VDO_SECTORS_PER_BLOCK = (VDO_BLOCK_SIZE >> SECTOR_SHIFT),
+
+ /** The size of a sector that will not be torn */
+ VDO_SECTOR_SIZE = 512,
+
+ /** The physical block number reserved for storing the zero block */
+ VDO_ZERO_BLOCK = 0,
+};
+
+/** The maximum logical space is 4 petabytes, which is 1 terablock. */
+extern const block_count_t MAXIMUM_VDO_LOGICAL_BLOCKS;
+
+/** The maximum physical space is 256 terabytes, which is 64 gigablocks. */
+extern const block_count_t MAXIMUM_VDO_PHYSICAL_BLOCKS;
+
+/** unit test minimum */
+extern const block_count_t MINIMUM_VDO_SLAB_JOURNAL_BLOCKS;
+
+#endif /* VDO_CONSTANTS_H */
new file mode 100644
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "status-codes.h"
+
+#include "errors.h"
+#include "logger.h"
+#include "permassert.h"
+#include "uds-threads.h"
+
+const struct error_info vdo_status_list[] = {
+ { "VDO_NOT_IMPLEMENTED", "Not implemented" },
+ { "VDO_OUT_OF_RANGE", "Out of range" },
+ { "VDO_REF_COUNT_INVALID", "Reference count would become invalid" },
+ { "VDO_NO_SPACE", "Out of space" },
+ { "VDO_UNEXPECTED_EOF", "Unexpected EOF on block read" },
+ { "VDO_BAD_CONFIGURATION", "Bad configuration option" },
+ { "VDO_SOCKET_ERROR", "Socket error" },
+ { "VDO_BAD_ALIGNMENT", "Mis-aligned block reference" },
+ { "VDO_COMPONENT_BUSY", "Prior operation still in progress" },
+ { "VDO_BAD_PAGE", "Corrupt or incorrect page" },
+ { "VDO_UNSUPPORTED_VERSION", "Unsupported component version" },
+ { "VDO_INCORRECT_COMPONENT", "Component id mismatch in decoder" },
+ { "VDO_PARAMETER_MISMATCH", "Parameters have conflicting values" },
+ { "VDO_BLOCK_SIZE_TOO_SMALL", "The block size is too small" },
+ { "VDO_UNKNOWN_PARTITION", "No partition exists with a given id" },
+ { "VDO_PARTITION_EXISTS", "A partition already exists with a given id" },
+ { "VDO_NOT_READ_ONLY", "The device is not in read-only mode" },
+ { "VDO_INCREMENT_TOO_SMALL", "Physical block growth of too few blocks" },
+ { "VDO_CHECKSUM_MISMATCH", "Incorrect checksum" },
+ { "VDO_RECOVERY_JOURNAL_FULL", "The recovery journal is full" },
+ { "VDO_LOCK_ERROR", "A lock is held incorrectly" },
+ { "VDO_READ_ONLY", "The device is in read-only mode" },
+ { "VDO_SHUTTING_DOWN", "The device is shutting down" },
+ { "VDO_CORRUPT_JOURNAL", "Recovery journal entries corrupted" },
+ { "VDO_TOO_MANY_SLABS", "Exceeds maximum number of slabs supported" },
+ { "VDO_INVALID_FRAGMENT", "Compressed block fragment is invalid" },
+ { "VDO_RETRY_AFTER_REBUILD", "Retry operation after rebuilding finishes" },
+ { "VDO_UNKNOWN_COMMAND", "The extended command is not known" },
+ { "VDO_COMMAND_ERROR", "Bad extended command parameters" },
+ { "VDO_CANNOT_DETERMINE_SIZE", "Cannot determine config sizes to fit" },
+ { "VDO_BAD_MAPPING", "Invalid page mapping" },
+ { "VDO_READ_CACHE_BUSY", "Read cache has no free slots" },
+ { "VDO_BIO_CREATION_FAILED", "Bio creation failed" },
+ { "VDO_BAD_MAGIC", "Bad magic number" },
+ { "VDO_BAD_NONCE", "Bad nonce" },
+ { "VDO_JOURNAL_OVERFLOW", "Journal sequence number overflow" },
+ { "VDO_INVALID_ADMIN_STATE", "Invalid operation for current state" },
+ { "VDO_CANT_ADD_SYSFS_NODE", "Failed to add sysfs node" },
+};
+
+static atomic_t vdo_status_codes_registered = ATOMIC_INIT(0);
+static int status_code_registration_result;
+
+static void do_status_code_registration(void)
+{
+ int result;
+
+ STATIC_ASSERT((VDO_STATUS_CODE_LAST - VDO_STATUS_CODE_BASE) ==
+ ARRAY_SIZE(vdo_status_list));
+
+ result = uds_register_error_block("VDO Status",
+ VDO_STATUS_CODE_BASE,
+ VDO_STATUS_CODE_BLOCK_END,
+ vdo_status_list,
+ sizeof(vdo_status_list));
+ /*
+ * The following test handles cases where libvdo is statically linked against both the test
+ * modules and the test driver (because multiple instances of this module call their own
+ * copy of this function once each, resulting in multiple calls to register_error_block
+ * which is shared in libuds).
+ */
+ if (result == UDS_DUPLICATE_NAME)
+ result = UDS_SUCCESS;
+
+ status_code_registration_result = (result == UDS_SUCCESS) ? VDO_SUCCESS : result;
+}
+
+/**
+ * vdo_register_status_codes() - Register the VDO status codes if needed.
+ * Return: A success or error code.
+ */
+int vdo_register_status_codes(void)
+{
+ uds_perform_once(&vdo_status_codes_registered, do_status_code_registration);
+ return status_code_registration_result;
+}
+
+/**
+ * vdo_map_to_system_error() - Given an error code, return a value we can return to the OS.
+ * @error: The error code to convert.
+ *
+ * The input error code may be a system-generated value (such as -EIO), an errno macro used in our
+ * code (such as EIO), or a UDS or VDO status code; the result must be something the rest of the OS
+ * can consume (negative errno values such as -EIO, in the case of the kernel).
+ *
+ * Return: A system error code value.
+ */
+int vdo_map_to_system_error(int error)
+{
+ char error_name[UDS_MAX_ERROR_NAME_SIZE];
+ char error_message[UDS_MAX_ERROR_MESSAGE_SIZE];
+
+ /* 0 is success, negative a system error code */
+ if (likely(error <= 0))
+ return error;
+ if (error < 1024)
+ return -error;
+
+ /* VDO or UDS error */
+ switch (error) {
+ case VDO_NO_SPACE:
+ return -ENOSPC;
+ case VDO_READ_ONLY:
+ return -EIO;
+ default:
+ uds_log_info("%s: mapping internal status code %d (%s: %s) to EIO",
+ __func__,
+ error,
+ uds_string_error_name(error, error_name, sizeof(error_name)),
+ uds_string_error(error, error_message, sizeof(error_message)));
+ return -EIO;
+ }
+}
new file mode 100644
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_STATUS_CODES_H
+#define VDO_STATUS_CODES_H
+
+#include "errors.h"
+
+enum {
+ UDS_BLOCK_SIZE = UDS_ERROR_CODE_BLOCK_END - UDS_ERROR_CODE_BASE,
+ VDO_BLOCK_START = UDS_ERROR_CODE_BLOCK_END,
+ VDO_BLOCK_END = VDO_BLOCK_START + UDS_BLOCK_SIZE,
+ PRP_BLOCK_START = VDO_BLOCK_END,
+ PRP_BLOCK_END = PRP_BLOCK_START + UDS_BLOCK_SIZE,
+};
+
+/* VDO-specific status codes. */
+enum vdo_status_codes {
+ /* successful result */
+ VDO_SUCCESS,
+ /* base of all VDO errors */
+ VDO_STATUS_CODE_BASE = VDO_BLOCK_START,
+ /* we haven't written this yet */
+ VDO_NOT_IMPLEMENTED = VDO_STATUS_CODE_BASE,
+ /* input out of range */
+ VDO_OUT_OF_RANGE,
+ /* an invalid reference count would result */
+ VDO_REF_COUNT_INVALID,
+ /* a free block could not be allocated */
+ VDO_NO_SPACE,
+ /* unexpected EOF on block read */
+ VDO_UNEXPECTED_EOF,
+ /* improper or missing configuration option */
+ VDO_BAD_CONFIGURATION,
+ /* socket opening or binding problem */
+ VDO_SOCKET_ERROR,
+ /* read or write on non-aligned offset */
+ VDO_BAD_ALIGNMENT,
+ /* prior operation still in progress */
+ VDO_COMPONENT_BUSY,
+ /* page contents incorrect or corrupt data */
+ VDO_BAD_PAGE,
+ /* unsupported version of some component */
+ VDO_UNSUPPORTED_VERSION,
+ /* component id mismatch in decoder */
+ VDO_INCORRECT_COMPONENT,
+ /* parameters have conflicting values */
+ VDO_PARAMETER_MISMATCH,
+ /* the block size is too small */
+ VDO_BLOCK_SIZE_TOO_SMALL,
+ /* no partition exists with a given id */
+ VDO_UNKNOWN_PARTITION,
+ /* a partition already exists with a given id */
+ VDO_PARTITION_EXISTS,
+ /* the VDO is not in read-only mode */
+ VDO_NOT_READ_ONLY,
+ /* physical block growth of too few blocks */
+ VDO_INCREMENT_TOO_SMALL,
+ /* incorrect checksum */
+ VDO_CHECKSUM_MISMATCH,
+ /* the recovery journal is full */
+ VDO_RECOVERY_JOURNAL_FULL,
+ /* a lock is held incorrectly */
+ VDO_LOCK_ERROR,
+ /* the VDO is in read-only mode */
+ VDO_READ_ONLY,
+ /* the VDO is shutting down */
+ VDO_SHUTTING_DOWN,
+ /* the recovery journal has corrupt entries */
+ VDO_CORRUPT_JOURNAL,
+ /* exceeds maximum number of slabs supported */
+ VDO_TOO_MANY_SLABS,
+ /* a compressed block fragment is invalid */
+ VDO_INVALID_FRAGMENT,
+ /* action is unsupported while rebuilding */
+ VDO_RETRY_AFTER_REBUILD,
+ /* the extended command is not known */
+ VDO_UNKNOWN_COMMAND,
+ /* bad extended command parameters */
+ VDO_COMMAND_ERROR,
+ /* cannot determine sizes to fit */
+ VDO_CANNOT_DETERMINE_SIZE,
+ /* a block map entry is invalid */
+ VDO_BAD_MAPPING,
+ /* read cache has no free slots */
+ VDO_READ_CACHE_BUSY,
+ /* bio_add_page failed */
+ VDO_BIO_CREATION_FAILED,
+ /* bad magic number */
+ VDO_BAD_MAGIC,
+ /* bad nonce */
+ VDO_BAD_NONCE,
+ /* sequence number overflow */
+ VDO_JOURNAL_OVERFLOW,
+ /* the VDO is not in a state to perform an admin operation */
+ VDO_INVALID_ADMIN_STATE,
+ /* failure adding a sysfs node */
+ VDO_CANT_ADD_SYSFS_NODE,
+ /* one more than last error code */
+ VDO_STATUS_CODE_LAST,
+ VDO_STATUS_CODE_BLOCK_END = VDO_BLOCK_END
+};
+
+extern const struct error_info vdo_status_list[];
+
+int vdo_register_status_codes(void);
+
+int vdo_map_to_system_error(int error);
+
+#endif /* VDO_STATUS_CODES_H */
new file mode 100644
@@ -0,0 +1,393 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_TYPES_H
+#define VDO_TYPES_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/device-mapper.h>
+#include <linux/list.h>
+#include <linux/compiler_attributes.h>
+#include <linux/types.h>
+
+#include "funnel-queue.h"
+
+/* A size type in blocks. */
+typedef u64 block_count_t;
+
+/* The size of a block. */
+typedef u16 block_size_t;
+
+/* A counter for data_vios */
+typedef u16 data_vio_count_t;
+
+/* A height within a tree. */
+typedef u8 height_t;
+
+/* The logical block number as used by the consumer. */
+typedef u64 logical_block_number_t;
+
+/* The type of the nonce used to identify instances of VDO. */
+typedef u64 nonce_t;
+
+/* A size in pages. */
+typedef u32 page_count_t;
+
+/* A page number. */
+typedef u32 page_number_t;
+
+/*
+ * The physical (well, less logical) block number at which the block is found on the underlying
+ * device.
+ */
+typedef u64 physical_block_number_t;
+
+/* A count of tree roots. */
+typedef u8 root_count_t;
+
+/* A number of sectors. */
+typedef u8 sector_count_t;
+
+/* A sequence number. */
+typedef u64 sequence_number_t;
+
+/* The offset of a block within a slab. */
+typedef u32 slab_block_number;
+
+/* A size type in slabs. */
+typedef u16 slab_count_t;
+
+/* A slot in a bin or block map page. */
+typedef u16 slot_number_t;
+
+/* typedef thread_count_t - A thread counter. */
+typedef u8 thread_count_t;
+
+/* typedef thread_id_t - A thread ID, vdo threads are numbered sequentially from 0. */
+typedef u8 thread_id_t;
+
+/* A zone counter */
+typedef u8 zone_count_t;
+
+/* The following enums are persisted on storage, so the values must be preserved. */
+
+/* The current operating mode of the VDO. */
+enum vdo_state {
+ VDO_DIRTY = 0,
+ VDO_NEW = 1,
+ VDO_CLEAN = 2,
+ VDO_READ_ONLY_MODE = 3,
+ VDO_FORCE_REBUILD = 4,
+ VDO_RECOVERING = 5,
+ VDO_REPLAYING = 6, /* VDO_REPLAYING is never set anymore, but retained for upgrade */
+ VDO_REBUILD_FOR_UPGRADE = 7,
+
+ /* Keep VDO_STATE_COUNT at the bottom. */
+ VDO_STATE_COUNT
+};
+
+/**
+ * vdo_state_requires_read_only_rebuild() - Check whether a vdo_state indicates
+ * that a read-only rebuild is required.
+ * @state: The vdo_state to check.
+ *
+ * Return: true if the state indicates a rebuild is required
+ */
+static inline bool __must_check vdo_state_requires_read_only_rebuild(enum vdo_state state)
+{
+ return ((state == VDO_FORCE_REBUILD) || (state == VDO_REBUILD_FOR_UPGRADE));
+}
+
+/**
+ * vdo_state_requires_recovery() - Check whether a vdo state indicates that recovery is needed.
+ * @state: The state to check.
+ *
+ * Return: true if the state indicates a recovery is required
+ */
+static inline bool __must_check vdo_state_requires_recovery(enum vdo_state state)
+{
+ return ((state == VDO_DIRTY) || (state == VDO_REPLAYING) || (state == VDO_RECOVERING));
+}
+
+/*
+ * The current operation on a physical block (from the point of view of the recovery journal, slab
+ * journals, and reference counts.
+ */
+enum journal_operation {
+ VDO_JOURNAL_DATA_REMAPPING = 0,
+ VDO_JOURNAL_BLOCK_MAP_REMAPPING = 1,
+} __packed;
+
+/* Partition IDs encoded in the volume layout in the super block. */
+enum partition_id {
+ VDO_BLOCK_MAP_PARTITION = 0,
+ VDO_SLAB_DEPOT_PARTITION = 1,
+ VDO_RECOVERY_JOURNAL_PARTITION = 2,
+ VDO_SLAB_SUMMARY_PARTITION = 3,
+} __packed;
+
+/* Metadata types for the vdo. */
+enum vdo_metadata_type {
+ VDO_METADATA_RECOVERY_JOURNAL = 1,
+ VDO_METADATA_SLAB_JOURNAL = 2,
+ VDO_METADATA_RECOVERY_JOURNAL_2 = 3,
+} __packed;
+
+/* A position in the block map where a block map entry is stored. */
+struct block_map_slot {
+ physical_block_number_t pbn;
+ slot_number_t slot;
+};
+
+/*
+ * Four bits of each five-byte block map entry contain a mapping state value used to distinguish
+ * unmapped or trimmed logical blocks (which are treated as mapped to the zero block) from entries
+ * that have been mapped to a physical block, including the zero block.
+ *
+ * FIXME: these should maybe be defines.
+ */
+enum block_mapping_state {
+ VDO_MAPPING_STATE_UNMAPPED = 0, /* Must be zero to be the default value */
+ VDO_MAPPING_STATE_UNCOMPRESSED = 1, /* A normal (uncompressed) block */
+ VDO_MAPPING_STATE_COMPRESSED_BASE = 2, /* Compressed in slot 0 */
+ VDO_MAPPING_STATE_COMPRESSED_MAX = 15, /* Compressed in slot 13 */
+};
+
+enum {
+ VDO_MAX_COMPRESSION_SLOTS =
+ (VDO_MAPPING_STATE_COMPRESSED_MAX - VDO_MAPPING_STATE_COMPRESSED_BASE + 1),
+};
+
+
+struct data_location {
+ physical_block_number_t pbn;
+ enum block_mapping_state state;
+};
+
+/* The configuration of a single slab derived from the configured block size and slab size. */
+struct slab_config {
+ /* total number of blocks in the slab */
+ block_count_t slab_blocks;
+ /* number of blocks available for data */
+ block_count_t data_blocks;
+ /* number of blocks for reference counts */
+ block_count_t reference_count_blocks;
+ /* number of blocks for the slab journal */
+ block_count_t slab_journal_blocks;
+ /*
+ * Number of blocks after which the slab journal starts pushing out a reference_block for
+ * each new entry it receives.
+ */
+ block_count_t slab_journal_flushing_threshold;
+ /*
+ * Number of blocks after which the slab journal pushes out all reference_blocks and makes
+ * all vios wait.
+ */
+ block_count_t slab_journal_blocking_threshold;
+ /* Number of blocks after which the slab must be scrubbed before coming online. */
+ block_count_t slab_journal_scrubbing_threshold;
+} __packed;
+
+/*
+ * This structure is memcmp'd for equality. Keep it packed and don't add any fields that are not
+ * properly set in both extant and parsed configs.
+ */
+struct thread_count_config {
+ unsigned int bio_ack_threads;
+ unsigned int bio_threads;
+ unsigned int bio_rotation_interval;
+ unsigned int cpu_threads;
+ unsigned int logical_zones;
+ unsigned int physical_zones;
+ unsigned int hash_zones;
+} __packed;
+
+struct device_config {
+ struct dm_target *owning_target;
+ struct dm_dev *owned_device;
+ struct vdo *vdo;
+ /* All configs referencing a layer are kept on a list in the layer */
+ struct list_head config_list;
+ char *original_string;
+ unsigned int version;
+ char *parent_device_name;
+ block_count_t physical_blocks;
+ /*
+ * This is the number of logical blocks from VDO's internal point of view. It is the number
+ * of 4K blocks regardless of the value of the logical_block_size parameter below.
+ */
+ block_count_t logical_blocks;
+ unsigned int logical_block_size;
+ unsigned int cache_size;
+ unsigned int block_map_maximum_age;
+ bool deduplication;
+ bool compression;
+ struct thread_count_config thread_counts;
+ block_count_t max_discard_blocks;
+};
+
+enum vdo_completion_type {
+ /* Keep VDO_UNSET_COMPLETION_TYPE at the top. */
+ VDO_UNSET_COMPLETION_TYPE,
+ VDO_ACTION_COMPLETION,
+ VDO_ADMIN_COMPLETION,
+ VDO_BLOCK_ALLOCATOR_COMPLETION,
+ VDO_DATA_VIO_POOL_COMPLETION,
+ VDO_DECREMENT_COMPLETION,
+ VDO_FLUSH_COMPLETION,
+ VDO_FLUSH_NOTIFICATION_COMPLETION,
+ VDO_GENERATION_FLUSHED_COMPLETION,
+ VDO_HASH_ZONE_COMPLETION,
+ VDO_HASH_ZONES_COMPLETION,
+ VDO_LOCK_COUNTER_COMPLETION,
+ VDO_PAGE_COMPLETION,
+ VDO_READ_ONLY_MODE_COMPLETION,
+ VDO_REPAIR_COMPLETION,
+ VDO_SYNC_COMPLETION,
+ VIO_COMPLETION,
+} __packed;
+
+struct vdo_completion;
+
+/**
+ * typedef vdo_action - An asynchronous VDO operation.
+ * @completion: The completion of the operation.
+ */
+typedef void vdo_action(struct vdo_completion *completion);
+
+enum vdo_completion_priority {
+ BIO_ACK_Q_ACK_PRIORITY = 0,
+ BIO_ACK_Q_MAX_PRIORITY = 0,
+ BIO_Q_COMPRESSED_DATA_PRIORITY = 0,
+ BIO_Q_DATA_PRIORITY = 0,
+ BIO_Q_FLUSH_PRIORITY = 2,
+ BIO_Q_HIGH_PRIORITY = 2,
+ BIO_Q_METADATA_PRIORITY = 1,
+ BIO_Q_VERIFY_PRIORITY = 1,
+ BIO_Q_MAX_PRIORITY = 2,
+ CPU_Q_COMPLETE_VIO_PRIORITY = 0,
+ CPU_Q_COMPLETE_READ_PRIORITY = 0,
+ CPU_Q_COMPRESS_BLOCK_PRIORITY = 0,
+ CPU_Q_EVENT_REPORTER_PRIORITY = 0,
+ CPU_Q_HASH_BLOCK_PRIORITY = 0,
+ CPU_Q_MAX_PRIORITY = 0,
+ UDS_Q_PRIORITY = 0,
+ UDS_Q_MAX_PRIORITY = 0,
+ VDO_DEFAULT_Q_COMPLETION_PRIORITY = 1,
+ VDO_DEFAULT_Q_FLUSH_PRIORITY = 2,
+ VDO_DEFAULT_Q_MAP_BIO_PRIORITY = 0,
+ VDO_DEFAULT_Q_SYNC_PRIORITY = 2,
+ VDO_DEFAULT_Q_VIO_CALLBACK_PRIORITY = 1,
+ VDO_DEFAULT_Q_MAX_PRIORITY = 2,
+ /* The maximum allowable priority */
+ VDO_WORK_Q_MAX_PRIORITY = 2,
+ /* A value which must be out of range for a valid priority */
+ VDO_WORK_Q_DEFAULT_PRIORITY = VDO_WORK_Q_MAX_PRIORITY + 1,
+};
+
+struct vdo_completion {
+ /* The type of completion this is */
+ enum vdo_completion_type type;
+
+ /*
+ * <code>true</code> once the processing of the operation is complete. This flag should not
+ * be used by waiters external to the VDO base as it is used to gate calling the callback.
+ */
+ bool complete;
+
+ /*
+ * If true, queue this completion on the next callback invocation, even if it is already
+ * running on the correct thread.
+ */
+ bool requeue;
+
+ /* The ID of the thread which should run the next callback */
+ thread_id_t callback_thread_id;
+
+ /* The result of the operation */
+ int result;
+
+ /* The VDO on which this completion operates */
+ struct vdo *vdo;
+
+ /* The callback which will be called once the operation is complete */
+ vdo_action *callback;
+
+ /* Callback which, if set, will be called if an error result is set */
+ vdo_action *error_handler;
+
+ /* The parent object, if any, that spawned this completion */
+ void *parent;
+
+ /* Entry link for lock-free work queue */
+ struct funnel_queue_entry work_queue_entry_link;
+ enum vdo_completion_priority priority;
+ struct vdo_work_queue *my_queue;
+};
+
+struct block_allocator;
+struct data_vio;
+struct vdo;
+struct vdo_config;
+
+/* vio types for statistics and instrumentation. */
+enum vio_type {
+ VIO_TYPE_UNINITIALIZED = 0,
+ VIO_TYPE_DATA,
+ VIO_TYPE_BLOCK_ALLOCATOR,
+ VIO_TYPE_BLOCK_MAP,
+ VIO_TYPE_BLOCK_MAP_INTERIOR,
+ VIO_TYPE_GEOMETRY,
+ VIO_TYPE_PARTITION_COPY,
+ VIO_TYPE_RECOVERY_JOURNAL,
+ VIO_TYPE_SLAB_JOURNAL,
+ VIO_TYPE_SLAB_SUMMARY,
+ VIO_TYPE_SUPER_BLOCK,
+} __packed;
+
+/* Priority levels for asynchronous I/O operations performed on a vio. */
+enum vio_priority {
+ VIO_PRIORITY_LOW = 0,
+ VIO_PRIORITY_DATA = VIO_PRIORITY_LOW,
+ VIO_PRIORITY_COMPRESSED_DATA = VIO_PRIORITY_DATA,
+ VIO_PRIORITY_METADATA,
+ VIO_PRIORITY_HIGH,
+} __packed;
+
+/*
+ * A wrapper for a bio. All I/O to the storage below a vdo is conducted via vios.
+ */
+struct vio {
+ /* The completion for this vio */
+ struct vdo_completion completion;
+
+ /* The bio zone in which I/O should be processed */
+ zone_count_t bio_zone;
+
+ /* The queueing priority of the vio operation */
+ enum vio_priority priority;
+
+ /* The vio type is used for statistics and instrumentation. */
+ enum vio_type type;
+
+ /* The size of this vio in blocks */
+ unsigned int block_count;
+
+ /* The data being read or written. */
+ char *data;
+
+ /* The VDO-owned bio to use for all IO for this vio */
+ struct bio *bio;
+
+ /*
+ * A list of enqueued bios with consecutive block numbers, stored by vdo_submit_bio() under
+ * the first-enqueued vio. The other vios are found via their bio entries in this list, and
+ * are not added to the work queue as separate completions.
+ */
+ struct bio_list bios_merged;
+};
+
+#endif /* VDO_TYPES_H */
new file mode 100644
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#include "wait-queue.h"
+
+#include <linux/device-mapper.h>
+
+#include "permassert.h"
+
+#include "status-codes.h"
+
+/**
+ * vdo_enqueue_waiter() - Add a waiter to the tail end of a wait queue.
+ * @queue: The queue to which to add the waiter.
+ * @waiter: The waiter to add to the queue.
+ *
+ * The waiter must not already be waiting in a queue.
+ *
+ * Return: VDO_SUCCESS or an error code.
+ */
+void vdo_enqueue_waiter(struct wait_queue *queue, struct waiter *waiter)
+{
+ BUG_ON(waiter->next_waiter != NULL);
+
+ if (queue->last_waiter == NULL) {
+ /*
+ * The queue is empty, so form the initial circular list by self-linking the
+ * initial waiter.
+ */
+ waiter->next_waiter = waiter;
+ } else {
+ /* Splice the new waiter in at the end of the queue. */
+ waiter->next_waiter = queue->last_waiter->next_waiter;
+ queue->last_waiter->next_waiter = waiter;
+ }
+
+ /* In both cases, the waiter we added to the ring becomes the last waiter. */
+ queue->last_waiter = waiter;
+ queue->queue_length += 1;
+}
+
+/**
+ * vdo_transfer_all_waiters() - Transfer all waiters from one wait queue to a second queue,
+ * emptying the first queue.
+ * @from_queue: The queue containing the waiters to move.
+ * @to_queue: The queue that will receive the waiters from the first queue.
+ */
+void vdo_transfer_all_waiters(struct wait_queue *from_queue, struct wait_queue *to_queue)
+{
+ /* If the source queue is empty, there's nothing to do. */
+ if (!vdo_has_waiters(from_queue))
+ return;
+
+ if (vdo_has_waiters(to_queue)) {
+ /*
+ * Both queues are non-empty. Splice the two circular lists together by swapping
+ * the next (head) pointers in the list tails.
+ */
+ struct waiter *from_head = from_queue->last_waiter->next_waiter;
+ struct waiter *to_head = to_queue->last_waiter->next_waiter;
+
+ to_queue->last_waiter->next_waiter = from_head;
+ from_queue->last_waiter->next_waiter = to_head;
+ }
+
+ to_queue->last_waiter = from_queue->last_waiter;
+ to_queue->queue_length += from_queue->queue_length;
+ vdo_initialize_wait_queue(from_queue);
+}
+
+/**
+ * vdo_notify_all_waiters() - Notify all the entries waiting in a queue.
+ * @queue: The wait queue containing the waiters to notify.
+ * @callback: The function to call to notify each waiter, or NULL to invoke the callback field
+ * registered in each waiter.
+ * @context: The context to pass to the callback function.
+ *
+ * Notifies all the entries waiting in a queue to continue execution by invoking a callback
+ * function on each of them in turn. The queue is copied and emptied before invoking any callbacks,
+ * and only the waiters that were in the queue at the start of the call will be notified.
+ */
+void vdo_notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, void *context)
+{
+ /*
+ * Copy and empty the queue first, avoiding the possibility of an infinite loop if entries
+ * are returned to the queue by the callback function.
+ */
+ struct wait_queue waiters;
+
+ vdo_initialize_wait_queue(&waiters);
+ vdo_transfer_all_waiters(queue, &waiters);
+
+ /* Drain the copied queue, invoking the callback on every entry. */
+ while (vdo_notify_next_waiter(&waiters, callback, context))
+ /* All the work is done by the loop condition. */
+ ;
+}
+
+/**
+ * vdo_get_first_waiter() - Return the waiter that is at the head end of a wait queue.
+ * @queue: The queue from which to get the first waiter.
+ *
+ * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty.
+ */
+struct waiter *vdo_get_first_waiter(const struct wait_queue *queue)
+{
+ struct waiter *last_waiter = queue->last_waiter;
+
+ if (last_waiter == NULL)
+ /* There are no waiters, so we're done. */
+ return NULL;
+
+ /* The queue is circular, so the last entry links to the head of the queue. */
+ return last_waiter->next_waiter;
+}
+
+/**
+ * vdo_dequeue_matching_waiters() - Remove all waiters that match based on the specified matching
+ * method and append them to a wait_queue.
+ * @queue: The wait queue to process.
+ * @match_method: The method to determine matching.
+ * @match_context: Contextual info for the match method.
+ * @matched_queue: A wait_queue to store matches.
+ */
+void vdo_dequeue_matching_waiters(struct wait_queue *queue,
+ waiter_match *match_method,
+ void *match_context,
+ struct wait_queue *matched_queue)
+{
+ struct wait_queue matched_waiters, iteration_queue;
+
+ vdo_initialize_wait_queue(&matched_waiters);
+
+ vdo_initialize_wait_queue(&iteration_queue);
+ vdo_transfer_all_waiters(queue, &iteration_queue);
+ while (vdo_has_waiters(&iteration_queue)) {
+ struct waiter *waiter = vdo_dequeue_next_waiter(&iteration_queue);
+
+ vdo_enqueue_waiter((match_method(waiter, match_context) ?
+ &matched_waiters :
+ queue),
+ waiter);
+ }
+
+ vdo_transfer_all_waiters(&matched_waiters, matched_queue);
+}
+
+/**
+ * vdo_dequeue_next_waiter() - Remove the first waiter from the head end of a wait queue.
+ * @queue: The wait queue from which to remove the first entry.
+ *
+ * The caller will be responsible for waking the waiter by invoking the correct callback function
+ * to resume its execution.
+ *
+ * Return: The first (oldest) waiter in the queue, or NULL if the queue is empty.
+ */
+struct waiter *vdo_dequeue_next_waiter(struct wait_queue *queue)
+{
+ struct waiter *first_waiter = vdo_get_first_waiter(queue);
+ struct waiter *last_waiter = queue->last_waiter;
+
+ if (first_waiter == NULL)
+ return NULL;
+
+ if (first_waiter == last_waiter)
+ /* The queue has a single entry, so just empty it out by nulling the tail. */
+ queue->last_waiter = NULL;
+ else
+ /*
+ * The queue has more than one entry, so splice the first waiter out of the
+ * circular queue.
+ */
+ last_waiter->next_waiter = first_waiter->next_waiter;
+
+ /* The waiter is no longer in a wait queue. */
+ first_waiter->next_waiter = NULL;
+ queue->queue_length -= 1;
+ return first_waiter;
+}
+
+/**
+ * vdo_notify_next_waiter() - Notify the next entry waiting in a queue.
+ * @queue: The wait queue containing the waiter to notify.
+ * @callback: The function to call to notify the waiter, or NULL to invoke the callback field
+ * registered in the waiter.
+ * @context: The context to pass to the callback function.
+ *
+ * Notifies the next entry waiting in a queue to continue execution by invoking a callback function
+ * on it after removing it from the queue.
+ *
+ * Return: true if there was a waiter in the queue.
+ */
+bool vdo_notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, void *context)
+{
+ struct waiter *waiter = vdo_dequeue_next_waiter(queue);
+
+ if (waiter == NULL)
+ return false;
+
+ if (callback == NULL)
+ callback = waiter->callback;
+ (*callback)(waiter, context);
+ return true;
+}
+
+/**
+ * vdo_get_next_waiter() - Get the waiter after this one, for debug iteration.
+ * @queue: The wait queue.
+ * @waiter: A waiter.
+ *
+ * Return: The next waiter, or NULL.
+ */
+const struct waiter *
+vdo_get_next_waiter(const struct wait_queue *queue, const struct waiter *waiter)
+{
+ struct waiter *first_waiter = vdo_get_first_waiter(queue);
+
+ if (waiter == NULL)
+ return first_waiter;
+ return ((waiter->next_waiter != first_waiter) ? waiter->next_waiter : NULL);
+}
new file mode 100644
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright 2023 Red Hat
+ */
+
+#ifndef VDO_WAIT_QUEUE_H
+#define VDO_WAIT_QUEUE_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+/**
+ * DOC: Wait queues.
+ *
+ * A wait queue is a circular list of entries waiting to be notified of a change in a condition.
+ * Keeping a circular list allows the queue structure to simply be a pointer to the tail (newest)
+ * entry in the queue, supporting constant-time enqueue and dequeue operations. A null pointer is
+ * an empty queue.
+ *
+ * An empty queue:
+ * queue0.last_waiter -> NULL
+ *
+ * A singleton queue:
+ * queue1.last_waiter -> entry1 -> entry1 -> [...]
+ *
+ * A three-element queue:
+ * queue2.last_waiter -> entry3 -> entry1 -> entry2 -> entry3 -> [...]
+ */
+
+struct waiter;
+
+struct wait_queue {
+ /* The tail of the queue, the last (most recently added) entry */
+ struct waiter *last_waiter;
+ /* The number of waiters currently in the queue */
+ size_t queue_length;
+};
+
+/**
+ * typedef waiter_callback - Callback type for functions which will be called to resume processing
+ * of a waiter after it has been removed from its wait queue.
+ */
+typedef void waiter_callback(struct waiter *waiter, void *context);
+
+/**
+ * typedef waiter_match - Method type for waiter matching methods.
+ *
+ * A waiter_match method returns false if the waiter does not match.
+ */
+typedef bool waiter_match(struct waiter *waiter, void *context);
+
+/* The queue entry structure for entries in a wait_queue. */
+struct waiter {
+ /*
+ * The next waiter in the queue. If this entry is the last waiter, then this is actually a
+ * pointer back to the head of the queue.
+ */
+ struct waiter *next_waiter;
+
+ /* Optional waiter-specific callback to invoke when waking this waiter. */
+ waiter_callback *callback;
+};
+
+/**
+ * is_waiting() - Check whether a waiter is waiting.
+ * @waiter: The waiter to check.
+ *
+ * Return: true if the waiter is on some wait_queue.
+ */
+static inline bool vdo_is_waiting(struct waiter *waiter)
+{
+ return (waiter->next_waiter != NULL);
+}
+
+/**
+ * initialize_wait_queue() - Initialize a wait queue.
+ * @queue: The queue to initialize.
+ */
+static inline void vdo_initialize_wait_queue(struct wait_queue *queue)
+{
+ *queue = (struct wait_queue) {
+ .last_waiter = NULL,
+ .queue_length = 0,
+ };
+}
+
+/**
+ * has_waiters() - Check whether a wait queue has any entries waiting in it.
+ * @queue: The queue to query.
+ *
+ * Return: true if there are any waiters in the queue.
+ */
+static inline bool __must_check vdo_has_waiters(const struct wait_queue *queue)
+{
+ return (queue->last_waiter != NULL);
+}
+
+void vdo_enqueue_waiter(struct wait_queue *queue, struct waiter *waiter);
+
+void vdo_notify_all_waiters(struct wait_queue *queue, waiter_callback *callback, void *context);
+
+bool vdo_notify_next_waiter(struct wait_queue *queue, waiter_callback *callback, void *context);
+
+void vdo_transfer_all_waiters(struct wait_queue *from_queue, struct wait_queue *to_queue);
+
+struct waiter *vdo_get_first_waiter(const struct wait_queue *queue);
+
+void vdo_dequeue_matching_waiters(struct wait_queue *queue,
+ waiter_match *match_method,
+ void *match_context,
+ struct wait_queue *matched_queue);
+
+struct waiter *vdo_dequeue_next_waiter(struct wait_queue *queue);
+
+/**
+ * count_waiters() - Count the number of waiters in a wait queue.
+ * @queue: The wait queue to query.
+ *
+ * Return: The number of waiters in the queue.
+ */
+static inline size_t __must_check vdo_count_waiters(const struct wait_queue *queue)
+{
+ return queue->queue_length;
+}
+
+const struct waiter * __must_check
+vdo_get_next_waiter(const struct wait_queue *queue, const struct waiter *waiter);
+
+#endif /* VDO_WAIT_QUEUE_H */