===================================================================
@@ -186,6 +186,7 @@ struct dm_integrity_c {
/* these variables are locked with endio_wait.lock */
struct rb_root in_progress;
+ struct list_head wait_list;
wait_queue_head_t endio_wait;
struct workqueue_struct *wait_wq;
@@ -233,7 +234,14 @@ struct dm_integrity_c {
struct dm_integrity_range {
sector_t logical_sector;
unsigned n_sectors;
- struct rb_node node;
+ bool waiting;
+ union {
+ struct rb_node node;
+ struct {
+ struct task_struct *task;
+ struct list_head wait_entry;
+ };
+ };
};
struct dm_integrity_io {
@@ -867,13 +875,27 @@ static void copy_from_journal(struct dm_
}
}
-static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+static bool ranges_overlap(struct dm_integrity_range *range1, struct dm_integrity_range *range2)
+{
+ return range1->logical_sector < range2->logical_sector + range2->n_sectors &&
+ range2->logical_sector + range2->n_sectors > range2->logical_sector;
+}
+
+static bool add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range, bool check_waiting)
{
struct rb_node **n = &ic->in_progress.rb_node;
struct rb_node *parent;
BUG_ON((new_range->logical_sector | new_range->n_sectors) & (unsigned)(ic->sectors_per_block - 1));
+ if (likely(check_waiting)) {
+ struct dm_integrity_range *range;
+ list_for_each_entry(range, &ic->wait_list, wait_entry) {
+ if (unlikely(ranges_overlap(range, new_range)))
+ return false;
+ }
+ }
+
parent = NULL;
while (*n) {
@@ -898,7 +920,21 @@ static bool add_new_range(struct dm_inte
static void remove_range_unlocked(struct dm_integrity_c *ic, struct dm_integrity_range *range)
{
rb_erase(&range->node, &ic->in_progress);
- wake_up_locked(&ic->endio_wait);
+ while (unlikely(!list_empty(&ic->wait_list))) {
+ struct dm_integrity_range *last_range = list_first_entry(&ic->wait_list, struct dm_integrity_range, wait_entry);
+ struct task_struct *last_range_task;
+ if (!ranges_overlap(range, last_range))
+ break;
+ last_range_task = last_range->task;
+ list_del(&last_range->wait_entry);
+ if (!add_new_range(ic, last_range, false)) {
+ last_range->task = last_range_task;
+ list_add(&last_range->wait_entry, &ic->wait_list);
+ break;
+ }
+ last_range->waiting = false;
+ wake_up_process(last_range_task);
+ }
}
static void remove_range(struct dm_integrity_c *ic, struct dm_integrity_range *range)
@@ -910,6 +946,19 @@ static void remove_range(struct dm_integ
spin_unlock_irqrestore(&ic->endio_wait.lock, flags);
}
+static void wait_and_add_new_range(struct dm_integrity_c *ic, struct dm_integrity_range *new_range)
+{
+ new_range->waiting = true;
+ list_add_tail(&new_range->wait_entry, &ic->wait_list);
+ new_range->task = current;
+ do {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ spin_unlock_irq(&ic->endio_wait.lock);
+ io_schedule();
+ spin_lock_irq(&ic->endio_wait.lock);
+ } while (unlikely(new_range->waiting));
+}
+
static void init_journal_node(struct journal_node *node)
{
RB_CLEAR_NODE(&node->node);
@@ -1662,7 +1711,7 @@ retry:
}
}
}
- if (unlikely(!add_new_range(ic, &dio->range))) {
+ if (unlikely(!add_new_range(ic, &dio->range, true))) {
/*
* We must not sleep in the request routine because it could
* stall bios on current->bio_list.
@@ -1674,10 +1723,8 @@ offload_to_thread:
INIT_WORK(&dio->work, integrity_bio_wait);
queue_work(ic->wait_wq, &dio->work);
return;
- } else {
- sleep_on_endio_wait(ic);
- goto retry;
}
+ wait_and_add_new_range(ic, &dio->range);
}
spin_unlock_irq(&ic->endio_wait.lock);
@@ -1900,8 +1947,8 @@ static void do_journal_write(struct dm_i
io->range.n_sectors = (k - j) << ic->sb->log2_sectors_per_block;
spin_lock_irq(&ic->endio_wait.lock);
- while (unlikely(!add_new_range(ic, &io->range)))
- sleep_on_endio_wait(ic);
+ if (unlikely(!add_new_range(ic, &io->range, true)))
+ wait_and_add_new_range(ic, &io->range);
if (likely(!from_replay)) {
struct journal_node *section_node = &ic->journal_tree[i * ic->journal_section_entries];
@@ -2856,6 +2903,7 @@ static int dm_integrity_ctr(struct dm_ta
ti->per_io_data_size = sizeof(struct dm_integrity_io);
ic->in_progress = RB_ROOT;
+ INIT_LIST_HEAD(&ic->wait_list);
init_waitqueue_head(&ic->endio_wait);
bio_list_init(&ic->flush_bio_list);
init_waitqueue_head(&ic->copy_to_journal_wait);
@@ -3200,6 +3248,7 @@ static void dm_integrity_dtr(struct dm_t
struct dm_integrity_c *ic = ti->private;
BUG_ON(!RB_EMPTY_ROOT(&ic->in_progress));
+ BUG_ON(!list_empty(&ic->wait_list));
if (ic->metadata_wq)
destroy_workqueue(ic->metadata_wq);
dm-integrity locks a range of sectors to prevent concurrent I/O or journal writeback. These locks were not fair - so that many small overlapping I/Os could starve a large I/O indefinitely. This patch makes the range locks fair. The ranges that are waiting are added to the list "wait_list". If a new I/O overlaps some of the waiting I/Os, it is not dispatches, but it is also added to that wait list. Entries on the wait list are processed in first-in-first-out order, so that an I/O can't starve indefinitely. Signed-off-by: Mikulas Patocka <mpatocka@redhat.com> --- drivers/md/dm-integrity.c | 67 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 58 insertions(+), 9 deletions(-) -- dm-devel mailing list dm-devel@redhat.com https://www.redhat.com/mailman/listinfo/dm-devel