@@ -81,6 +81,7 @@ void part_in_flight(struct request_queue *q, struct hd_struct *part,
atomic_read(&part->in_flight[1]);
}
}
+EXPORT_SYMBOL_GPL(part_in_flight);
void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
@@ -13,6 +13,7 @@
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
+#include <linux/genhd.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "extent_map.h"
@@ -28,6 +29,8 @@
#include "dev-replace.h"
#include "sysfs.h"
+#define BTRFS_RAID_1_10_MAX_MIRRORS 2
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -5166,6 +5169,104 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
return ret;
}
+/**
+ * bdev_get_queue_len - return rounded down in-flight queue length of bdev
+ *
+ * @bdev:	target bdev
+ * @round_down:	round factor, big for hdd and small for ssd, like 8 and 2
+ *
+ * Reads the per-partition in-flight counters and returns the larger of
+ * the two directions (reads vs writes), rounded DOWN to a multiple of
+ * @round_down so that small fluctuations in queue depth do not make the
+ * caller flip its mirror choice.
+ */
+static int bdev_get_queue_len(struct block_device *bdev, int round_down)
+{
+	int used;
+	struct hd_struct *bd_part = bdev->bd_part;
+	struct request_queue *rq = bdev_get_queue(bdev);
+	/* unsigned int matches the part_in_flight() signature */
+	unsigned int inflight[2] = { 0, 0 };
+
+	part_in_flight(rq, bd_part, inflight);
+
+	/* Use the busier direction as the load estimate */
+	used = max_t(unsigned int, inflight[0], inflight[1]);
+
+	/*
+	 * Try to prevent switching for every sneeze
+	 * by rounding the output down by some value
+	 */
+	return ALIGN_DOWN(used, round_down);
+}
+
+/**
+ * guess_optimal - return guessed optimal mirror
+ *
+ * @map:	mapped stripes of the chunk
+ * @num:	number of mirrors; must equal BTRFS_RAID_1_10_MAX_MIRRORS
+ * @optimal:	default mirror, expected to be pid % num_stripes
+ *
+ * Optimal is expected to be pid % num_stripes.
+ *
+ * That's generally ok for spreading load.
+ * Add some balancing based on the queue length of each device.
+ *
+ * Basic ideas:
+ * - Sequential reads generate a low number of requests,
+ *   so if the load on the drives is equal, use pid % num_stripes balancing
+ * - For mixed rotational/non-rotational mirrors, pick non-rotational as
+ *   optimal and repick if the other dev has a "significantly" shorter queue
+ * - Repick optimal if the queue length of the other mirror is shorter
+ */
+static int guess_optimal(struct map_lookup *map, int num, int optimal)
+{
+	int i;
+	int round_down = 8;
+	/* Init for missing bdevs */
+	int qlen[BTRFS_RAID_1_10_MAX_MIRRORS] = { INT_MAX, INT_MAX };
+	bool is_nonrot[BTRFS_RAID_1_10_MAX_MIRRORS] = { false, false };
+	bool all_bdev_nonrot = true;
+	bool all_bdev_rotate = true;
+	struct block_device *bdev;
+
+	/* This function is supposed to work with up to 2 mirrors */
+	ASSERT(BTRFS_RAID_1_10_MAX_MIRRORS == 2);
+	ASSERT(BTRFS_RAID_1_10_MAX_MIRRORS == num);
+
+	/* Check accessible bdevs */
+	for (i = 0; i < BTRFS_RAID_1_10_MAX_MIRRORS; i++) {
+		bdev = map->stripes[i].dev->bdev;
+		if (bdev) {
+			qlen[i] = 0;
+			is_nonrot[i] = blk_queue_nonrot(bdev_get_queue(bdev));
+			if (is_nonrot[i])
+				all_bdev_rotate = false;
+			else
+				all_bdev_nonrot = false;
+		}
+	}
+
+	/*
+	 * Don't bother with computation
+	 * if only one of the two bdevs is accessible
+	 */
+	if (qlen[0] == INT_MAX)
+		return 1;
+	if (qlen[1] == INT_MAX)
+		return 0;
+
+	/* SSDs tolerate more frequent switching; use a smaller round factor */
+	if (all_bdev_nonrot)
+		round_down = 2;
+
+	for (i = 0; i < BTRFS_RAID_1_10_MAX_MIRRORS; i++) {
+		bdev = map->stripes[i].dev->bdev;
+		qlen[i] = bdev_get_queue_len(bdev, round_down);
+	}
+
+	/*
+	 * For the mixed case, pick the non-rotational dev as optimal.
+	 * With both bdevs present, the flags are equal only when both are
+	 * false, i.e. one rotational and one non-rotational mirror.
+	 */
+	if (all_bdev_rotate == all_bdev_nonrot) {
+		if (is_nonrot[0])
+			optimal = 0;
+		else
+			optimal = 1;
+	}
+
+	/*
+	 * Repick if the other mirror has a shorter queue. Note: must not
+	 * use the loop counter here — after the loop above i == 2, which
+	 * is out of range for a 2-mirror array.
+	 */
+	if (qlen[optimal] > qlen[(optimal + 1) % 2])
+		optimal = (optimal + 1) % 2;
+
+	return optimal;
+}
+
static int find_live_mirror(struct btrfs_fs_info *fs_info,
struct map_lookup *map, int first,
int dev_replace_is_ongoing)
@@ -5184,7 +5285,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info,
else
num_stripes = map->num_stripes;
- preferred_mirror = first + current->pid % num_stripes;
+ preferred_mirror = first + guess_optimal(map, num_stripes,
+ current->pid % num_stripes);
if (dev_replace_is_ongoing &&
fs_info->dev_replace.cont_reading_from_srcdev_mode ==