@@ -47,10 +47,12 @@ static inline char *bmname(struct bitmap *bitmap)
* if we find our page, we increment the page's refcount so that it stays
* allocated while we're using it
*/
-static int md_bitmap_checkpage(struct bitmap_counts *bitmap,
- unsigned long page, int create, int no_hijack)
-__releases(bitmap->lock)
-__acquires(bitmap->lock)
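+/*
+ * When a bmclock is passed, the caller holds that per-counter lock
+ * (plus mlock for reading); with bmclock == NULL the caller holds
+ * mlock for writing. The per-counter lock (or mlock, respectively)
+ * is dropped and re-taken around the page allocation below.
+ */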
+static int md_bitmap_checkpage(struct bitmap_counts *bitmap, unsigned long page,
+ int create, int no_hijack, spinlock_t *bmclock)
+__releases(bmclock)
+__acquires(bmclock)
+__releases(bitmap->mlock)
+__acquires(bitmap->mlock)
{
unsigned char *mappage;
@@ -73,7 +75,10 @@ __acquires(bitmap->lock)
/* this page has not been allocated yet */
- spin_unlock_irq(&bitmap->lock);
+ if (bmclock)
+ spin_unlock_irq(bmclock); /* lock for bmc */
+ else
+ write_unlock_irq(&bitmap->mlock); /* lock for metadata */
/* It is possible that this is being called inside a
* prepare_to_wait/finish_wait loop from raid5c:make_request().
* In general it is not permitted to sleep in that context as it
@@ -88,7 +93,11 @@ __acquires(bitmap->lock)
*/
sched_annotate_sleep();
mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
- spin_lock_irq(&bitmap->lock);
+
+ if (bmclock)
+ spin_lock_irq(bmclock); /* lock for bmc */
+ else
+ write_lock_irq(&bitmap->mlock); /* lock for metadata */
if (mappage == NULL) {
pr_debug("md/bitmap: map page allocation failed, hijacking\n");
@@ -1202,16 +1211,35 @@ void md_bitmap_write_all(struct bitmap *bitmap)
static void md_bitmap_count_page(struct bitmap_counts *bitmap,
sector_t offset, int inc)
{
- sector_t chunk = offset >> bitmap->chunkshift;
- unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
+ /*
+ * The stripe heads are spread across different locations in the
+ * SSDs via a configurable hash function rather than being mapped
+ * to a contiguous SSD space.
+ * Sequential write requests are therefore shuffled onto different
+ * counters to reduce counter preemption.
+ */
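+ /*
+ * Illustrative example (actual values depend on PAGE_SIZE and the
+ * bitmap chunk size): with 4 KiB pages and 64 KiB chunks there are
+ * 16 pages per chunk, so the mask below drops the top 4 bits of the
+ * page number and 16 consecutive pages land on 16 different
+ * counters instead of all hitting the single counter of one chunk.
+ */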
+ sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
+ sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
+ unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
+ unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
+ (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
+ unsigned long cntid = blockno & mask;
+ unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
+
bitmap->bp[page].count += inc;
md_bitmap_checkfree(bitmap, page);
}
static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
{
- sector_t chunk = offset >> bitmap->chunkshift;
- unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
+ sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
+ sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
+ unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
+ unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
+ (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
+ unsigned long cntid = blockno & mask;
+ unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
+
struct bitmap_page *bp = &bitmap->bp[page];
if (!bp->pending)
@@ -1220,7 +1248,7 @@ static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
sector_t offset, sector_t *blocks,
- int create);
+ int create, spinlock_t *bmclock);
/*
* bitmap daemon -- periodically wakes up to clean bits and flush pages
@@ -1288,7 +1316,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
* decrement and handle accordingly.
*/
counts = &bitmap->counts;
- spin_lock_irq(&counts->lock);
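+ /*
+ * Taking mlock for writing excludes all per-counter updates (they
+ * hold mlock for reading), so the counters can be scanned below
+ * with a NULL bmclock.
+ */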
+ write_lock_irq(&counts->mlock);
nextpage = 0;
for (j = 0; j < counts->chunks; j++) {
bitmap_counter_t *bmc;
@@ -1303,7 +1331,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
}
- bmc = md_bitmap_get_counter(counts, block, &blocks, 0);
+ bmc = md_bitmap_get_counter(counts, block, &blocks, 0, NULL);
if (!bmc) {
j |= PAGE_COUNTER_MASK;
continue;
@@ -1319,7 +1347,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
bitmap->allclean = 0;
}
}
- spin_unlock_irq(&counts->lock);
+ write_unlock_irq(&counts->mlock);
md_bitmap_wait_writes(bitmap);
/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
@@ -1353,21 +1381,29 @@ void md_bitmap_daemon_work(struct mddev *mddev)
static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
sector_t offset, sector_t *blocks,
- int create)
-__releases(bitmap->lock)
-__acquires(bitmap->lock)
+ int create, spinlock_t *bmclock)
+__releases(bmclock)
+__acquires(bmclock)
+__releases(bitmap->mlock)
+__acquires(bitmap->mlock)
{
/* If 'create', we might release the lock and reclaim it.
* The lock must have been taken with interrupts enabled.
* If !create, we don't release the lock.
*/
- sector_t chunk = offset >> bitmap->chunkshift;
- unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
- unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
+ sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
+ sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
+ unsigned long bits = totblocks ? fls((totblocks - 1)) : 0;
+ unsigned long mask = ULONG_MAX << bits | ~(ULONG_MAX <<
+ (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
+ unsigned long cntid = blockno & mask;
+ unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
+ unsigned long pageoff = (cntid & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
+
sector_t csize;
int err;
- err = md_bitmap_checkpage(bitmap, page, create, 0);
+ err = md_bitmap_checkpage(bitmap, page, create, 0, bmclock);
if (bitmap->bp[page].hijacked ||
bitmap->bp[page].map == NULL)
@@ -1393,6 +1429,28 @@ __acquires(bitmap->lock)
&(bitmap->bp[page].map[pageoff]);
}
+/* map a counter to the spinlock that protects it (set-associative) */
+static spinlock_t *md_bitmap_get_bmclock(struct bitmap_counts *bitmap, sector_t offset);
+
+static spinlock_t *md_bitmap_get_bmclock(struct bitmap_counts *bitmap, sector_t offset)
+{
+ sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
+ sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
+ unsigned long bitscnt = totblocks ? fls((totblocks - 1)) : 0;
+ unsigned long maskcnt = ULONG_MAX << bitscnt | ~(ULONG_MAX << (bitscnt -
+ (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
+ unsigned long cntid = blockno & maskcnt;
+
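+ /*
+ * Reduce the counter id to a lock id: only the low bits of the
+ * counter id select a lock, so neighbouring counters use different
+ * locks while counters that are far apart share one.
+ */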
+ unsigned long totcnts = bitmap->chunks;
+ unsigned long bitslock = totcnts ? fls((totcnts - 1)) : 0;
+ unsigned long masklock = ULONG_MAX << bitslock | ~(ULONG_MAX <<
+ (bitslock - BITMAP_COUNTER_LOCK_RATIO_SHIFT));
+ unsigned long lockid = cntid & masklock;
+
+ spinlock_t *bmclock = &(bitmap->bmclocks[lockid]);
+ return bmclock;
+}
+
int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
unsigned long sectors, int behind)
{
if (!bitmap)
@@ -1412,11 +1470,15 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s
while (sectors) {
sector_t blocks;
bitmap_counter_t *bmc;
+ spinlock_t *bmclock;
- spin_lock_irq(&bitmap->counts.lock);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
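+ /*
+ * Lock order: counts.mlock is taken for reading to stabilize the
+ * counter pages (resize and the daemon take it for writing), then
+ * the per-counter bmclock serializes updates to this counter.
+ */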
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irq(bmclock);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1, bmclock);
if (!bmc) {
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
return 0;
}
@@ -1428,7 +1490,8 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s
*/
prepare_to_wait(&bitmap->overflow_wait, &__wait,
TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
schedule();
finish_wait(&bitmap->overflow_wait, &__wait);
continue;
@@ -1445,7 +1508,8 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s
(*bmc)++;
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
offset += blocks;
if (sectors > blocks)
@@ -1474,11 +1538,15 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
sector_t blocks;
unsigned long flags;
bitmap_counter_t *bmc;
+ spinlock_t *bmclock;
- spin_lock_irqsave(&bitmap->counts.lock, flags);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irqsave(bmclock, flags);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0, bmclock);
if (!bmc) {
- spin_unlock_irqrestore(&bitmap->counts.lock, flags);
+ spin_unlock_irqrestore(bmclock, flags);
+ read_unlock(&bitmap->counts.mlock);
return;
}
@@ -1500,7 +1568,8 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
md_bitmap_set_pending(&bitmap->counts, offset);
bitmap->allclean = 0;
}
- spin_unlock_irqrestore(&bitmap->counts.lock, flags);
+ spin_unlock_irqrestore(bmclock, flags);
+ read_unlock(&bitmap->counts.mlock);
offset += blocks;
if (sectors > blocks)
sectors -= blocks;
@@ -1514,13 +1583,16 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
int degraded)
{
bitmap_counter_t *bmc;
+ spinlock_t *bmclock;
int rv;
if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
*blocks = 1024;
return 1; /* always resync if no bitmap */
}
- spin_lock_irq(&bitmap->counts.lock);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irq(bmclock);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0, bmclock);
rv = 0;
if (bmc) {
/* locked */
@@ -1534,7 +1606,8 @@ static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t
}
}
}
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
return rv;
}
@@ -1566,13 +1639,16 @@ void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks
{
bitmap_counter_t *bmc;
unsigned long flags;
+ spinlock_t *bmclock;
if (bitmap == NULL) {
*blocks = 1024;
return;
}
- spin_lock_irqsave(&bitmap->counts.lock, flags);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irqsave(bmclock, flags);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0, bmclock);
if (bmc == NULL)
goto unlock;
/* locked */
@@ -1589,7 +1665,8 @@ void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks
}
}
unlock:
- spin_unlock_irqrestore(&bitmap->counts.lock, flags);
+ spin_unlock_irqrestore(bmclock, flags);
+ read_unlock(&bitmap->counts.mlock);
}
EXPORT_SYMBOL(md_bitmap_end_sync);
@@ -1670,10 +1747,15 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in
sector_t secs;
bitmap_counter_t *bmc;
- spin_lock_irq(&bitmap->counts.lock);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
+ spinlock_t *bmclock;
+
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irq(bmclock);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1, bmclock);
if (!bmc) {
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
return;
}
if (!*bmc) {
@@ -1684,7 +1766,8 @@ static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, in
}
if (needed)
*bmc |= NEEDED_MASK;
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
}
/* dirty the memory and file bits for bitmap chunks "s" to "e" */
@@ -1736,6 +1819,7 @@ void md_bitmap_free(struct bitmap *bitmap)
{
unsigned long k, pages;
struct bitmap_page *bp;
+ spinlock_t *bmclocks;
if (!bitmap) /* there was no bitmap */
return;
@@ -1756,6 +1840,7 @@ void md_bitmap_free(struct bitmap *bitmap)
bp = bitmap->counts.bp;
pages = bitmap->counts.pages;
+ bmclocks = bitmap->counts.bmclocks;
/* free all allocated memory */
@@ -1764,6 +1849,7 @@ void md_bitmap_free(struct bitmap *bitmap)
if (bp[k].map && !bp[k].hijacked)
kfree(bp[k].map);
kfree(bp);
+ kvfree(bmclocks);
kfree(bitmap);
}
EXPORT_SYMBOL(md_bitmap_free);
@@ -1831,7 +1917,9 @@ struct bitmap *md_bitmap_create(struct mddev *mddev, int slot)
if (!bitmap)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&bitmap->counts.lock);
+ /* initialize metadata lock */
+ rwlock_init(&bitmap->counts.mlock);
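+ /* the per-counter bmclocks are allocated in md_bitmap_resize() */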
+
atomic_set(&bitmap->pending_writes, 0);
init_waitqueue_head(&bitmap->write_wait);
init_waitqueue_head(&bitmap->overflow_wait);
@@ -2072,6 +2160,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int ret = 0;
long pages;
struct bitmap_page *new_bp;
+ spinlock_t *new_bmclocks;
+ int num_bmclocks, i;
if (bitmap->storage.file && !init) {
pr_info("md: cannot resize file-based bitmap\n");
@@ -2154,12 +2244,25 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
blocks = min(old_counts.chunks << old_counts.chunkshift,
chunks << chunkshift);
- spin_lock_irq(&bitmap->counts.lock);
+ write_lock_irq(&bitmap->counts.mlock);
+
+ /* initialize bmc locks */
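+ /*
+ * One bmc lock covers BITMAP_COUNTER_LOCK_RATIO counters, with the
+ * total number of locks capped at BITMAP_COUNTER_LOCK_MAX.
+ */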
+ num_bmclocks = DIV_ROUND_UP(chunks, BITMAP_COUNTER_LOCK_RATIO);
+ num_bmclocks = min(num_bmclocks, BITMAP_COUNTER_LOCK_MAX);
+
+ new_bmclocks = kvcalloc(num_bmclocks, sizeof(*new_bmclocks), GFP_KERNEL);
+ bitmap->counts.bmclocks = new_bmclocks;
+ for (i = 0; i < num_bmclocks; ++i) {
+ spinlock_t *bmclock = &(bitmap->counts.bmclocks)[i];
+
+ spin_lock_init(bmclock);
+ }
+
/* For cluster raid, need to pre-allocate bitmap */
if (mddev_is_clustered(bitmap->mddev)) {
unsigned long page;
for (page = 0; page < pages; page++) {
- ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1);
+ ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1, NULL);
if (ret) {
unsigned long k;
@@ -2189,11 +2292,12 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bitmap_counter_t *bmc_old, *bmc_new;
int set;
- bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0);
+ bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0, NULL);
set = bmc_old && NEEDED(*bmc_old);
if (set) {
- bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
+ bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks,
+ 1, NULL);
if (*bmc_new == 0) {
/* need to set on-disk bits too. */
sector_t end = block + new_blocks;
@@ -2226,7 +2330,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
int i;
while (block < (chunks << chunkshift)) {
bitmap_counter_t *bmc;
- bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
+ bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1, NULL);
if (bmc) {
/* new space. It needs to be resynced, so
* we set NEEDED_MASK.
@@ -2242,7 +2346,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
for (i = 0; i < bitmap->storage.file_pages; i++)
set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
}
- spin_unlock_irq(&bitmap->counts.lock);
+ write_unlock_irq(&bitmap->counts.mlock);
if (!init) {
md_bitmap_unplug(bitmap);
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -2,7 +2,9 @@
/*
* bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
*
- * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ * additions:
+ * Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ * Copyright (C) 2022-2023, Shushu Yi (firnyee@gmail.com)
*/
#ifndef BITMAP_H