@@ -48,9 +48,11 @@ static inline char *bmname(struct bitmap
* allocated while we're using it
*/
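+/*
+ * The caller tells us which lock it holds: if locktype is nonzero it
+ * holds the per-counter lock bmclock, otherwise it holds the metadata
+ * lock mlock for writing.  Whichever lock is held is dropped and
+ * re-acquired around the page allocation below.
+ */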
static int md_bitmap_checkpage(struct bitmap_counts *bitmap,
- unsigned long page, int create, int no_hijack)
-__releases(bitmap->lock)
-__acquires(bitmap->lock)
+ unsigned long page, int create, int no_hijack, spinlock_t *bmclock, int locktype)
+__releases(bmclock)
+__acquires(bmclock)
+__releases(bitmap->mlock)
+__acquires(bitmap->mlock)
{
unsigned char *mappage;
@@ -65,8 +67,10 @@ __acquires(bitmap->lock)
return -ENOENT;
/* this page has not been allocated yet */
-
- spin_unlock_irq(&bitmap->lock);
+ if (locktype)
+ spin_unlock_irq(bmclock);
+ else
+ write_unlock_irq(&bitmap->mlock);
/* It is possible that this is being called inside a
* prepare_to_wait/finish_wait loop from raid5c:make_request().
* In general it is not permitted to sleep in that context as it
@@ -81,7 +85,11 @@ __acquires(bitmap->lock)
*/
sched_annotate_sleep();
mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
- spin_lock_irq(&bitmap->lock);
+
+ if (locktype)
+ spin_lock_irq(bmclock);
+ else
+ write_lock_irq(&bitmap->mlock);
if (mappage == NULL) {
pr_debug("md/bitmap: map page allocation failed, hijacking\n");
@@ -398,7 +406,7 @@ static int read_file_page(struct file *f
}
wait_event(bitmap->write_wait,
- atomic_read(&bitmap->pending_writes)==0);
+ atomic_read(&bitmap->pending_writes) == 0);
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
ret = -EIO;
out:
@@ -457,7 +465,7 @@ static void md_bitmap_wait_writes(struct
{
if (bitmap->storage.file)
wait_event(bitmap->write_wait,
- atomic_read(&bitmap->pending_writes)==0);
+ atomic_read(&bitmap->pending_writes) == 0);
else
/* Note that we ignore the return value. The writes
* might have failed, but that would just mean that
@@ -1246,16 +1254,32 @@ void md_bitmap_write_all(struct bitmap *
static void md_bitmap_count_page(struct bitmap_counts *bitmap,
sector_t offset, int inc)
{
- sector_t chunk = offset >> bitmap->chunkshift;
- unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
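+ /*
+ * Map the page-sized block number to a counter id by keeping its
+ * low-order bits (enough to index the chunks, rounded up to a
+ * power of two), so adjacent blocks land on different counters
+ * (and thus different bmclocks) instead of filling one chunk's
+ * counter first.  E.g. with 16 chunks of 4 blocks each, blocks
+ * 0..15 use counters 0..15 and block 16 wraps back to counter 0.
+ */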
+ sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
+ sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
+ unsigned long bits = totblocks ? fls64(totblocks - 1) : 0;
+ unsigned long mask = ULONG_MAX << bits |
+ ~(ULONG_MAX << (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
+ unsigned long cntid = blockno & mask;
+ unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
+
bitmap->bp[page].count += inc;
md_bitmap_checkfree(bitmap, page);
}
static void md_bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset)
{
- sector_t chunk = offset >> bitmap->chunkshift;
- unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
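+ /* same interleaved block-to-counter mapping as md_bitmap_count_page() */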
+ sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
+ sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
+ unsigned long bits = totblocks ? fls64(totblocks - 1) : 0;
+ unsigned long mask = ULONG_MAX << bits |
+ ~(ULONG_MAX << (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
+ unsigned long cntid = blockno & mask;
+ unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
struct bitmap_page *bp = &bitmap->bp[page];
if (!bp->pending)
@@ -1264,7 +1288,7 @@ static void md_bitmap_set_pending(struct
static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
sector_t offset, sector_t *blocks,
- int create);
+ int create, spinlock_t *bmclock, int locktype);
static void mddev_set_timeout(struct mddev *mddev, unsigned long timeout,
bool force)
@@ -1349,7 +1373,7 @@ void md_bitmap_daemon_work(struct mddev
* decrement and handle accordingly.
*/
counts = &bitmap->counts;
- spin_lock_irq(&counts->lock);
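+ /*
+ * Taking mlock for writing excludes all bmclock holders, so the
+ * sweep below can touch every counter without per-counter locks.
+ */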
+ write_lock_irq(&counts->mlock);
nextpage = 0;
for (j = 0; j < counts->chunks; j++) {
bitmap_counter_t *bmc;
@@ -1364,7 +1388,7 @@ void md_bitmap_daemon_work(struct mddev
counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0;
}
- bmc = md_bitmap_get_counter(counts, block, &blocks, 0);
+ bmc = md_bitmap_get_counter(counts, block, &blocks, 0, NULL, 0);
if (!bmc) {
j |= PAGE_COUNTER_MASK;
continue;
@@ -1380,7 +1404,7 @@ void md_bitmap_daemon_work(struct mddev
bitmap->allclean = 0;
}
}
- spin_unlock_irq(&counts->lock);
+ write_unlock_irq(&counts->mlock);
md_bitmap_wait_writes(bitmap);
/* Now start writeout on any page in NEEDWRITE that isn't DIRTY.
@@ -1413,17 +1437,27 @@ void md_bitmap_daemon_work(struct mddev
static bitmap_counter_t *md_bitmap_get_counter(struct bitmap_counts *bitmap,
sector_t offset, sector_t *blocks,
- int create)
-__releases(bitmap->lock)
-__acquires(bitmap->lock)
+ int create, spinlock_t *bmclock, int locktype)
+__releases(bmclock)
+__acquires(bmclock)
+__releases(bitmap->mlock)
+__acquires(bitmap->mlock)
{
/* If 'create', we might release the lock and reclaim it.
* The lock must have been taken with interrupts enabled.
* If !create, we don't release the lock.
*/
- sector_t chunk = offset >> bitmap->chunkshift;
- unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
- unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
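+ /* same interleaved block-to-counter mapping as md_bitmap_count_page() */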
+ sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
+ sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
+ unsigned long bits = totblocks ? fls64(totblocks - 1) : 0;
+ unsigned long mask = ULONG_MAX << bits |
+ ~(ULONG_MAX << (bits - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
+ unsigned long cntid = blockno & mask;
+ unsigned long page = cntid >> PAGE_COUNTER_SHIFT;
+ unsigned long pageoff = (cntid & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
sector_t csize;
int err;
@@ -1435,7 +1469,7 @@ __acquires(bitmap->lock)
*/
return NULL;
}
- err = md_bitmap_checkpage(bitmap, page, create, 0);
+ err = md_bitmap_checkpage(bitmap, page, create, 0, bmclock, locktype);
if (bitmap->bp[page].hijacked ||
bitmap->bp[page].map == NULL)
@@ -1461,6 +1495,36 @@ __acquires(bitmap->lock)
&(bitmap->bp[page].map[pageoff]);
}
+/* set-associative mapping of counters to locks:
+ * e.g. with 14 counters and BITMAP_COUNTER_LOCK_RATIO_SHIFT == 2
+ * (every 2^2 counters share one lock), counters 0, 4, 8 and 12 share
+ * a lock, as do 1, 5, 9, 13 | 2, 6, 10 | 3, 7, 11.
+ */
+static spinlock_t *md_bitmap_get_bmclock(struct bitmap_counts *bitmap, sector_t offset)
+{
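+ /*
+ * lockid is the counter id modulo the number of bmclocks (rounded
+ * up to a power of two), distributing counters round-robin across
+ * the lock array.
+ */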
+ sector_t blockno = offset >> (PAGE_SHIFT - SECTOR_SHIFT);
+ sector_t totblocks = bitmap->chunks << (bitmap->chunkshift - (PAGE_SHIFT - SECTOR_SHIFT));
+ unsigned long bitscnt = totblocks ? fls64(totblocks - 1) : 0;
+ unsigned long maskcnt = ULONG_MAX << bitscnt |
+ ~(ULONG_MAX << (bitscnt - (bitmap->chunkshift + SECTOR_SHIFT - PAGE_SHIFT)));
+ unsigned long cntid = blockno & maskcnt;
+
+ unsigned long totcnts = bitmap->chunks;
+ unsigned long bitslock = totcnts ? fls64(totcnts - 1) : 0;
+ unsigned long masklock = ULONG_MAX << bitslock |
+ ~(ULONG_MAX << (bitslock - BITMAP_COUNTER_LOCK_RATIO_SHIFT));
+ unsigned long lockid = cntid & masklock;
+
+ return &bitmap->bmclocks[lockid];
+}
+
int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
{
if (!bitmap)
@@ -1480,11 +1544,15 @@ int md_bitmap_startwrite(struct bitmap *
while (sectors) {
sector_t blocks;
bitmap_counter_t *bmc;
+ spinlock_t *bmclock;
- spin_lock_irq(&bitmap->counts.lock);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1);
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
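+ /* lock order: counts.mlock (read) first, then the counter's bmclock */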
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irq(bmclock);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 1, bmclock, 1);
if (!bmc) {
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
return 0;
}
@@ -1496,7 +1564,8 @@ int md_bitmap_startwrite(struct bitmap *
*/
prepare_to_wait(&bitmap->overflow_wait, &__wait,
TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
schedule();
finish_wait(&bitmap->overflow_wait, &__wait);
continue;
@@ -1513,7 +1582,8 @@ int md_bitmap_startwrite(struct bitmap *
(*bmc)++;
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
offset += blocks;
if (sectors > blocks)
@@ -1542,11 +1612,15 @@ void md_bitmap_endwrite(struct bitmap *b
sector_t blocks;
unsigned long flags;
bitmap_counter_t *bmc;
+ spinlock_t *bmclock;
- spin_lock_irqsave(&bitmap->counts.lock, flags);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0);
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irqsave(bmclock, flags);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, &blocks, 0, bmclock, 1);
if (!bmc) {
- spin_unlock_irqrestore(&bitmap->counts.lock, flags);
+ spin_unlock_irqrestore(bmclock, flags);
+ read_unlock(&bitmap->counts.mlock);
return;
}
@@ -1568,7 +1642,8 @@ void md_bitmap_endwrite(struct bitmap *b
md_bitmap_set_pending(&bitmap->counts, offset);
bitmap->allclean = 0;
}
- spin_unlock_irqrestore(&bitmap->counts.lock, flags);
+ spin_unlock_irqrestore(bmclock, flags);
+ read_unlock(&bitmap->counts.mlock);
offset += blocks;
if (sectors > blocks)
sectors -= blocks;
@@ -1582,13 +1657,16 @@ static int __bitmap_start_sync(struct bi
int degraded)
{
bitmap_counter_t *bmc;
+ spinlock_t *bmclock;
int rv;
if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */
*blocks = 1024;
return 1; /* always resync if no bitmap */
}
- spin_lock_irq(&bitmap->counts.lock);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irq(bmclock);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0, bmclock, 1);
rv = 0;
if (bmc) {
/* locked */
@@ -1602,7 +1680,8 @@ static int __bitmap_start_sync(struct bi
}
}
}
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
return rv;
}
@@ -1634,13 +1713,16 @@ void md_bitmap_end_sync(struct bitmap *b
{
bitmap_counter_t *bmc;
unsigned long flags;
+ spinlock_t *bmclock;
if (bitmap == NULL) {
*blocks = 1024;
return;
}
- spin_lock_irqsave(&bitmap->counts.lock, flags);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0);
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irqsave(bmclock, flags);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, blocks, 0, bmclock, 1);
if (bmc == NULL)
goto unlock;
/* locked */
@@ -1657,7 +1739,8 @@ void md_bitmap_end_sync(struct bitmap *b
}
}
unlock:
- spin_unlock_irqrestore(&bitmap->counts.lock, flags);
+ spin_unlock_irqrestore(bmclock, flags);
+ read_unlock(&bitmap->counts.mlock);
}
EXPORT_SYMBOL(md_bitmap_end_sync);
@@ -1738,10 +1821,15 @@ static void md_bitmap_set_memory_bits(st
sector_t secs;
bitmap_counter_t *bmc;
- spin_lock_irq(&bitmap->counts.lock);
- bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1);
+ spinlock_t *bmclock;
+
+ bmclock = md_bitmap_get_bmclock(&bitmap->counts, offset);
+ read_lock(&bitmap->counts.mlock);
+ spin_lock_irq(bmclock);
+ bmc = md_bitmap_get_counter(&bitmap->counts, offset, &secs, 1, bmclock, 1);
if (!bmc) {
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
return;
}
if (!*bmc) {
@@ -1752,7 +1840,8 @@ static void md_bitmap_set_memory_bits(st
}
if (needed)
*bmc |= NEEDED_MASK;
- spin_unlock_irq(&bitmap->counts.lock);
+ spin_unlock_irq(bmclock);
+ read_unlock(&bitmap->counts.mlock);
}
/* dirty the memory and file bits for bitmap chunks "s" to "e" */
@@ -1806,6 +1895,7 @@ void md_bitmap_free(struct bitmap *bitma
{
unsigned long k, pages;
struct bitmap_page *bp;
+ spinlock_t *bmclocks;
if (!bitmap) /* there was no bitmap */
return;
@@ -1826,6 +1916,7 @@ void md_bitmap_free(struct bitmap *bitma
bp = bitmap->counts.bp;
pages = bitmap->counts.pages;
+ bmclocks = bitmap->counts.bmclocks;
/* free all allocated memory */
@@ -1834,6 +1925,7 @@ void md_bitmap_free(struct bitmap *bitma
if (bp[k].map && !bp[k].hijacked)
kfree(bp[k].map);
kfree(bp);
+ kvfree(bmclocks);
kfree(bitmap);
}
EXPORT_SYMBOL(md_bitmap_free);
@@ -1900,7 +1992,9 @@ struct bitmap *md_bitmap_create(struct m
if (!bitmap)
return ERR_PTR(-ENOMEM);
- spin_lock_init(&bitmap->counts.lock);
+ /* initialize metadata lock */
+ rwlock_init(&bitmap->counts.mlock);
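+ /* the per-counter bmclocks are allocated in md_bitmap_resize() */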
+
atomic_set(&bitmap->pending_writes, 0);
init_waitqueue_head(&bitmap->write_wait);
init_waitqueue_head(&bitmap->overflow_wait);
@@ -2143,6 +2237,8 @@ int md_bitmap_resize(struct bitmap *bitm
int ret = 0;
long pages;
struct bitmap_page *new_bp;
+ spinlock_t *new_bmclocks;
+ int num_bmclocks, i;
if (bitmap->storage.file && !init) {
pr_info("md: cannot resize file-based bitmap\n");
@@ -2211,7 +2307,7 @@ int md_bitmap_resize(struct bitmap *bitm
memcpy(page_address(store.sb_page),
page_address(bitmap->storage.sb_page),
sizeof(bitmap_super_t));
- spin_lock_irq(&bitmap->counts.lock);
+ write_lock_irq(&bitmap->counts.mlock);
md_bitmap_file_unmap(&bitmap->storage);
bitmap->storage = store;
@@ -2227,18 +2323,30 @@ int md_bitmap_resize(struct bitmap *bitm
blocks = min(old_counts.chunks << old_counts.chunkshift,
chunks << chunkshift);
+ /* initialize bmc locks */
+ num_bmclocks = DIV_ROUND_UP(chunks, BITMAP_COUNTER_LOCK_RATIO);
+
+ new_bmclocks = kvcalloc(num_bmclocks, sizeof(*new_bmclocks), GFP_KERNEL);
+ bitmap->counts.bmclocks = new_bmclocks;
+ for (i = 0; i < num_bmclocks; ++i)
+ spin_lock_init(&new_bmclocks[i]);
+
/* For cluster raid, need to pre-allocate bitmap */
if (mddev_is_clustered(bitmap->mddev)) {
unsigned long page;
for (page = 0; page < pages; page++) {
- ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1);
+ ret = md_bitmap_checkpage(&bitmap->counts, page, 1, 1, NULL, 0);
if (ret) {
unsigned long k;
/* deallocate the page memory */
- for (k = 0; k < page; k++) {
+ for (k = 0; k < page; k++)
kfree(new_bp[k].map);
- }
kfree(new_bp);
/* restore some fields from old_counts */
@@ -2261,11 +2369,11 @@ int md_bitmap_resize(struct bitmap *bitm
bitmap_counter_t *bmc_old, *bmc_new;
int set;
- bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0);
+ bmc_old = md_bitmap_get_counter(&old_counts, block, &old_blocks, 0, NULL, 0);
set = bmc_old && NEEDED(*bmc_old);
if (set) {
- bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
+ bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1, NULL, 0);
if (bmc_new) {
if (*bmc_new == 0) {
/* need to set on-disk bits too. */
@@ -2301,7 +2409,7 @@ int md_bitmap_resize(struct bitmap *bitm
int i;
while (block < (chunks << chunkshift)) {
bitmap_counter_t *bmc;
- bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
+ bmc = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1, NULL, 0);
if (bmc) {
/* new space. It needs to be resynced, so
* we set NEEDED_MASK.
@@ -2317,7 +2425,7 @@ int md_bitmap_resize(struct bitmap *bitm
for (i = 0; i < bitmap->storage.file_pages; i++)
set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
}
- spin_unlock_irq(&bitmap->counts.lock);
+ write_unlock_irq(&bitmap->counts.mlock);
if (!init) {
md_bitmap_unplug(bitmap);
@@ -2,7 +2,9 @@
/*
* bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
*
- * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ * additions:
+ * Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ * Copyright (C) 2022-2023, Shushu Yi (firnyee@gmail.com)
*/
#ifndef BITMAP_H
#define BITMAP_H 1
@@ -103,6 +105,9 @@ typedef __u16 bitmap_counter_t;
#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
#define BITMAP_BLOCK_SHIFT 9
+/* how many counters share one bmclock? (1 << SHIFT counters per lock, so 0 means one lock per counter) */
+#define BITMAP_COUNTER_LOCK_RATIO_SHIFT 0
+#define BITMAP_COUNTER_LOCK_RATIO (1 << BITMAP_COUNTER_LOCK_RATIO_SHIFT)
#endif
@@ -116,7 +121,7 @@ typedef __u16 bitmap_counter_t;
enum bitmap_state {
BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */
BITMAP_WRITE_ERROR = 2, /* A write error has occurred */
- BITMAP_HOSTENDIAN =15,
+ BITMAP_HOSTENDIAN = 15,
};
/* the superblock at the front of the bitmap file -- little endian */
@@ -180,7 +185,8 @@ struct bitmap_page {
struct bitmap {
struct bitmap_counts {
- spinlock_t lock;
+ rwlock_t mlock; /* lock for metadata */
+ spinlock_t *bmclocks; /* locks for bmc */
struct bitmap_page *bp;
unsigned long pages; /* total number of pages
* in the bitmap */
@@ -501,7 +501,7 @@ struct disk_info {
* and creating that much locking depth can cause
* problems.
*/
-#define NR_STRIPE_HASH_LOCKS 8
+#define NR_STRIPE_HASH_LOCKS 128
#define STRIPE_HASH_LOCKS_MASK (NR_STRIPE_HASH_LOCKS - 1)
struct r5worker {