From patchwork Tue Apr 16 01:00:08 2024
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 13630952
Date: Mon, 15 Apr 2024 18:00:08 -0700
Subject: [PATCH 088/111] libxfs: teach buftargs to maintain their own buffer hashtable
From: "Darrick J. Wong"
To: djwong@kernel.org, cem@kernel.org
Cc: Christoph Hellwig, cmaiolino@redhat.com, linux-xfs@vger.kernel.org, hch@infradead.org
Message-ID: <171322883488.211103.12606746845604045696.stgit@frogsfrogsfrogs>
In-Reply-To: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>
References: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>

From: Darrick J. Wong

Currently, cached buffers are indexed with a single global bcache
structure.  This works ok for the limited use case where we only
support reading from the data device, but will fail badly when we want
to support buffers from in-memory btrees.

Move the bcache structure into the buftarg.  As a side effect, we don't
need to compare buftarg->bt_bdev anymore since libxfs is careful enough
not to create more than one buftarg per open fd.

Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig --- include/libxfs.h | 1 - libxfs/init.c | 48 +++++++++++++++++++++--------------------------- libxfs/libxfs_io.h | 10 ++++++---- libxfs/logitem.c | 2 +- libxfs/rdwr.c | 45 +++++++++++++++++++++++++++++---------------- mkfs/xfs_mkfs.c | 2 +- repair/prefetch.c | 12 ++++++++---- repair/prefetch.h | 1 + repair/progress.c | 14 +++++++++----- repair/progress.h | 2 +- repair/scan.c | 2 +- repair/xfs_repair.c | 32 +++++++++++++++++--------------- 12 files changed, 95 insertions(+), 76 deletions(-) diff --git a/include/libxfs.h b/include/libxfs.h index aeec2bc76126..60d3b7968775 100644 --- a/include/libxfs.h +++ b/include/libxfs.h @@ -147,7 +147,6 @@ int libxfs_init(struct libxfs_init *); void libxfs_destroy(struct libxfs_init *li); extern int libxfs_device_alignment (void); -extern void libxfs_report(FILE *); /* check or write log footer: specify device, log size in blocks & uuid */ typedef char *(libxfs_get_block_t)(char *, int, void *); diff --git a/libxfs/init.c b/libxfs/init.c index 5641b9bef6bd..f002dc93cd56 100644 --- a/libxfs/init.c +++ b/libxfs/init.c @@ -36,7 +36,6 @@ pthread_mutex_t atomic64_lock = PTHREAD_MUTEX_INITIALIZER; char *progname = "libxfs"; /* default, changed by each tool */ -struct cache *libxfs_bcache; /* global buffer cache */ int libxfs_bhash_size; /* #buckets in bcache */ int use_xfs_buf_lock; /* global flag: use xfs_buf locks for MT */ @@ -267,8 +266,6 @@ libxfs_init(struct libxfs_init *a) if (!libxfs_bhash_size) libxfs_bhash_size = LIBXFS_BHASHSIZE(sbp); - libxfs_bcache = cache_init(a->bcache_flags, libxfs_bhash_size, - &libxfs_bcache_operations); use_xfs_buf_lock = a->flags & LIBXFS_USEBUFLOCK; xfs_dir_startup(); init_caches(); @@ -451,6 +448,7 @@ xfs_set_inode_alloc( static struct xfs_buftarg * libxfs_buftarg_alloc( struct xfs_mount *mp, + struct libxfs_init *xi, struct libxfs_dev *dev, unsigned long write_fails) { @@ -472,6 +470,9 @@ libxfs_buftarg_alloc( } pthread_mutex_init(&btp->lock, NULL); + btp->bcache = cache_init(xi->bcache_flags, libxfs_bhash_size, + &libxfs_bcache_operations); + return btp; } @@ -568,12 +569,13 @@ libxfs_buftarg_init( return; } - mp->m_ddev_targp = libxfs_buftarg_alloc(mp, &xi->data, dfail); + mp->m_ddev_targp = libxfs_buftarg_alloc(mp, xi, &xi->data, dfail); if (!xi->log.dev || xi->log.dev == xi->data.dev) mp->m_logdev_targp = mp->m_ddev_targp; else - mp->m_logdev_targp = libxfs_buftarg_alloc(mp, &xi->log, lfail); - mp->m_rtdev_targp = libxfs_buftarg_alloc(mp, &xi->rt, rfail); + mp->m_logdev_targp = libxfs_buftarg_alloc(mp, xi, &xi->log, + lfail); + mp->m_rtdev_targp = libxfs_buftarg_alloc(mp, xi, &xi->rt, rfail); } /* Compute maximum possible height for per-AG btree types for this fs. */ @@ -856,7 +858,7 @@ libxfs_flush_mount( * LOST_WRITE flag to be set in the buftarg. Once that's done, * instruct the disks to persist their write caches. */ - libxfs_bcache_flush(); + libxfs_bcache_flush(mp); /* Flush all kernel and disk write caches, and report failures. */ if (mp->m_ddev_targp) { @@ -882,6 +884,14 @@ libxfs_flush_mount( return error; } +static void +libxfs_buftarg_free( + struct xfs_buftarg *btp) +{ + cache_destroy(btp->bcache); + kmem_free(btp); +} + /* * Release any resource obtained during a mount. */ @@ -898,7 +908,7 @@ libxfs_umount( * all incore buffers, then pick up the outcome when we tell the disks * to persist their write caches. 
*/ - libxfs_bcache_purge(); + libxfs_bcache_purge(mp); error = libxfs_flush_mount(mp); /* @@ -913,10 +923,10 @@ libxfs_umount( free(mp->m_fsname); mp->m_fsname = NULL; - kmem_free(mp->m_rtdev_targp); + libxfs_buftarg_free(mp->m_rtdev_targp); if (mp->m_logdev_targp != mp->m_ddev_targp) - kmem_free(mp->m_logdev_targp); - kmem_free(mp->m_ddev_targp); + libxfs_buftarg_free(mp->m_logdev_targp); + libxfs_buftarg_free(mp->m_ddev_targp); return error; } @@ -932,10 +942,7 @@ libxfs_destroy( libxfs_close_devices(li); - /* Free everything from the buffer cache before freeing buffer cache */ - libxfs_bcache_purge(); libxfs_bcache_free(); - cache_destroy(libxfs_bcache); leaked = destroy_caches(); rcu_unregister_thread(); if (getenv("LIBXFS_LEAK_CHECK") && leaked) @@ -947,16 +954,3 @@ libxfs_device_alignment(void) { return platform_align_blockdev(); } - -void -libxfs_report(FILE *fp) -{ - time_t t; - char *c; - - cache_report(fp, "libxfs_bcache", libxfs_bcache); - - t = time(NULL); - c = asctime(localtime(&t)); - fprintf(fp, "%s", c); -} diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h index 259c6a7cf771..7877e17685b8 100644 --- a/libxfs/libxfs_io.h +++ b/libxfs/libxfs_io.h @@ -28,6 +28,7 @@ struct xfs_buftarg { dev_t bt_bdev; int bt_bdev_fd; unsigned int flags; + struct cache *bcache; /* buffer cache */ }; /* We purged a dirty buffer and lost a write. */ @@ -36,6 +37,8 @@ struct xfs_buftarg { #define XFS_BUFTARG_CORRUPT_WRITE (1 << 1) /* Simulate failure after a certain number of writes. */ #define XFS_BUFTARG_INJECT_WRITE_FAIL (1 << 2) +/* purge buffers when lookups find a size mismatch */ +#define XFS_BUFTARG_MISCOMPARE_PURGE (1 << 3) /* Simulate the system crashing after a certain number of writes. */ static inline void @@ -140,7 +143,6 @@ int libxfs_buf_priority(struct xfs_buf *bp); /* Buffer Cache Interfaces */ -extern struct cache *libxfs_bcache; extern struct cache_operations libxfs_bcache_operations; #define LIBXFS_GETBUF_TRYLOCK (1 << 0) @@ -184,10 +186,10 @@ libxfs_buf_read( int libxfs_readbuf_verify(struct xfs_buf *bp, const struct xfs_buf_ops *ops); struct xfs_buf *libxfs_getsb(struct xfs_mount *mp); -extern void libxfs_bcache_purge(void); +extern void libxfs_bcache_purge(struct xfs_mount *mp); extern void libxfs_bcache_free(void); -extern void libxfs_bcache_flush(void); -extern int libxfs_bcache_overflowed(void); +extern void libxfs_bcache_flush(struct xfs_mount *mp); +extern int libxfs_bcache_overflowed(struct xfs_mount *mp); /* Buffer (Raw) Interfaces */ int libxfs_bwrite(struct xfs_buf *bp); diff --git a/libxfs/logitem.c b/libxfs/logitem.c index 3ce2d7574a37..7757259dfc5e 100644 --- a/libxfs/logitem.c +++ b/libxfs/logitem.c @@ -46,7 +46,7 @@ xfs_trans_buf_item_match( list_for_each_entry(lip, &tp->t_items, li_trans) { blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && - blip->bli_buf->b_target->bt_bdev == btp->bt_bdev && + blip->bli_buf->b_target == btp && xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn && blip->bli_buf->b_length == len) { ASSERT(blip->bli_buf->b_map_count == nmaps); diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c index 153007d5fc86..cf986a7e7820 100644 --- a/libxfs/rdwr.c +++ b/libxfs/rdwr.c @@ -198,18 +198,20 @@ libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift) } static int -libxfs_bcompare(struct cache_node *node, cache_key_t key) +libxfs_bcompare( + struct cache_node *node, + cache_key_t key) { struct xfs_buf *bp = container_of(node, struct xfs_buf, b_node); struct xfs_bufkey *bkey = (struct xfs_bufkey *)key; + 
struct cache *bcache = bkey->buftarg->bcache; - if (bp->b_target->bt_bdev == bkey->buftarg->bt_bdev && - bp->b_cache_key == bkey->blkno) { + if (bp->b_cache_key == bkey->blkno) { if (bp->b_length == bkey->bblen) return CACHE_HIT; #ifdef IO_BCOMPARE_CHECK - if (!(libxfs_bcache->c_flags & CACHE_MISCOMPARE_PURGE)) { + if (!(bcache->c_flags & CACHE_MISCOMPARE_PURGE)) { fprintf(stderr, "%lx: Badness in key lookup (length)\n" "bp=(bno 0x%llx, len %u bytes) key=(bno 0x%llx, len %u bytes)\n", @@ -399,11 +401,12 @@ __cache_lookup( struct xfs_buf **bpp) { struct cache_node *cn = NULL; + struct cache *bcache = key->buftarg->bcache; struct xfs_buf *bp; *bpp = NULL; - cache_node_get(libxfs_bcache, key, &cn); + cache_node_get(bcache, key, &cn); if (!cn) return -ENOMEM; bp = container_of(cn, struct xfs_buf, b_node); @@ -415,7 +418,7 @@ __cache_lookup( if (ret) { ASSERT(ret == EAGAIN); if (flags & LIBXFS_GETBUF_TRYLOCK) { - cache_node_put(libxfs_bcache, cn); + cache_node_put(bcache, cn); return -EAGAIN; } @@ -434,7 +437,7 @@ __cache_lookup( bp->b_holder = pthread_self(); } - cache_node_set_priority(libxfs_bcache, cn, + cache_node_set_priority(bcache, cn, cache_node_get_priority(cn) - CACHE_PREFETCH_PRIORITY); *bpp = bp; return 0; @@ -550,7 +553,7 @@ libxfs_buf_relse( } if (!list_empty(&bp->b_node.cn_hash)) - cache_node_put(libxfs_bcache, &bp->b_node); + cache_node_put(bp->b_target->bcache, &bp->b_node); else if (--bp->b_node.cn_count == 0) { if (bp->b_flags & LIBXFS_B_DIRTY) libxfs_bwrite(bp); @@ -606,7 +609,7 @@ libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, struct xfs_buf *bp, error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags); if (!error && - bp->b_target->bt_bdev == btp->bt_bdev && + bp->b_target == btp && bp->b_cache_key == blkno && bp->b_length == len) bp->b_flags |= LIBXFS_B_UPTODATE; @@ -1003,21 +1006,31 @@ libxfs_bflush( } void -libxfs_bcache_purge(void) +libxfs_bcache_purge(struct xfs_mount *mp) { - cache_purge(libxfs_bcache); + if (!mp) + return; + cache_purge(mp->m_ddev_targp->bcache); + cache_purge(mp->m_logdev_targp->bcache); + cache_purge(mp->m_rtdev_targp->bcache); } void -libxfs_bcache_flush(void) +libxfs_bcache_flush(struct xfs_mount *mp) { - cache_flush(libxfs_bcache); + if (!mp) + return; + cache_flush(mp->m_ddev_targp->bcache); + cache_flush(mp->m_logdev_targp->bcache); + cache_flush(mp->m_rtdev_targp->bcache); } int -libxfs_bcache_overflowed(void) +libxfs_bcache_overflowed(struct xfs_mount *mp) { - return cache_overflowed(libxfs_bcache); + return cache_overflowed(mp->m_ddev_targp->bcache) || + cache_overflowed(mp->m_logdev_targp->bcache) || + cache_overflowed(mp->m_rtdev_targp->bcache); } struct cache_operations libxfs_bcache_operations = { @@ -1466,7 +1479,7 @@ libxfs_buf_set_priority( struct xfs_buf *bp, int priority) { - cache_node_set_priority(libxfs_bcache, &bp->b_node, priority); + cache_node_set_priority(bp->b_target->bcache, &bp->b_node, priority); } int diff --git a/mkfs/xfs_mkfs.c b/mkfs/xfs_mkfs.c index f4a9bf20f391..d6fa48edeab5 100644 --- a/mkfs/xfs_mkfs.c +++ b/mkfs/xfs_mkfs.c @@ -4613,7 +4613,7 @@ main( * Need to drop references to inodes we still hold, first. */ libxfs_rtmount_destroy(mp); - libxfs_bcache_purge(); + libxfs_bcache_purge(mp); /* * Mark the filesystem ok. 
diff --git a/repair/prefetch.c b/repair/prefetch.c index b0dd19775ca8..de36c5fe2cc9 100644 --- a/repair/prefetch.c +++ b/repair/prefetch.c @@ -886,10 +886,12 @@ init_prefetch( prefetch_args_t * start_inode_prefetch( + struct xfs_mount *mp, xfs_agnumber_t agno, int dirs_only, prefetch_args_t *prev_args) { + struct cache *bcache = mp->m_ddev_targp->bcache; prefetch_args_t *args; long max_queue; struct xfs_ino_geometry *igeo = M_IGEO(mp); @@ -914,7 +916,7 @@ start_inode_prefetch( * and not any other associated metadata like directories */ - max_queue = libxfs_bcache->c_maxcount / thread_count / 8; + max_queue = bcache->c_maxcount / thread_count / 8; if (igeo->inode_cluster_size > mp->m_sb.sb_blocksize) max_queue = max_queue * igeo->blocks_per_cluster / igeo->ialloc_blks; @@ -970,14 +972,16 @@ prefetch_ag_range( void (*func)(struct workqueue *, xfs_agnumber_t, void *)) { + struct xfs_mount *mp = work->wq_ctx; int i; struct prefetch_args *pf_args[2]; - pf_args[start_ag & 1] = start_inode_prefetch(start_ag, dirs_only, NULL); + pf_args[start_ag & 1] = start_inode_prefetch(mp, start_ag, dirs_only, + NULL); for (i = start_ag; i < end_ag; i++) { /* Don't prefetch end_ag */ if (i + 1 < end_ag) - pf_args[(~i) & 1] = start_inode_prefetch(i + 1, + pf_args[(~i) & 1] = start_inode_prefetch(mp, i + 1, dirs_only, pf_args[i & 1]); func(work, i, pf_args[i & 1]); } @@ -1027,7 +1031,7 @@ do_inode_prefetch( * filesystem - it's all in the cache. In that case, run a thread per * CPU to maximise parallelism of the queue to be processed. */ - if (check_cache && !libxfs_bcache_overflowed()) { + if (check_cache && !libxfs_bcache_overflowed(mp)) { queue.wq_ctx = mp; create_work_queue(&queue, mp, platform_nproc()); for (i = 0; i < mp->m_sb.sb_agcount; i++) diff --git a/repair/prefetch.h b/repair/prefetch.h index 54ece48ad228..a8c52a1195b6 100644 --- a/repair/prefetch.h +++ b/repair/prefetch.h @@ -39,6 +39,7 @@ init_prefetch( prefetch_args_t * start_inode_prefetch( + struct xfs_mount *mp, xfs_agnumber_t agno, int dirs_only, prefetch_args_t *prev_args); diff --git a/repair/progress.c b/repair/progress.c index f6c4d988444e..625dc41c2894 100644 --- a/repair/progress.c +++ b/repair/progress.c @@ -383,14 +383,18 @@ timediff(int phase) ** array. 
*/ char * -timestamp(int end, int phase, char *buf) +timestamp( + struct xfs_mount *mp, + int end, + int phase, + char *buf) { - time_t now; - struct tm *tmp; + time_t now; + struct tm *tmp; - if (verbose > 1) - cache_report(stderr, "libxfs_bcache", libxfs_bcache); + if (verbose > 1 && mp && mp->m_ddev_targp) + cache_report(stderr, "libxfs_bcache", mp->m_ddev_targp->bcache); now = time(NULL); diff --git a/repair/progress.h b/repair/progress.h index 2c1690db1b17..75b751b783b2 100644 --- a/repair/progress.h +++ b/repair/progress.h @@ -37,7 +37,7 @@ extern void stop_progress_rpt(void); extern void summary_report(void); extern int set_progress_msg(int report, uint64_t total); extern uint64_t print_final_rpt(void); -extern char *timestamp(int end, int phase, char *buf); +extern char *timestamp(struct xfs_mount *mp, int end, int phase, char *buf); extern char *duration(int val, char *buf); extern int do_parallel; diff --git a/repair/scan.c b/repair/scan.c index 7e6d94cfa670..715be1166fc2 100644 --- a/repair/scan.c +++ b/repair/scan.c @@ -42,7 +42,7 @@ struct aghdr_cnts { void set_mp(xfs_mount_t *mpp) { - libxfs_bcache_purge(); + libxfs_bcache_purge(mp); mp = mpp; } diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c index ba9d28330d82..d4f99f36f71d 100644 --- a/repair/xfs_repair.c +++ b/repair/xfs_repair.c @@ -942,9 +942,11 @@ repair_capture_writeback( } static inline void -phase_end(int phase) +phase_end( + struct xfs_mount *mp, + int phase) { - timestamp(PHASE_END, phase, NULL); + timestamp(mp, PHASE_END, phase, NULL); /* Fail if someone injected an post-phase error. */ if (fail_after_phase && phase == fail_after_phase) @@ -979,8 +981,8 @@ main(int argc, char **argv) msgbuf = malloc(DURATION_BUF_SIZE); - timestamp(PHASE_START, 0, NULL); - phase_end(0); + timestamp(temp_mp, PHASE_START, 0, NULL); + phase_end(temp_mp, 0); /* -f forces this, but let's be nice and autodetect it, as well. */ if (!isa_file) { @@ -1002,7 +1004,7 @@ main(int argc, char **argv) /* do phase1 to make sure we have a superblock */ phase1(temp_mp); - phase_end(1); + phase_end(temp_mp, 1); if (no_modify && primary_sb_modified) { do_warn(_("Primary superblock would have been modified.\n" @@ -1139,8 +1141,8 @@ main(int argc, char **argv) unsigned long max_mem; struct rlimit rlim; - libxfs_bcache_purge(); - cache_destroy(libxfs_bcache); + libxfs_bcache_purge(mp); + cache_destroy(mp->m_ddev_targp->bcache); mem_used = (mp->m_sb.sb_icount >> (10 - 2)) + (mp->m_sb.sb_dblocks >> (10 + 1)) + @@ -1200,7 +1202,7 @@ main(int argc, char **argv) do_log(_(" - block cache size set to %d entries\n"), libxfs_bhash_size * HASH_CACHE_RATIO); - libxfs_bcache = cache_init(0, libxfs_bhash_size, + mp->m_ddev_targp->bcache = cache_init(0, libxfs_bhash_size, &libxfs_bcache_operations); } @@ -1228,16 +1230,16 @@ main(int argc, char **argv) /* make sure the per-ag freespace maps are ok so we can mount the fs */ phase2(mp, phase2_threads); - phase_end(2); + phase_end(mp, 2); if (do_prefetch) init_prefetch(mp); phase3(mp, phase2_threads); - phase_end(3); + phase_end(mp, 3); phase4(mp); - phase_end(4); + phase_end(mp, 4); if (no_modify) { printf(_("No modify flag set, skipping phase 5\n")); @@ -1247,7 +1249,7 @@ main(int argc, char **argv) } else { phase5(mp); } - phase_end(5); + phase_end(mp, 5); /* * Done with the block usage maps, toss them... 
@@ -1257,10 +1259,10 @@ main(int argc, char **argv)
 
 	if (!bad_ino_btree)  {
 		phase6(mp);
-		phase_end(6);
+		phase_end(mp, 6);
 
 		phase7(mp, phase2_threads);
-		phase_end(7);
+		phase_end(mp, 7);
 	} else  {
 		do_warn(
 _("Inode allocation btrees are too corrupted, skipping phases 6 and 7\n"));
@@ -1385,7 +1387,7 @@ _("Note - stripe unit (%d) and width (%d) were copied from a backup superblock.\
 	 * verifiers are run (where we discover the max metadata LSN), reformat
 	 * the log if necessary and unmount.
 	 */
-	libxfs_bcache_flush();
+	libxfs_bcache_flush(mp);
 	format_log_max_lsn(mp);
 
 	if (xfs_sb_version_needsrepair(&mp->m_sb))

From patchwork Tue Apr 16 01:00:23 2024
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 13630953
Date: Mon, 15 Apr 2024 18:00:23 -0700
Subject: [PATCH 089/111] libxfs: add xfile support
From: "Darrick J. Wong"
To: djwong@kernel.org, cem@kernel.org
Cc: Christoph Hellwig, cmaiolino@redhat.com, linux-xfs@vger.kernel.org, hch@infradead.org
Message-ID: <171322883501.211103.5837361141423551988.stgit@frogsfrogsfrogs>
In-Reply-To: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>
References: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>

From: Darrick J.
Wong Port the xfile functionality (anonymous pageable file-index memory) from the kernel. In userspace, we try to use memfd() to create tmpfs files that are not in any namespace, matching the kernel. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig --- libxfs/Makefile | 2 libxfs/xfile.c | 210 +++++++++++++++++++++++++++++++++++++++++++++++++++ libxfs/xfile.h | 21 +++++ repair/xfs_repair.c | 15 ++++ 4 files changed, 248 insertions(+) create mode 100644 libxfs/xfile.c create mode 100644 libxfs/xfile.h diff --git a/libxfs/Makefile b/libxfs/Makefile index 6f688c0ad25a..43e8ae183229 100644 --- a/libxfs/Makefile +++ b/libxfs/Makefile @@ -26,6 +26,7 @@ HFILES = \ libxfs_priv.h \ linux-err.h \ topology.h \ + xfile.h \ xfs_ag_resv.h \ xfs_alloc.h \ xfs_alloc_btree.h \ @@ -66,6 +67,7 @@ CFILES = cache.c \ topology.c \ trans.c \ util.c \ + xfile.c \ xfs_ag.c \ xfs_ag_resv.c \ xfs_alloc.c \ diff --git a/libxfs/xfile.c b/libxfs/xfile.c new file mode 100644 index 000000000000..cba173cc17f1 --- /dev/null +++ b/libxfs/xfile.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "libxfs_priv.h" +#include "libxfs.h" +#include "libxfs/xfile.h" +#include +#include +#include +#include + +/* + * Swappable Temporary Memory + * ========================== + * + * Offline checking sometimes needs to be able to stage a large amount of data + * in memory. This information might not fit in the available memory and it + * doesn't all need to be accessible at all times. In other words, we want an + * indexed data buffer to store data that can be paged out. + * + * memfd files meet those requirements. Therefore, the xfile mechanism uses + * one to store our staging data. The xfile must be freed with xfile_destroy. + * + * xfiles assume that the caller will handle all required concurrency + * management; file locks are not taken. + */ + +/* + * Starting with Linux 6.3, there's a new MFD_NOEXEC_SEAL flag that disables + * the longstanding memfd behavior that files are created with the executable + * bit set, and seals the file against it being turned back on. + */ +#ifndef MFD_NOEXEC_SEAL +# define MFD_NOEXEC_SEAL (0x0008U) +#endif + +/* + * Open a memory-backed fd to back an xfile. We require close-on-exec here, + * because these memfd files function as windowed RAM and hence should never + * be shared with other processes. + */ +static int +xfile_create_fd( + const char *description) +{ + int fd = -1; + int ret; + + /* + * memfd_create was added to kernel 3.17 (2014). MFD_NOEXEC_SEAL + * causes -EINVAL on old kernels, so fall back to omitting it so that + * new xfs_repair can run on an older recovery cd kernel. + */ + fd = memfd_create(description, MFD_CLOEXEC | MFD_NOEXEC_SEAL); + if (fd >= 0) + goto got_fd; + fd = memfd_create(description, MFD_CLOEXEC); + if (fd >= 0) + goto got_fd; + + /* + * O_TMPFILE exists as of kernel 3.11 (2013), which means that if we + * find it, we're pretty safe in assuming O_CLOEXEC exists too. + */ + fd = open("/dev/shm", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600); + if (fd >= 0) + goto got_fd; + + fd = open("/tmp", O_TMPFILE | O_CLOEXEC | O_RDWR, 0600); + if (fd >= 0) + goto got_fd; + + /* + * mkostemp exists as of glibc 2.7 (2007) and O_CLOEXEC exists as of + * kernel 2.6.23 (2007). 
+ */ + fd = mkostemp("libxfsXXXXXX", O_CLOEXEC); + if (fd >= 0) + goto got_fd; + + if (!errno) + errno = EOPNOTSUPP; + return -1; +got_fd: + /* + * Turn off mode bits we don't want -- group members and others should + * not have access to the xfile, nor it be executable. memfds are + * created with mode 0777, but we'll be careful just in case the other + * implementations fail to set 0600. + */ + ret = fchmod(fd, 0600); + if (ret) + perror("disabling xfile executable bit"); + + return fd; +} + +/* + * Create an xfile of the given size. The description will be used in the + * trace output. + */ +int +xfile_create( + const char *description, + struct xfile **xfilep) +{ + struct xfile *xf; + int error; + + xf = kmalloc(sizeof(struct xfile), 0); + if (!xf) + return -ENOMEM; + + xf->fd = xfile_create_fd(description); + if (xf->fd < 0) { + error = -errno; + kfree(xf); + return error; + } + + *xfilep = xf; + return 0; +} + +/* Close the file and release all resources. */ +void +xfile_destroy( + struct xfile *xf) +{ + close(xf->fd); + kfree(xf); +} + +static inline loff_t +xfile_maxbytes( + struct xfile *xf) +{ + if (sizeof(loff_t) == 8) + return LLONG_MAX; + return LONG_MAX; +} + +/* + * Load an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +ssize_t +xfile_load( + struct xfile *xf, + void *buf, + size_t count, + loff_t pos) +{ + ssize_t ret; + + if (count > INT_MAX) + return -ENOMEM; + if (xfile_maxbytes(xf) - pos < count) + return -ENOMEM; + + ret = pread(xf->fd, buf, count, pos); + if (ret < 0) + return -errno; + if (ret != count) + return -ENOMEM; + return 0; +} + +/* + * Store an object. Since we're treating this file as "memory", any error or + * short IO is treated as a failure to allocate memory. + */ +ssize_t +xfile_store( + struct xfile *xf, + const void *buf, + size_t count, + loff_t pos) +{ + ssize_t ret; + + if (count > INT_MAX) + return -E2BIG; + if (xfile_maxbytes(xf) - pos < count) + return -EFBIG; + + ret = pwrite(xf->fd, buf, count, pos); + if (ret < 0) + return -errno; + if (ret != count) + return -ENOMEM; + return 0; +} + +/* Compute the number of bytes used by a xfile. */ +unsigned long long +xfile_bytes( + struct xfile *xf) +{ + struct stat statbuf; + int error; + + error = fstat(xf->fd, &statbuf); + if (error) + return -errno; + + return (unsigned long long)statbuf.st_blocks << 9; +} diff --git a/libxfs/xfile.h b/libxfs/xfile.h new file mode 100644 index 000000000000..d60084011357 --- /dev/null +++ b/libxfs/xfile.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (c) 2021-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#ifndef __LIBXFS_XFILE_H__ +#define __LIBXFS_XFILE_H__ + +struct xfile { + int fd; +}; + +int xfile_create(const char *description, struct xfile **xfilep); +void xfile_destroy(struct xfile *xf); + +ssize_t xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos); +ssize_t xfile_store(struct xfile *xf, const void *buf, size_t count, loff_t pos); + +unsigned long long xfile_bytes(struct xfile *xf); + +#endif /* __LIBXFS_XFILE_H__ */ diff --git a/repair/xfs_repair.c b/repair/xfs_repair.c index d4f99f36f71d..01f92e841f29 100644 --- a/repair/xfs_repair.c +++ b/repair/xfs_repair.c @@ -953,6 +953,20 @@ phase_end( platform_crash(); } +/* Try to allow as many memfds as possible. 
 */
+static void
+bump_max_fds(void)
+{
+	struct rlimit	rlim = { };
+	int		ret;
+
+	ret = getrlimit(RLIMIT_NOFILE, &rlim);
+	if (!ret) {
+		rlim.rlim_cur = rlim.rlim_max;
+		setrlimit(RLIMIT_NOFILE, &rlim);
+	}
+}
+
 int
 main(int argc, char **argv)
 {
@@ -972,6 +986,7 @@ main(int argc, char **argv)
 	bindtextdomain(PACKAGE, LOCALEDIR);
 	textdomain(PACKAGE);
 	dinode_bmbt_translation_init();
+	bump_max_fds();
 
 	temp_mp = &xfs_m;
 	setbuf(stdout, NULL);

From patchwork Tue Apr 16 01:00:39 2024
X-Patchwork-Submitter: "Darrick J. Wong"
X-Patchwork-Id: 13630954
Date: Mon, 15 Apr 2024 18:00:39 -0700
Subject: [PATCH 090/111] libxfs: partition memfd files to avoid using too many fds
From: "Darrick J. Wong"
To: djwong@kernel.org, cem@kernel.org
Cc: cmaiolino@redhat.com, linux-xfs@vger.kernel.org, hch@infradead.org
Message-ID: <171322883514.211103.15800307559901643828.stgit@frogsfrogsfrogs>
In-Reply-To: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>
References: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>

From: Darrick J. Wong

Make it so that we can partition a memfd file to avoid running out of
file descriptors.

Signed-off-by: Darrick J.
Wong --- libxfs/xfile.c | 197 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- libxfs/xfile.h | 17 ++++- 2 files changed, 205 insertions(+), 9 deletions(-) diff --git a/libxfs/xfile.c b/libxfs/xfile.c index cba173cc17f1..fdb76f406647 100644 --- a/libxfs/xfile.c +++ b/libxfs/xfile.c @@ -97,6 +97,149 @@ xfile_create_fd( return fd; } +static LIST_HEAD(fcb_list); +static pthread_mutex_t fcb_mutex = PTHREAD_MUTEX_INITIALIZER; + +/* Create a new memfd. */ +static inline int +xfile_fcb_create( + const char *description, + struct xfile_fcb **fcbp) +{ + struct xfile_fcb *fcb; + int fd; + + fd = xfile_create_fd(description); + if (fd < 0) + return -errno; + + fcb = malloc(sizeof(struct xfile_fcb)); + if (!fcb) { + close(fd); + return -ENOMEM; + } + + list_head_init(&fcb->fcb_list); + fcb->fd = fd; + fcb->refcount = 1; + + *fcbp = fcb; + return 0; +} + +/* Release an xfile control block */ +static void +xfile_fcb_irele( + struct xfile_fcb *fcb, + loff_t pos, + uint64_t len) +{ + /* + * If this memfd is linked only to itself, it's private, so we can + * close it without taking any locks. + */ + if (list_empty(&fcb->fcb_list)) { + close(fcb->fd); + free(fcb); + return; + } + + pthread_mutex_lock(&fcb_mutex); + if (--fcb->refcount == 0) { + /* If we're the last user of this memfd file, kill it fast. */ + list_del(&fcb->fcb_list); + close(fcb->fd); + free(fcb); + } else if (len > 0) { + struct stat statbuf; + int ret; + + /* + * If we were using the end of a partitioned file, free the + * address space. IOWs, bonus points if you delete these in + * reverse-order of creation. + */ + ret = fstat(fcb->fd, &statbuf); + if (!ret && statbuf.st_size == pos + len) { + ret = ftruncate(fcb->fd, pos); + } + } + pthread_mutex_unlock(&fcb_mutex); +} + +/* + * Find an memfd that can accomodate the given amount of address space. + */ +static int +xfile_fcb_find( + const char *description, + uint64_t maxbytes, + loff_t *posp, + struct xfile_fcb **fcbp) +{ + struct xfile_fcb *fcb; + int ret; + int error; + + /* No maximum range means that the caller gets a private memfd. */ + if (maxbytes == 0) { + *posp = 0; + return xfile_fcb_create(description, fcbp); + } + + /* round up to page granularity so we can do mmap */ + maxbytes = roundup_64(maxbytes, PAGE_SIZE); + + pthread_mutex_lock(&fcb_mutex); + + /* + * If we only need a certain number of byte range, look for one with + * available file range. + */ + list_for_each_entry(fcb, &fcb_list, fcb_list) { + struct stat statbuf; + loff_t pos; + + ret = fstat(fcb->fd, &statbuf); + if (ret) + continue; + pos = roundup_64(statbuf.st_size, PAGE_SIZE); + + /* + * Truncate up to ensure that the memfd can actually handle + * writes to the end of the range. + */ + ret = ftruncate(fcb->fd, pos + maxbytes); + if (ret) + continue; + + fcb->refcount++; + *posp = pos; + *fcbp = fcb; + goto out_unlock; + } + + /* Otherwise, open a new memfd and add it to our list. */ + error = xfile_fcb_create(description, &fcb); + if (error) + return error; + + ret = ftruncate(fcb->fd, maxbytes); + if (ret) { + error = -errno; + xfile_fcb_irele(fcb, 0, maxbytes); + return error; + } + + list_add_tail(&fcb->fcb_list, &fcb_list); + *posp = 0; + *fcbp = fcb; + +out_unlock: + pthread_mutex_unlock(&fcb_mutex); + return error; +} + /* * Create an xfile of the given size. The description will be used in the * trace output. 
@@ -104,6 +247,7 @@ xfile_create_fd( int xfile_create( const char *description, + unsigned long long maxbytes, struct xfile **xfilep) { struct xfile *xf; @@ -113,13 +257,14 @@ xfile_create( if (!xf) return -ENOMEM; - xf->fd = xfile_create_fd(description); - if (xf->fd < 0) { - error = -errno; + error = xfile_fcb_find(description, maxbytes, &xf->partition_pos, + &xf->fcb); + if (error) { kfree(xf); return error; } + xf->maxbytes = maxbytes; *xfilep = xf; return 0; } @@ -129,7 +274,7 @@ void xfile_destroy( struct xfile *xf) { - close(xf->fd); + xfile_fcb_irele(xf->fcb, xf->partition_pos, xf->maxbytes); kfree(xf); } @@ -137,6 +282,9 @@ static inline loff_t xfile_maxbytes( struct xfile *xf) { + if (xf->maxbytes > 0) + return xf->maxbytes; + if (sizeof(loff_t) == 8) return LLONG_MAX; return LONG_MAX; @@ -160,7 +308,7 @@ xfile_load( if (xfile_maxbytes(xf) - pos < count) return -ENOMEM; - ret = pread(xf->fd, buf, count, pos); + ret = pread(xf->fcb->fd, buf, count, pos + xf->partition_pos); if (ret < 0) return -errno; if (ret != count) @@ -186,7 +334,7 @@ xfile_store( if (xfile_maxbytes(xf) - pos < count) return -EFBIG; - ret = pwrite(xf->fd, buf, count, pos); + ret = pwrite(xf->fcb->fd, buf, count, pos + xf->partition_pos); if (ret < 0) return -errno; if (ret != count) @@ -194,6 +342,38 @@ xfile_store( return 0; } +/* Compute the number of bytes used by a partitioned xfile. */ +static unsigned long long +xfile_partition_bytes( + struct xfile *xf) +{ + loff_t data_pos = xf->partition_pos; + loff_t stop_pos = data_pos + xf->maxbytes; + loff_t hole_pos; + unsigned long long bytes = 0; + + data_pos = lseek(xf->fcb->fd, data_pos, SEEK_DATA); + while (data_pos >= 0 && data_pos < stop_pos) { + hole_pos = lseek(xf->fcb->fd, data_pos, SEEK_HOLE); + if (hole_pos < 0) { + /* save error, break */ + data_pos = hole_pos; + break; + } + if (hole_pos >= stop_pos) { + bytes += stop_pos - data_pos; + return bytes; + } + bytes += hole_pos - data_pos; + + data_pos = lseek(xf->fcb->fd, hole_pos, SEEK_DATA); + } + if (data_pos < 0 && errno != ENXIO) + return xf->maxbytes; + + return bytes; +} + /* Compute the number of bytes used by a xfile. */ unsigned long long xfile_bytes( @@ -202,7 +382,10 @@ xfile_bytes( struct stat statbuf; int error; - error = fstat(xf->fd, &statbuf); + if (xf->maxbytes > 0) + return xfile_partition_bytes(xf); + + error = fstat(xf->fcb->fd, &statbuf); if (error) return -errno; diff --git a/libxfs/xfile.h b/libxfs/xfile.h index d60084011357..180a42bbbaa2 100644 --- a/libxfs/xfile.h +++ b/libxfs/xfile.h @@ -6,11 +6,24 @@ #ifndef __LIBXFS_XFILE_H__ #define __LIBXFS_XFILE_H__ -struct xfile { +struct xfile_fcb { + struct list_head fcb_list; int fd; + unsigned int refcount; }; -int xfile_create(const char *description, struct xfile **xfilep); +struct xfile { + struct xfile_fcb *fcb; + + /* File position within fcb->fd where this partition starts */ + loff_t partition_pos; + + /* Maximum number of bytes that can be written to the partition. */ + uint64_t maxbytes; +}; + +int xfile_create(const char *description, unsigned long long maxbytes, + struct xfile **xfilep); void xfile_destroy(struct xfile *xf); ssize_t xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos); From patchwork Tue Apr 16 01:00:54 2024 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: "Darrick J. 
Wong"
X-Patchwork-Id: 13630955
Date: Mon, 15 Apr 2024 18:00:54 -0700
Subject: [PATCH 091/111] xfs: teach buftargs to maintain their own buffer hashtable
From: "Darrick J. Wong"
To: djwong@kernel.org, cem@kernel.org
Cc: Christoph Hellwig, cmaiolino@redhat.com, linux-xfs@vger.kernel.org, hch@infradead.org
Message-ID: <171322883527.211103.12104235769532254825.stgit@frogsfrogsfrogs>
In-Reply-To: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>
References: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>

From: Darrick J. Wong

Source kernel commit: e7b58f7c1be20550d4f51cec6307b811e7555f52

Currently, cached buffers are indexed by per-AG hashtables.  This works
great for the data device, but won't work for in-memory btrees.  To
handle that use case, buftargs will need to be able to index buffers
independently of other data structures.

We accomplish this by hoisting the rhashtable and its lock into a
separate xfs_buf_cache structure, make the buftarg point to the
_buf_cache structure, and rework various functions to use it.  This
will enable the in-memory buftarg to come up with its own _buf_cache.

Signed-off-by: Darrick J. Wong
Reviewed-by: Christoph Hellwig
---
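[ Illustrative note, not part of the patch: the kernel-side container that
  this commit message describes simply groups the two fields removed from
  struct xfs_perag in the xfs_ag.h hunk below.  A rough sketch, with field
  names chosen for illustration: ]

/*
 * Sketch of the hoisted cache object described above.  In the kernel the
 * rhashtable and its spinlock move out of struct xfs_perag into this
 * container; the userspace diff below only needs the no-op
 * xfs_buf_cache_init/destroy stubs.
 */
struct xfs_buf_cache {
	spinlock_t		bc_lock;
	struct rhashtable	bc_hash;
};

int xfs_buf_cache_init(struct xfs_buf_cache *bch);
void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);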
 libxfs/libxfs_priv.h |    4 ++--
 libxfs/xfs_ag.c      |    6 +++---
 libxfs/xfs_ag.h      |    4 +---
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h
index 0a4f686d9455..aee85c155abf 100644
--- a/libxfs/libxfs_priv.h
+++ b/libxfs/libxfs_priv.h
@@ -550,8 +550,8 @@ unsigned int hweight8(unsigned int w);
 unsigned int hweight32(unsigned int w);
 unsigned int hweight64(__u64 w);
 
-static inline int xfs_buf_hash_init(struct xfs_perag *pag) { return 0; }
-static inline void xfs_buf_hash_destroy(struct xfs_perag *pag) { }
+#define xfs_buf_cache_init(bch) (0)
+#define xfs_buf_cache_destroy(bch) ((void)0)
 
 static inline int xfs_iunlink_init(struct xfs_perag *pag) { return 0; }
 static inline void xfs_iunlink_destroy(struct xfs_perag *pag) { }
diff --git a/libxfs/xfs_ag.c b/libxfs/xfs_ag.c
index 389a8288e989..06a881285682 100644
--- a/libxfs/xfs_ag.c
+++ b/libxfs/xfs_ag.c
@@ -262,7 +262,7 @@ xfs_free_perag(
 		xfs_defer_drain_free(&pag->pag_intents_drain);
 
 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
-		xfs_buf_hash_destroy(pag);
+		xfs_buf_cache_destroy(&pag->pag_bcache);
 
 		/* drop the mount's active reference */
 		xfs_perag_rele(pag);
@@ -350,7 +350,7 @@ xfs_free_unused_perag_range(
 		spin_unlock(&mp->m_perag_lock);
 		if (!pag)
 			break;
-		xfs_buf_hash_destroy(pag);
+		xfs_buf_cache_destroy(&pag->pag_bcache);
 		xfs_defer_drain_free(&pag->pag_intents_drain);
 		kfree(pag);
 	}
@@ -417,7 +417,7 @@ xfs_initialize_perag(
 		pag->pagb_tree = RB_ROOT;
 #endif /* __KERNEL__ */
 
-		error = xfs_buf_hash_init(pag);
+		error = xfs_buf_cache_init(&pag->pag_bcache);
 		if (error)
 			goto out_remove_pag;
 
diff --git a/libxfs/xfs_ag.h b/libxfs/xfs_ag.h
index 19eddba09894..29bfa6273dec 100644
--- a/libxfs/xfs_ag.h
+++ b/libxfs/xfs_ag.h
@@ -106,9 +106,7 @@ struct xfs_perag {
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
-	/* buffer cache index */
-	spinlock_t	pag_buf_lock;	/* lock for pag_buf_hash */
-	struct rhashtable pag_buf_hash;
+	struct xfs_buf_cache	pag_bcache;
 
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;

From patchwork Tue Apr 16 01:01:10 2024
X-Patchwork-Submitter: "Darrick J.
Wong"
X-Patchwork-Id: 13630956
Date: Mon, 15 Apr 2024 18:01:10 -0700
Subject: [PATCH 092/111] libxfs: support in-memory buffer cache targets
From: "Darrick J. Wong"
To: djwong@kernel.org, cem@kernel.org
Cc: Christoph Hellwig, cmaiolino@redhat.com, linux-xfs@vger.kernel.org, hch@infradead.org
Message-ID: <171322883539.211103.15663396557404367362.stgit@frogsfrogsfrogs>
In-Reply-To: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>
References: <171322882240.211103.3776766269442402814.stgit@frogsfrogsfrogs>

From: Darrick J. Wong

Allow the buffer cache to target in-memory files by connecting it to
xfiles.  The next few patches will enable creating xfs_btrees in
memory.

Unlike the kernel version of this patch, we use a partitioned xfile to
avoid overflowing the fd table instead of opening a separate memfd for
each target.

Signed-off-by: Darrick J.
Wong Reviewed-by: Christoph Hellwig --- libxfs/Makefile | 4 + libxfs/buf_mem.c | 235 ++++++++++++++++++++++++++++++++++++++++++++++++++++ libxfs/buf_mem.h | 26 ++++++ libxfs/init.c | 4 + libxfs/libxfs_io.h | 22 +++++ libxfs/rdwr.c | 41 ++++----- 6 files changed, 310 insertions(+), 22 deletions(-) create mode 100644 libxfs/buf_mem.c create mode 100644 libxfs/buf_mem.h diff --git a/libxfs/Makefile b/libxfs/Makefile index 43e8ae183229..8dc9a79059ed 100644 --- a/libxfs/Makefile +++ b/libxfs/Makefile @@ -26,6 +26,7 @@ HFILES = \ libxfs_priv.h \ linux-err.h \ topology.h \ + buf_mem.h \ xfile.h \ xfs_ag_resv.h \ xfs_alloc.h \ @@ -58,7 +59,8 @@ HFILES = \ xfs_trans_space.h \ xfs_dir2_priv.h -CFILES = cache.c \ +CFILES = buf_mem.c \ + cache.c \ defer_item.c \ init.c \ kmem.c \ diff --git a/libxfs/buf_mem.c b/libxfs/buf_mem.c new file mode 100644 index 000000000000..7c8fa1d2cdcd --- /dev/null +++ b/libxfs/buf_mem.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. Wong + */ +#include "libxfs_priv.h" +#include "libxfs.h" +#include "libxfs/xfile.h" +#include "libxfs/buf_mem.h" +#include +#include +#include + +/* + * Buffer Cache for In-Memory Files + * ================================ + * + * Offline fsck wants to create ephemeral ordered recordsets. The existing + * btree infrastructure can do this, but we need the buffer cache to target + * memory instead of block devices. + * + * xfiles meet those requirements. Therefore, the xmbuf mechanism uses a + * partition on an xfile to store the staging data. + * + * xmbufs assume that the caller will handle all required concurrency + * management. The resulting xfs_buf objects are kept private to the xmbuf + * (they are not recycled to the LRU) because b_addr is mapped directly to the + * memfd file. + * + * The only supported block size is the system page size. + */ + +/* Figure out the xfile buffer cache block size here */ +unsigned int XMBUF_BLOCKSIZE; +unsigned int XMBUF_BLOCKSHIFT; + +void +xmbuf_libinit(void) +{ + long ret = sysconf(_SC_PAGESIZE); + + /* If we don't find a power-of-two page size, go with 4k. */ + if (ret < 0 || !is_power_of_2(ret)) + ret = 4096; + + XMBUF_BLOCKSIZE = ret; + XMBUF_BLOCKSHIFT = libxfs_highbit32(XMBUF_BLOCKSIZE); +} + +/* Allocate a new cache node (aka a xfs_buf) */ +static struct cache_node * +xmbuf_cache_alloc( + cache_key_t key) +{ + struct xfs_bufkey *bufkey = (struct xfs_bufkey *)key; + struct xfs_buf *bp; + int error; + + bp = kmem_cache_zalloc(xfs_buf_cache, 0); + if (!bp) + return NULL; + + bp->b_cache_key = bufkey->blkno; + bp->b_length = bufkey->bblen; + bp->b_target = bufkey->buftarg; + bp->b_mount = bufkey->buftarg->bt_mount; + + pthread_mutex_init(&bp->b_lock, NULL); + INIT_LIST_HEAD(&bp->b_li_list); + bp->b_maps = &bp->__b_map; + + bp->b_nmaps = 1; + bp->b_maps[0].bm_bn = bufkey->blkno; + bp->b_maps[0].bm_len = bp->b_length; + + error = xmbuf_map_page(bp); + if (error) { + fprintf(stderr, + _("%s: %s can't mmap %u bytes at xfile offset %llu: %s\n"), + progname, __FUNCTION__, BBTOB(bp->b_length), + (unsigned long long)BBTOB(bufkey->blkno), + strerror(error)); + + kmem_cache_free(xfs_buf_cache, bp); + return NULL; + } + + return &bp->b_node; +} + +/* Flush a buffer to disk before purging the cache node */ +static int +xmbuf_cache_flush( + struct cache_node *node) +{ + /* direct mapped buffers do not need writing */ + return 0; +} + +/* Release resources, free the buffer. 
*/ +static void +xmbuf_cache_relse( + struct cache_node *node) +{ + struct xfs_buf *bp; + + bp = container_of(node, struct xfs_buf, b_node); + xmbuf_unmap_page(bp); + kmem_cache_free(xfs_buf_cache, bp); +} + +/* Release a bunch of buffers */ +static unsigned int +xmbuf_cache_bulkrelse( + struct cache *cache, + struct list_head *list) +{ + struct cache_node *cn, *n; + int count = 0; + + if (list_empty(list)) + return 0; + + list_for_each_entry_safe(cn, n, list, cn_mru) { + xmbuf_cache_relse(cn); + count++; + } + + return count; +} + +static struct cache_operations xmbuf_bcache_operations = { + .hash = libxfs_bhash, + .alloc = xmbuf_cache_alloc, + .flush = xmbuf_cache_flush, + .relse = xmbuf_cache_relse, + .compare = libxfs_bcompare, + .bulkrelse = xmbuf_cache_bulkrelse +}; + +/* + * Allocate a buffer cache target for a memory-backed file and set up the + * buffer target. + */ +int +xmbuf_alloc( + struct xfs_mount *mp, + const char *descr, + unsigned long long maxpos, + struct xfs_buftarg **btpp) +{ + struct xfs_buftarg *btp; + struct xfile *xfile; + struct cache *cache; + int error; + + btp = kzalloc(sizeof(*btp), GFP_KERNEL); + if (!btp) + return -ENOMEM; + + error = xfile_create(descr, maxpos, &xfile); + if (error) + goto out_btp; + + cache = cache_init(0, LIBXFS_BHASHSIZE(NULL), &xmbuf_bcache_operations); + if (!cache) { + error = -ENOMEM; + goto out_xfile; + } + + /* Initialize buffer target */ + btp->bt_mount = mp; + btp->bt_bdev = (dev_t)-1; + btp->bt_bdev_fd = -1; + btp->bt_xfile = xfile; + btp->bcache = cache; + + error = pthread_mutex_init(&btp->lock, NULL); + if (error) + goto out_cache; + + *btpp = btp; + return 0; + +out_cache: + cache_destroy(cache); +out_xfile: + xfile_destroy(xfile); +out_btp: + kfree(btp); + return error; +} + +/* Free a buffer cache target for a memory-backed file. */ +void +xmbuf_free( + struct xfs_buftarg *btp) +{ + ASSERT(xfs_buftarg_is_mem(btp)); + + cache_destroy(btp->bcache); + pthread_mutex_destroy(&btp->lock); + xfile_destroy(btp->bt_xfile); + kfree(btp); +} + +/* Directly map a memfd page into the buffer cache. */ +int +xmbuf_map_page( + struct xfs_buf *bp) +{ + struct xfile *xfile = bp->b_target->bt_xfile; + void *p; + loff_t pos; + + pos = xfile->partition_pos + BBTOB(xfs_buf_daddr(bp)); + p = mmap(NULL, BBTOB(bp->b_length), PROT_READ | PROT_WRITE, MAP_SHARED, + xfile->fcb->fd, pos); + if (p == MAP_FAILED) + return -errno; + + bp->b_addr = p; + bp->b_flags |= LIBXFS_B_UPTODATE | LIBXFS_B_UNCHECKED; + bp->b_error = 0; + return 0; +} + +/* Unmap a memfd page that was mapped into the buffer cache. */ +void +xmbuf_unmap_page( + struct xfs_buf *bp) +{ + munmap(bp->b_addr, BBTOB(bp->b_length)); + bp->b_addr = NULL; +} diff --git a/libxfs/buf_mem.h b/libxfs/buf_mem.h new file mode 100644 index 000000000000..d2be2c4240b6 --- /dev/null +++ b/libxfs/buf_mem.h @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2023-2024 Oracle. All Rights Reserved. + * Author: Darrick J. 
Wong + */ +#ifndef __XFS_BUF_MEM_H__ +#define __XFS_BUF_MEM_H__ + +extern unsigned int XMBUF_BLOCKSIZE; +extern unsigned int XMBUF_BLOCKSHIFT; + +void xmbuf_libinit(void); + +static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *target) +{ + return target->bt_xfile != NULL; +} + +int xmbuf_alloc(struct xfs_mount *mp, const char *descr, + unsigned long long maxpos, struct xfs_buftarg **btpp); +void xmbuf_free(struct xfs_buftarg *btp); + +int xmbuf_map_page(struct xfs_buf *bp); +void xmbuf_unmap_page(struct xfs_buf *bp); + +#endif /* __XFS_BUF_MEM_H__ */ diff --git a/libxfs/init.c b/libxfs/init.c index f002dc93cd56..f5cd85655cf0 100644 --- a/libxfs/init.c +++ b/libxfs/init.c @@ -22,6 +22,8 @@ #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "libfrog/platform.h" +#include "libxfs/xfile.h" +#include "libxfs/buf_mem.h" #include "xfs_format.h" #include "xfs_da_format.h" @@ -253,6 +255,7 @@ int libxfs_init(struct libxfs_init *a) { xfs_check_ondisk_structs(); + xmbuf_libinit(); rcu_init(); rcu_register_thread(); radix_tree_init(); @@ -463,6 +466,7 @@ libxfs_buftarg_alloc( btp->bt_mount = mp; btp->bt_bdev = dev->dev; btp->bt_bdev_fd = dev->fd; + btp->bt_xfile = NULL; btp->flags = 0; if (write_fails) { btp->writes_left = write_fails; diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h index 7877e17685b8..ae3c4a9484c7 100644 --- a/libxfs/libxfs_io.h +++ b/libxfs/libxfs_io.h @@ -27,6 +27,7 @@ struct xfs_buftarg { unsigned long writes_left; dev_t bt_bdev; int bt_bdev_fd; + struct xfile *bt_xfile; unsigned int flags; struct cache *bcache; /* buffer cache */ }; @@ -58,6 +59,27 @@ xfs_buftarg_trip_write( void libxfs_buftarg_init(struct xfs_mount *mp, struct libxfs_init *xi); int libxfs_blkdev_issue_flush(struct xfs_buftarg *btp); +/* + * The bufkey is used to pass the new buffer information to the cache object + * allocation routine. Because discontiguous buffers need to pass different + * information, we need fields to pass that information. However, because the + * blkno and bblen is needed for the initial cache entry lookup (i.e. for + * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous + * buffer initialisation instead of a contiguous buffer. + */ +struct xfs_bufkey { + struct xfs_buftarg *buftarg; + xfs_daddr_t blkno; + unsigned int bblen; + struct xfs_buf_map *map; + int nmaps; +}; + +/* for buf_mem.c only: */ +unsigned int libxfs_bhash(cache_key_t key, unsigned int hashsize, + unsigned int hashshift); +int libxfs_bcompare(struct cache_node *node, cache_key_t key); + #define LIBXFS_BBTOOFF64(bbs) (((xfs_off_t)(bbs)) << BBSHIFT) #define XB_PAGES 2 diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c index cf986a7e7820..50760cd866e3 100644 --- a/libxfs/rdwr.c +++ b/libxfs/rdwr.c @@ -18,7 +18,8 @@ #include "xfs_inode.h" #include "xfs_trans.h" #include "libfrog/platform.h" - +#include "libxfs/xfile.h" +#include "libxfs/buf_mem.h" #include "libxfs.h" static void libxfs_brelse(struct cache_node *node); @@ -69,6 +70,9 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) char *z; int error; + if (xfs_buftarg_is_mem(btp)) + return -EOPNOTSUPP; + start_offset = LIBXFS_BBTOOFF64(start); /* try to use special zeroing methods, fall back to writes if needed */ @@ -167,26 +171,10 @@ static struct cache_mru xfs_buf_freelist = {{&xfs_buf_freelist.cm_list, &xfs_buf_freelist.cm_list}, 0, PTHREAD_MUTEX_INITIALIZER }; -/* - * The bufkey is used to pass the new buffer information to the cache object - * allocation routine. 
Because discontiguous buffers need to pass different - * information, we need fields to pass that information. However, because the - * blkno and bblen is needed for the initial cache entry lookup (i.e. for - * bcompare) the fact that the map/nmaps is non-null to switch to discontiguous - * buffer initialisation instead of a contiguous buffer. - */ -struct xfs_bufkey { - struct xfs_buftarg *buftarg; - xfs_daddr_t blkno; - unsigned int bblen; - struct xfs_buf_map *map; - int nmaps; -}; - /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL #define CACHE_LINE_SIZE 64 -static unsigned int +unsigned int libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift) { uint64_t hashval = ((struct xfs_bufkey *)key)->blkno; @@ -197,7 +185,7 @@ libxfs_bhash(cache_key_t key, unsigned int hashsize, unsigned int hashshift) return tmp % hashsize; } -static int +int libxfs_bcompare( struct cache_node *node, cache_key_t key) @@ -231,6 +219,8 @@ static void __initbuf(struct xfs_buf *bp, struct xfs_buftarg *btp, xfs_daddr_t bno, unsigned int bytes) { + ASSERT(!xfs_buftarg_is_mem(btp)); + bp->b_flags = 0; bp->b_cache_key = bno; bp->b_length = BTOBB(bytes); @@ -577,7 +567,6 @@ libxfs_balloc( return &bp->b_node; } - static int __read_buf(int fd, void *buf, int len, off_t offset, int flags) { @@ -607,6 +596,9 @@ libxfs_readbufr(struct xfs_buftarg *btp, xfs_daddr_t blkno, struct xfs_buf *bp, ASSERT(len <= bp->b_length); + if (xfs_buftarg_is_mem(btp)) + return 0; + error = __read_buf(fd, bp->b_addr, bytes, LIBXFS_BBTOOFF64(blkno), flags); if (!error && bp->b_target == btp && @@ -639,6 +631,9 @@ libxfs_readbufr_map(struct xfs_buftarg *btp, struct xfs_buf *bp, int flags) void *buf; int i; + if (xfs_buftarg_is_mem(btp)) + return 0; + buf = bp->b_addr; for (i = 0; i < bp->b_nmaps; i++) { off_t offset = LIBXFS_BBTOOFF64(bp->b_maps[i].bm_bn); @@ -857,7 +852,9 @@ libxfs_bwrite( } } - if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) { + if (xfs_buftarg_is_mem(bp->b_target)) { + bp->b_error = 0; + } else if (!(bp->b_flags & LIBXFS_B_DISCONTIG)) { bp->b_error = __write_buf(fd, bp->b_addr, BBTOB(bp->b_length), LIBXFS_BBTOOFF64(xfs_buf_daddr(bp)), bp->b_flags); @@ -917,6 +914,8 @@ libxfs_buf_prepare_mru( xfs_perag_put(bp->b_pag); bp->b_pag = NULL; + ASSERT(!xfs_buftarg_is_mem(btp)); + if (!(bp->b_flags & LIBXFS_B_DIRTY)) return;
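
[ Illustrative note, not part of the series: patches 089 and 090 introduce the
  xfile_create/xfile_store/xfile_load/xfile_destroy interface, and patch 092
  wires an xfile into a buffer cache target via xmbuf_alloc().  A hypothetical
  caller of the xfile API might look roughly like the sketch below; the
  function and record names are made up for illustration and error handling is
  abbreviated. ]

/*
 * Caller-side sketch: stage one record in a 1 MiB xfile partition and read it
 * back.  xfile_store()/xfile_load() return 0 on success or a negative errno;
 * passing a nonzero maxbytes to xfile_create() requests a partition of a
 * shared memfd rather than a private one.
 */
#include "libxfs_priv.h"
#include "libxfs.h"
#include "libxfs/xfile.h"

struct stage_rec {
	uint64_t	key;
	uint64_t	value;
};

static int
stage_one_record(void)
{
	struct xfile		*xf;
	struct stage_rec	rec = { .key = 1, .value = 42 };
	struct stage_rec	readback;
	int			error;

	/* carve a 1 MiB partition out of a shared memfd */
	error = xfile_create("example staging", 1ULL << 20, &xf);
	if (error)
		return error;

	/* write the record at offset 0, then load it back */
	error = xfile_store(xf, &rec, sizeof(rec), 0);
	if (!error)
		error = xfile_load(xf, &readback, sizeof(readback), 0);

	xfile_destroy(xf);
	return error;
}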