From patchwork Thu Oct 22 05:15:31 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11850301
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 1/7] workqueue: bound maximum queue depth
Date: Thu, 22 Oct 2020 16:15:31 +1100
Message-Id: <20201022051537.2286402-2-david@fromorbit.com>
In-Reply-To: <20201022051537.2286402-1-david@fromorbit.com>

Existing users of workqueues have bound maximum queue depths in their
external algorithms (e.g. prefetch counts). For parallelising work that
doesn't have an external bound, allow workqueues to throttle incoming
requests at a maximum bound.

Bounded workqueues also need to distribute work over all worker threads
themselves, as there is no external bounding or worker function
throttling provided.

Existing callers are not throttled and retain direct control of worker
threads; only users of the new create interface will be throttled and
concurrency managed.

Signed-off-by: Dave Chinner
---
 libfrog/workqueue.c | 42 +++++++++++++++++++++++++++++++++++++++---
 libfrog/workqueue.h |  4 ++++
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/libfrog/workqueue.c b/libfrog/workqueue.c index fe3de4289379..e42b2a2f678b 100644 --- a/libfrog/workqueue.c +++ b/libfrog/workqueue.c @@ -40,13 +40,21 @@ workqueue_thread(void *arg) } /* - * Dequeue work from the head of the list. + * Dequeue work from the head of the list. If the queue was + * full then send a wakeup if we're configured to do so.
*/ assert(wq->item_count > 0); + if (wq->max_queued && wq->item_count == wq->max_queued) + pthread_cond_signal(&wq->queue_full); + wi = wq->next_item; wq->next_item = wi->next; wq->item_count--; + if (wq->max_queued && wq->next_item) { + /* more work, wake up another worker */ + pthread_cond_signal(&wq->wakeup); + } pthread_mutex_unlock(&wq->lock); (wi->function)(wi->queue, wi->index, wi->arg); @@ -58,10 +66,11 @@ workqueue_thread(void *arg) /* Allocate a work queue and threads. Returns zero or negative error code. */ int -workqueue_create( +workqueue_create_bound( struct workqueue *wq, void *wq_ctx, - unsigned int nr_workers) + unsigned int nr_workers, + unsigned int max_queue) { unsigned int i; int err = 0; @@ -70,12 +79,16 @@ workqueue_create( err = -pthread_cond_init(&wq->wakeup, NULL); if (err) return err; + err = -pthread_cond_init(&wq->queue_full, NULL); + if (err) + goto out_wake; err = -pthread_mutex_init(&wq->lock, NULL); if (err) goto out_cond; wq->wq_ctx = wq_ctx; wq->thread_count = nr_workers; + wq->max_queued = max_queue; wq->threads = malloc(nr_workers * sizeof(pthread_t)); if (!wq->threads) { err = -errno; @@ -102,10 +115,21 @@ workqueue_create( out_mutex: pthread_mutex_destroy(&wq->lock); out_cond: + pthread_cond_destroy(&wq->queue_full); +out_wake: pthread_cond_destroy(&wq->wakeup); return err; } +int +workqueue_create( + struct workqueue *wq, + void *wq_ctx, + unsigned int nr_workers) +{ + return workqueue_create_bound(wq, wq_ctx, nr_workers, 0); +} + /* * Create a work item consisting of a function and some arguments and schedule * the work item to be run via the thread pool. Returns zero or a negative @@ -140,6 +164,7 @@ workqueue_add( /* Now queue the new work structure to the work queue. */ pthread_mutex_lock(&wq->lock); +restart: if (wq->next_item == NULL) { assert(wq->item_count == 0); ret = -pthread_cond_signal(&wq->wakeup); @@ -150,6 +175,16 @@ workqueue_add( } wq->next_item = wi; } else { + /* throttle on a full queue if configured */ + if (wq->max_queued && wq->item_count == wq->max_queued) { + pthread_cond_wait(&wq->queue_full, &wq->lock); + /* + * Queue might be empty or even still full by the time + * we get the lock back, so restart the lookup so we do + * the right thing with the current state of the queue. 
+ */ + goto restart; + } wq->last_item->next = wi; } wq->last_item = wi; @@ -201,5 +236,6 @@ workqueue_destroy( free(wq->threads); pthread_mutex_destroy(&wq->lock); pthread_cond_destroy(&wq->wakeup); + pthread_cond_destroy(&wq->queue_full); memset(wq, 0, sizeof(*wq)); } diff --git a/libfrog/workqueue.h b/libfrog/workqueue.h index a56d1cf14081..a9c108d0e66a 100644 --- a/libfrog/workqueue.h +++ b/libfrog/workqueue.h @@ -31,10 +31,14 @@ struct workqueue { unsigned int thread_count; bool terminate; bool terminated; + int max_queued; + pthread_cond_t queue_full; }; int workqueue_create(struct workqueue *wq, void *wq_ctx, unsigned int nr_workers); +int workqueue_create_bound(struct workqueue *wq, void *wq_ctx, + unsigned int nr_workers, unsigned int max_queue); int workqueue_add(struct workqueue *wq, workqueue_func_t fn, uint32_t index, void *arg); int workqueue_terminate(struct workqueue *wq);
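For context, the semantics of the new interface: workqueue_add() now blocks the caller once max_queue items are pending, so the producer is throttled to the rate the workers can sustain. A minimal usage sketch against the API above (the worker function, context and sizes here are hypothetical, not part of the patch):

#include <stdint.h>
#include "workqueue.h"

/* Hypothetical worker: runs once per queued item on a worker thread. */
static void
process_item(
	struct workqueue	*wq,
	uint32_t		index,
	void			*arg)
{
	/* ... CPU-bound work for item @index goes here ... */
}

static int
feed_items(
	void			*ctx,
	uint32_t		nr_items)
{
	struct workqueue	wq;
	uint32_t		i;
	int			error;

	/* Four workers; the producer blocks once 64 items are queued. */
	error = workqueue_create_bound(&wq, ctx, 4, 64);
	if (error)
		return error;

	for (i = 0; i < nr_items; i++) {
		error = workqueue_add(&wq, process_item, i, NULL);
		if (error)
			break;
	}

	workqueue_terminate(&wq);
	workqueue_destroy(&wq);
	return error;
}

Passing max_queue = 0, as the workqueue_create() wrapper now does, preserves the old unthrottled behaviour for existing callers.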
From patchwork Thu Oct 22 05:15:32 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11850267
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 2/7] repair: Protect bad inode list with mutex
Date: Thu, 22 Oct 2020 16:15:32 +1100
Message-Id: <20201022051537.2286402-3-david@fromorbit.com>
In-Reply-To: <20201022051537.2286402-1-david@fromorbit.com>

To enable phase 6 parallelisation, we need to protect the bad inode
list from concurrent modification and/or access. Wrap it with a mutex
and clean up the nasty typedefs.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
---
 repair/dir2.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/repair/dir2.c b/repair/dir2.c index eabdb4f2d497..23333e59a382 100644 --- a/repair/dir2.c +++ b/repair/dir2.c @@ -20,40 +20,50 @@ * Known bad inode list. These are seen when the leaf and node * block linkages are incorrect. */ -typedef struct dir2_bad { +struct dir2_bad { xfs_ino_t ino; struct dir2_bad *next; -} dir2_bad_t; +}; -static dir2_bad_t *dir2_bad_list; +static struct dir2_bad *dir2_bad_list; +pthread_mutex_t dir2_bad_list_lock = PTHREAD_MUTEX_INITIALIZER; static void dir2_add_badlist( xfs_ino_t ino) { - dir2_bad_t *l; + struct dir2_bad *l; - if ((l = malloc(sizeof(dir2_bad_t))) == NULL) { + l = malloc(sizeof(*l)); + if (!l) { do_error( _("malloc failed (%zu bytes) dir2_add_badlist:ino %" PRIu64 "\n"), - sizeof(dir2_bad_t), ino); + sizeof(*l), ino); exit(1); } + pthread_mutex_lock(&dir2_bad_list_lock); l->next = dir2_bad_list; dir2_bad_list = l; l->ino = ino; + pthread_mutex_unlock(&dir2_bad_list_lock); } int dir2_is_badino( xfs_ino_t ino) { - dir2_bad_t *l; + struct dir2_bad *l; + int ret = 0; - for (l = dir2_bad_list; l; l = l->next) - if (l->ino == ino) - return 1; - return 0; + pthread_mutex_lock(&dir2_bad_list_lock); + for (l = dir2_bad_list; l; l = l->next) { + if (l->ino == ino) { + ret = 1; + break; + } + } + pthread_mutex_unlock(&dir2_bad_list_lock); + return ret; } /*
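The pattern is the standard one: the list writer and the lookup take the same mutex, so dir2_is_badino() can never observe a half-linked node. Note the patch links the node into the list before setting l->ino, which is safe only because the store happens inside the critical section; an equivalent variant initialises the node first, shrinking the critical section to the two pointer updates that publish it. A self-contained sketch of that variant (the xfs_ino_t stand-in and function name are illustrative):

#include <pthread.h>
#include <stdlib.h>
#include <stdint.h>

typedef uint64_t xfs_ino_t;	/* stand-in for the real typedef */

struct dir2_bad {
	xfs_ino_t	ino;
	struct dir2_bad	*next;
};

static struct dir2_bad *dir2_bad_list;
static pthread_mutex_t dir2_bad_list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Variant of dir2_add_badlist() that fully initialises the node
 * before taking the lock; only the pointer updates that make the
 * node visible to other threads need to be serialised. */
static void
dir2_add_badlist_alt(
	xfs_ino_t	ino)
{
	struct dir2_bad	*l;

	l = malloc(sizeof(*l));
	if (!l)
		exit(1);	/* the real code calls do_error() here */
	l->ino = ino;
	pthread_mutex_lock(&dir2_bad_list_lock);
	l->next = dir2_bad_list;
	dir2_bad_list = l;
	pthread_mutex_unlock(&dir2_bad_list_lock);
}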
From patchwork Thu Oct 22 05:15:33 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11850303
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 3/7] repair: protect inode chunk tree records with a mutex
Date: Thu, 22 Oct 2020 16:15:33 +1100
Message-Id: <20201022051537.2286402-4-david@fromorbit.com>
In-Reply-To: <20201022051537.2286402-1-david@fromorbit.com>

Phase 6 accesses inode chunk records mostly in an isolated manner.
However, when it finds a corruption in a directory or there are
multiple hardlinks to an inode, there can be concurrent access to the
inode chunk record to update state. Hence the inode record itself
needs a mutex.

This protects all state changes within the inode chunk record, as well
as inode link counts and chunk references. That allows us to process
multiple chunks at once, providing concurrency within an AG as well as
across AGs.

The inode chunk tree itself is not modified in phase 6 - it's built in
phases 3 and 4 - and so we do not need to worry about locking for AVL
tree lookups to find the inode chunk records themselves. Hence internal
locking is all we need here.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
---
 repair/incore.h     | 23 +++++++++++++++++++++
 repair/incore_ino.c | 15 +++++++++++++++
 2 files changed, 38 insertions(+)

diff --git a/repair/incore.h b/repair/incore.h index 5b29d5d1efd8..6564e0d38963 100644 --- a/repair/incore.h +++ b/repair/incore.h @@ -282,6 +282,7 @@ typedef struct ino_tree_node { parent_list_t *plist; /* phases 2-5 */ } ino_un; uint8_t *ftypes; /* phases 3,6 */ + pthread_mutex_t lock; } ino_tree_node_t; #define INOS_PER_IREC (sizeof(uint64_t) * NBBY) @@ -412,7 +413,9 @@ next_free_ino_rec(ino_tree_node_t *ino_rec) */ static inline void add_inode_refchecked(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); irec->ino_un.ex_data->ino_processed |= IREC_MASK(offset); + pthread_mutex_unlock(&irec->lock); } static inline int is_inode_refchecked(struct ino_tree_node *irec, int offset) @@ -438,12 +441,16 @@ static inline int is_inode_confirmed(struct ino_tree_node *irec, int offset) */ static inline void set_inode_isadir(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); irec->ino_isa_dir |= IREC_MASK(offset); + pthread_mutex_unlock(&irec->lock); } static inline void clear_inode_isadir(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); irec->ino_isa_dir &= ~IREC_MASK(offset); + pthread_mutex_unlock(&irec->lock); } static inline int inode_isadir(struct ino_tree_node *irec, int offset) @@ -456,15 +463,19 @@ static inline int inode_isadir(struct ino_tree_node *irec, int offset) */ static inline void set_inode_free(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); set_inode_confirmed(irec, offset); irec->ir_free |= XFS_INOBT_MASK(offset); + pthread_mutex_unlock(&irec->lock); } static inline void set_inode_used(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); set_inode_confirmed(irec, offset); irec->ir_free &= ~XFS_INOBT_MASK(offset); + pthread_mutex_unlock(&irec->lock); } static inline int is_inode_free(struct ino_tree_node *irec, int offset) @@ -477,7 +488,9 @@ static inline int is_inode_free(struct ino_tree_node *irec, int offset) */ static inline void set_inode_sparse(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); irec->ir_sparse |= XFS_INOBT_MASK(offset); + pthread_mutex_unlock(&irec->lock); } static inline bool is_inode_sparse(struct ino_tree_node *irec, int offset) @@ -490,12 +503,16 @@ static inline bool is_inode_sparse(struct ino_tree_node *irec, int offset) */ static inline void set_inode_was_rl(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); irec->ino_was_rl |= IREC_MASK(offset); +
pthread_mutex_unlock(&irec->lock); } static inline void clear_inode_was_rl(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); irec->ino_was_rl &= ~IREC_MASK(offset); + pthread_mutex_unlock(&irec->lock); } static inline int inode_was_rl(struct ino_tree_node *irec, int offset) @@ -508,12 +525,16 @@ static inline int inode_was_rl(struct ino_tree_node *irec, int offset) */ static inline void set_inode_is_rl(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); irec->ino_is_rl |= IREC_MASK(offset); + pthread_mutex_unlock(&irec->lock); } static inline void clear_inode_is_rl(struct ino_tree_node *irec, int offset) { + pthread_mutex_lock(&irec->lock); irec->ino_is_rl &= ~IREC_MASK(offset); + pthread_mutex_unlock(&irec->lock); } static inline int inode_is_rl(struct ino_tree_node *irec, int offset) @@ -546,7 +567,9 @@ static inline int is_inode_reached(struct ino_tree_node *irec, int offset) static inline void add_inode_reached(struct ino_tree_node *irec, int offset) { add_inode_ref(irec, offset); + pthread_mutex_lock(&irec->lock); irec->ino_un.ex_data->ino_reached |= IREC_MASK(offset); + pthread_mutex_unlock(&irec->lock); } /* diff --git a/repair/incore_ino.c b/repair/incore_ino.c index 82956ae93005..299e4f949e5e 100644 --- a/repair/incore_ino.c +++ b/repair/incore_ino.c @@ -91,6 +91,7 @@ void add_inode_ref(struct ino_tree_node *irec, int ino_offset) { ASSERT(irec->ino_un.ex_data != NULL); + pthread_mutex_lock(&irec->lock); switch (irec->nlink_size) { case sizeof(uint8_t): if (irec->ino_un.ex_data->counted_nlinks.un8[ino_offset] < 0xff) { @@ -112,6 +113,7 @@ void add_inode_ref(struct ino_tree_node *irec, int ino_offset) default: ASSERT(0); } + pthread_mutex_unlock(&irec->lock); } void drop_inode_ref(struct ino_tree_node *irec, int ino_offset) @@ -120,6 +122,7 @@ void drop_inode_ref(struct ino_tree_node *irec, int ino_offset) ASSERT(irec->ino_un.ex_data != NULL); + pthread_mutex_lock(&irec->lock); switch (irec->nlink_size) { case sizeof(uint8_t): ASSERT(irec->ino_un.ex_data->counted_nlinks.un8[ino_offset] > 0); @@ -139,6 +142,7 @@ void drop_inode_ref(struct ino_tree_node *irec, int ino_offset) if (refs == 0) irec->ino_un.ex_data->ino_reached &= ~IREC_MASK(ino_offset); + pthread_mutex_unlock(&irec->lock); } uint32_t num_inode_references(struct ino_tree_node *irec, int ino_offset) @@ -161,6 +165,7 @@ uint32_t num_inode_references(struct ino_tree_node *irec, int ino_offset) void set_inode_disk_nlinks(struct ino_tree_node *irec, int ino_offset, uint32_t nlinks) { + pthread_mutex_lock(&irec->lock); switch (irec->nlink_size) { case sizeof(uint8_t): if (nlinks < 0xff) { @@ -182,6 +187,7 @@ void set_inode_disk_nlinks(struct ino_tree_node *irec, int ino_offset, default: ASSERT(0); } + pthread_mutex_unlock(&irec->lock); } uint32_t get_inode_disk_nlinks(struct ino_tree_node *irec, int ino_offset) @@ -253,6 +259,7 @@ alloc_ino_node( irec->nlink_size = sizeof(uint8_t); irec->disk_nlinks.un8 = alloc_nlink_array(irec->nlink_size); irec->ftypes = alloc_ftypes_array(mp); + pthread_mutex_init(&irec->lock, NULL); return irec; } @@ -294,6 +301,7 @@ free_ino_tree_node( } free(irec->ftypes); + pthread_mutex_destroy(&irec->lock); free(irec); } @@ -600,6 +608,7 @@ set_inode_parent( uint64_t bitmask; parent_entry_t *tmp; + pthread_mutex_lock(&irec->lock); if (full_ino_ex_data) ptbl = irec->ino_un.ex_data->parents; else @@ -625,6 +634,7 @@ set_inode_parent( #endif ptbl->pentries[0] = parent; + pthread_mutex_unlock(&irec->lock); return; } @@ -642,6 +652,7 @@ set_inode_parent( 
#endif ptbl->pentries[target] = parent; + pthread_mutex_unlock(&irec->lock); return; } @@ -682,6 +693,7 @@ set_inode_parent( #endif ptbl->pentries[target] = parent; ptbl->pmask |= (1ULL << offset); + pthread_mutex_unlock(&irec->lock); } xfs_ino_t @@ -692,6 +704,7 @@ get_inode_parent(ino_tree_node_t *irec, int offset) int i; int target; + pthread_mutex_lock(&irec->lock); if (full_ino_ex_data) ptbl = irec->ino_un.ex_data->parents; else @@ -709,9 +722,11 @@ get_inode_parent(ino_tree_node_t *irec, int offset) #ifdef DEBUG ASSERT(target < ptbl->cnt); #endif + pthread_mutex_unlock(&irec->lock); return(ptbl->pentries[target]); } + pthread_mutex_unlock(&irec->lock); return(0LL); }
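Reduced to its essentials, every helper in the patch follows the same shape: the per-record mutex is initialised when the record is allocated, taken around each read-modify-write of the shared bitmasks, and destroyed when the record is freed. An illustrative standalone sketch (all names are stand-ins, not the repair code):

#include <pthread.h>
#include <stdint.h>

/* Stand-in for struct ino_tree_node: one mutex per record protects
 * all of the per-inode state bitmasks inside it. */
struct chunk_rec {
	uint64_t	isa_dir;	/* one bit per inode in the chunk */
	pthread_mutex_t	lock;
};

#define REC_MASK(i)	((uint64_t)1 << (i))

static void
chunk_rec_init(
	struct chunk_rec	*rec)
{
	rec->isa_dir = 0;
	pthread_mutex_init(&rec->lock, NULL);
}

/* Concurrent callers may set or clear different bits of the same
 * word, so even a single-bit flag update is a read-modify-write that
 * must be done under the lock. */
static void
set_isadir(
	struct chunk_rec	*rec,
	int			offset)
{
	pthread_mutex_lock(&rec->lock);
	rec->isa_dir |= REC_MASK(offset);
	pthread_mutex_unlock(&rec->lock);
}

static void
chunk_rec_destroy(
	struct chunk_rec	*rec)
{
	pthread_mutex_destroy(&rec->lock);
}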
From patchwork Thu Oct 22 05:15:34 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11850261
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 4/7] repair: parallelise phase 6
Date: Thu, 22 Oct 2020 16:15:34 +1100
Message-Id: <20201022051537.2286402-5-david@fromorbit.com>
In-Reply-To: <20201022051537.2286402-1-david@fromorbit.com>

A recent metadump provided to us caused repair to take hours in
phase 6. It wasn't IO bound - it was fully CPU bound the entire time.
The only way to speed it up is to make phase 6 run multiple concurrent
processing threads.

The obvious way to do this is to spread the concurrency across AGs,
like the other phases, and while this works it is not optimal. When a
processing thread hits a really large directory, it essentially sits
CPU bound until that directory is processed. If an AG has lots of
large directories, we end up with a really long single threaded tail
that limits concurrency.

Hence we also need to have concurrency /within/ the AG. This is
relatively easy, as the inode chunk records allow for a simple
concurrency mechanism within an AG. We can simply feed each chunk
record to a workqueue, and we get concurrency within the AG for free.
However, this allows prefetch to run way ahead of processing and this
blows out the buffer cache size and can cause OOM.

Fortunately, we can use the new workqueue depth limiting to limit the
number of inode chunks queued, and this then backs up the inode
prefetching at its maximum queue depth. Hence we prevent the prefetch
code from queueing the entire AG's inode chunks on the workqueue and
blowing out memory, by throttling the prefetch consumer.

This takes phase 6 from taking many, many hours down to:

Phase 6:        10/30 21:12:58  10/30 21:40:48  27 minutes, 50 seconds

and burning 20-30 CPUs that entire time on my test rig.

Signed-off-by: Dave Chinner
---
 repair/phase6.c | 43 +++++++++++++++++++++++++++++++++++--------
 1 file changed, 35 insertions(+), 8 deletions(-)

diff --git a/repair/phase6.c b/repair/phase6.c index 70d32089bb57..bf0719c186fb 100644 --- a/repair/phase6.c +++ b/repair/phase6.c @@ -6,6 +6,7 @@ #include "libxfs.h" #include "threads.h" +#include "threads.h" #include "prefetch.h" #include "avl.h" #include "globals.h" @@ -3109,20 +3110,45 @@ check_for_orphaned_inodes( } static void -traverse_function( +do_dir_inode( struct workqueue *wq, - xfs_agnumber_t agno, + xfs_agnumber_t agno, void *arg) { - ino_tree_node_t *irec; + struct ino_tree_node *irec = arg; int i; + + for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { + if (inode_isadir(irec, i)) + process_dir_inode(wq->wq_ctx, agno, irec, i); + } +} + +static void +traverse_function( + struct workqueue *wq, + xfs_agnumber_t agno, + void *arg) +{ + struct ino_tree_node *irec; prefetch_args_t *pf_args = arg; + struct workqueue lwq; + struct xfs_mount *mp = wq->wq_ctx; + wait_for_inode_prefetch(pf_args); if (verbose) do_log(_(" - agno = %d\n"), agno); + /* + * The more AGs we have in flight at once, the fewer processing threads + * per AG. This means we don't overwhelm the machine with hundreds of + * threads when we start acting on lots of AGs at once. We just want + * enough that we can keep multiple CPUs busy across multiple AGs.
+ */ + workqueue_create_bound(&lwq, mp, ag_stride, 1000); + for (irec = findfirst_inode_rec(agno); irec; irec = next_ino_rec(irec)) { if (irec->ino_isa_dir == 0) continue; @@ -3130,18 +3156,19 @@ traverse_function( if (pf_args) { sem_post(&pf_args->ra_count); #ifdef XR_PF_TRACE + { + int i; sem_getvalue(&pf_args->ra_count, &i); pftrace( "processing inode chunk %p in AG %d (sem count = %d)", irec, agno, i); + } #endif } - for (i = 0; i < XFS_INODES_PER_CHUNK; i++) { - if (inode_isadir(irec, i)) - process_dir_inode(wq->wq_ctx, agno, irec, i); - } + queue_work(&lwq, do_dir_inode, agno, irec); } + destroy_work_queue(&lwq); cleanup_inode_prefetch(pf_args); } @@ -3169,7 +3196,7 @@ static void traverse_ags( struct xfs_mount *mp) { - do_inode_prefetch(mp, 0, traverse_function, false, true); + do_inode_prefetch(mp, ag_stride, traverse_function, false, true); } void
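The resulting structure is two levels of concurrency: do_inode_prefetch() fans out across AGs as before, and each AG's traverse_function() now fans its inode chunks out to a bounded per-AG workqueue. In outline, a compile-level sketch against the libfrog API from patch 1 (the chunk type and iteration helpers are hypothetical stubs, not the repair code):

#include <stdint.h>
#include "workqueue.h"

struct chunk;				/* stand-in for ino_tree_node */
extern struct chunk *first_chunk(uint32_t agno);
extern struct chunk *next_chunk(struct chunk *c);
extern void process_chunk(void *ctx, uint32_t agno, struct chunk *c);

static void
chunk_worker(
	struct workqueue	*wq,
	uint32_t		agno,
	void			*arg)
{
	process_chunk(wq->wq_ctx, agno, arg);
}

/* One of these runs per AG. The bound of 1000 queued chunks is what
 * throttles prefetch: once the queue fills, workqueue_add() blocks,
 * stopping the producer from pulling chunks far ahead of processing
 * and blowing out the buffer cache. */
static void
traverse_one_ag(
	void			*ctx,
	uint32_t		agno,
	unsigned int		nr_workers)
{
	struct workqueue	lwq;
	struct chunk		*c;

	workqueue_create_bound(&lwq, ctx, nr_workers, 1000);
	for (c = first_chunk(agno); c; c = next_chunk(c))
		workqueue_add(&lwq, chunk_worker, agno, c);
	workqueue_terminate(&lwq);
	workqueue_destroy(&lwq);
}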
From patchwork Thu Oct 22 05:15:35 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11850265
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 5/7] repair: don't duplicate names in phase 6
Date: Thu, 22 Oct 2020 16:15:35 +1100
Message-Id: <20201022051537.2286402-6-david@fromorbit.com>
In-Reply-To: <20201022051537.2286402-1-david@fromorbit.com>

The name hash in phase 6 is constructed by using names that point
directly into the directory buffers. Hence, before the buffers can be
released, the constructed name hash has to duplicate all those names
into memory it owns via dir_hash_dup_names().

Given that the structure that holds the name is dynamically allocated,
it makes no sense to store a pointer to the name in dir_hash_add() and
then have to dynamically allocate the name later.

Extend the name hash allocation to contain space for the name itself,
and copy the name into the name hash structure in dir_hash_add(). This
allows us to get rid of dir_hash_dup_names(), and the directory
checking code no longer needs to hold all the directory buffers in
memory until the entire directory walk is complete and the names have
been duplicated.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
---
 repair/phase6.c | 101 ++++++++++++++----------------------------------
 1 file changed, 29 insertions(+), 72 deletions(-)

diff --git a/repair/phase6.c b/repair/phase6.c index bf0719c186fb..79c87495656f 100644 --- a/repair/phase6.c +++ b/repair/phase6.c @@ -72,7 +72,7 @@ typedef struct dir_hash_ent { struct dir_hash_ent *nextbyorder; /* next in order added */ xfs_dahash_t hashval; /* hash value of name */ uint32_t address; /* offset of data entry */ - xfs_ino_t inum; /* inode num of entry */ + xfs_ino_t inum; /* inode num of entry */ short junkit; /* name starts with / */ short seen; /* have seen leaf entry */ struct xfs_name name; @@ -80,7 +80,6 @@ typedef struct dir_hash_ent { typedef struct dir_hash_tab { int size; /* size of hash tables */ - int names_duped; /* 1 = ent names malloced */ dir_hash_ent_t *first; /* ptr to first added entry */ dir_hash_ent_t *last; /* ptr to last added entry */ dir_hash_ent_t **byhash; /* ptr to name hash buckets */ @@ -171,8 +170,6 @@ dir_hash_add( short junk; struct xfs_name xname; - ASSERT(!hashtab->names_duped); - xname.name = name; xname.len = namelen; xname.type = ftype; @@ -199,7 +196,12 @@ dir_hash_add( } } - if ((p = malloc(sizeof(*p))) == NULL) + /* + * Allocate enough space for the hash entry and the name in a single + * allocation so we can store our own copy of the name for later use. + */ + p = calloc(1, sizeof(*p) + namelen + 1); + if (!p) do_error(_("malloc failed in dir_hash_add (%zu bytes)\n"), sizeof(*p)); @@ -220,7 +222,12 @@ dir_hash_add( p->address = addr; p->inum = inum; p->seen = 0; - p->name = xname; + + /* Set up the name in the region trailing the hash entry. */ + memcpy(p + 1, name, namelen); + p->name.name = (const unsigned char *)(p + 1); + p->name.len = namelen; + p->name.type = ftype; return !dup; } @@ -287,8 +294,6 @@ dir_hash_done( for (i = 0; i < hashtab->size; i++) { for (p = hashtab->byaddr[i]; p; p = n) { n = p->nextbyaddr; - if (hashtab->names_duped) - free((void *)p->name.name); free(p); } } @@ -385,27 +390,6 @@ dir_hash_see_all( return j == stale ? DIR_HASH_CK_OK : DIR_HASH_CK_BADSTALE; } -/* - * Convert name pointers into locally allocated memory. - * This must only be done after all the entries have been added. - */ -static void -dir_hash_dup_names(dir_hash_tab_t *hashtab) -{ - unsigned char *name; - dir_hash_ent_t *p; - - if (hashtab->names_duped) - return; - - for (p = hashtab->first; p; p = p->nextbyorder) { - name = malloc(p->name.len); - memcpy(name, p->name.name, p->name.len); - p->name.name = name; - } - hashtab->names_duped = 1; -} - /* * Given a block number in a fork, return the next valid block number * (not a hole).
@@ -1387,6 +1371,7 @@ dir2_kill_block( res_failed(error); libxfs_trans_ijoin(tp, ip, 0); libxfs_trans_bjoin(tp, bp); + libxfs_trans_bhold(tp, bp); memset(&args, 0, sizeof(args)); args.dp = ip; args.trans = tp; @@ -1418,7 +1403,7 @@ longform_dir2_entry_check_data( int *need_dot, ino_tree_node_t *current_irec, int current_ino_offset, - struct xfs_buf **bpp, + struct xfs_buf *bp, dir_hash_tab_t *hashtab, freetab_t **freetabp, xfs_dablk_t da_bno, @@ -1426,7 +1411,6 @@ longform_dir2_entry_check_data( { xfs_dir2_dataptr_t addr; xfs_dir2_leaf_entry_t *blp; - struct xfs_buf *bp; xfs_dir2_block_tail_t *btp; struct xfs_dir2_data_hdr *d; xfs_dir2_db_t db; @@ -1457,7 +1441,6 @@ longform_dir2_entry_check_data( }; - bp = *bpp; d = bp->b_addr; ptr = (char *)d + mp->m_dir_geo->data_entry_offset; nbad = 0; @@ -1558,10 +1541,8 @@ longform_dir2_entry_check_data( dir2_kill_block(mp, ip, da_bno, bp); } else { do_warn(_("would junk block\n")); - libxfs_buf_relse(bp); } freetab->ents[db].v = NULLDATAOFF; - *bpp = NULL; return; } @@ -2219,17 +2200,15 @@ longform_dir2_entry_check(xfs_mount_t *mp, int ino_offset, dir_hash_tab_t *hashtab) { - struct xfs_buf **bplist; + struct xfs_buf *bp; xfs_dablk_t da_bno; freetab_t *freetab; - int num_bps; int i; int isblock; int isleaf; xfs_fileoff_t next_da_bno; int seeval; int fixit = 0; - xfs_dir2_db_t db; struct xfs_da_args args; *need_dot = 1; @@ -2246,11 +2225,6 @@ longform_dir2_entry_check(xfs_mount_t *mp, freetab->ents[i].v = NULLDATAOFF; freetab->ents[i].s = 0; } - num_bps = freetab->naents; - bplist = calloc(num_bps, sizeof(struct xfs_buf*)); - if (!bplist) - do_error(_("calloc failed in %s (%zu bytes)\n"), - __func__, num_bps * sizeof(struct xfs_buf*)); /* is this a block, leaf, or node directory? */ args.dp = ip; @@ -2279,28 +2253,12 @@ longform_dir2_entry_check(xfs_mount_t *mp, break; } - db = xfs_dir2_da_to_db(mp->m_dir_geo, da_bno); - if (db >= num_bps) { - int last_size = num_bps; - - /* more data blocks than expected */ - num_bps = db + 1; - bplist = realloc(bplist, num_bps * sizeof(struct xfs_buf*)); - if (!bplist) - do_error(_("realloc failed in %s (%zu bytes)\n"), - __func__, - num_bps * sizeof(struct xfs_buf*)); - /* Initialize the new elements */ - for (i = last_size; i < num_bps; i++) - bplist[i] = NULL; - } - if (isblock) ops = &xfs_dir3_block_buf_ops; else ops = &xfs_dir3_data_buf_ops; - error = dir_read_buf(ip, da_bno, &bplist[db], ops, &fixit); + error = dir_read_buf(ip, da_bno, &bp, ops, &fixit); if (error) { do_warn( _("can't read data block %u for directory inode %" PRIu64 " error %d\n"), @@ -2320,21 +2278,25 @@ longform_dir2_entry_check(xfs_mount_t *mp, } /* check v5 metadata */ - d = bplist[db]->b_addr; + d = bp->b_addr; if (be32_to_cpu(d->magic) == XFS_DIR3_BLOCK_MAGIC || be32_to_cpu(d->magic) == XFS_DIR3_DATA_MAGIC) { - struct xfs_buf *bp = bplist[db]; - error = check_dir3_header(mp, bp, ino); if (error) { fixit++; + if (isblock) + goto out_fix; continue; } } longform_dir2_entry_check_data(mp, ip, num_illegal, need_dot, - irec, ino_offset, &bplist[db], hashtab, + irec, ino_offset, bp, hashtab, &freetab, da_bno, isblock); + if (isblock) + break; + + libxfs_buf_relse(bp); } fixit |= (*num_illegal != 0) || dir2_is_badino(ino) || *need_dot; @@ -2345,7 +2307,7 @@ longform_dir2_entry_check(xfs_mount_t *mp, xfs_dir2_block_tail_t *btp; xfs_dir2_leaf_entry_t *blp; - block = bplist[0]->b_addr; + block = bp->b_addr; btp = xfs_dir2_block_tail_p(mp->m_dir_geo, block); blp = xfs_dir2_block_leaf_p(btp); seeval = dir_hash_see_all(hashtab, blp, @@ -2362,11 
+2324,10 @@ longform_dir2_entry_check(xfs_mount_t *mp, } } out_fix: + if (isblock && bp) + libxfs_buf_relse(bp); + if (!no_modify && (fixit || dotdot_update)) { - dir_hash_dup_names(hashtab); - for (i = 0; i < num_bps; i++) - if (bplist[i]) - libxfs_buf_relse(bplist[i]); longform_dir2_rebuild(mp, ino, ip, irec, ino_offset, hashtab); *num_illegal = 0; *need_dot = 0; @@ -2374,12 +2335,8 @@ out_fix: if (fixit || dotdot_update) do_warn( _("would rebuild directory inode %" PRIu64 "\n"), ino); - for (i = 0; i < num_bps; i++) - if (bplist[i]) - libxfs_buf_relse(bplist[i]); } - free(bplist); free(freetab); }
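The allocation trick in isolation: the entry and its name share one allocation, so the name needs no separate lifetime management and a single free() releases both. A minimal sketch (struct and function names are illustrative, not the repair code):

#include <stdlib.h>
#include <string.h>

/* The name lives in the bytes trailing the struct, copied there at
 * creation time, so no later duplication pass is needed. */
struct name_ent {
	struct name_ent		*next;
	int			namelen;
	const unsigned char	*name;		/* points at (p + 1) */
};

static struct name_ent *
name_ent_create(
	const unsigned char	*name,
	int			namelen)
{
	struct name_ent		*p;

	p = calloc(1, sizeof(*p) + namelen + 1);
	if (!p)
		return NULL;
	memcpy(p + 1, name, namelen);
	p->name = (const unsigned char *)(p + 1);
	p->namelen = namelen;
	return p;
}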
From patchwork Thu Oct 22 05:15:36 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11850271
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 6/7] repair: convert the dir byaddr hash to a radix tree
Date: Thu, 22 Oct 2020 16:15:36 +1100
Message-Id: <20201022051537.2286402-7-david@fromorbit.com>
In-Reply-To: <20201022051537.2286402-1-david@fromorbit.com>

Phase 6 uses a hash table to track the data segment addresses of the
entries it has seen in a directory. This is indexed by the offset into
the data segment for the dirent, and is used to check if the entry
exists, is a duplicate or has a bad hash value.

The lookup operations involve walking long hash chains on large
directories and they are done for every entry in the directory. This
means certain operations have O(n^2) scalability (or worse!) and hence
hurt on very large directories.

It is also used to determine if the directory has unseen entries,
which involves a full hash traversal that is very expensive on large
directories. Hence the "checking for unseen entries" part of directory
checking ends up being roughly an O(n^2 + n) algorithm.

Switch the byaddr indexing to a radix tree. While a radix tree will
burn more memory than the linked list, it gives us O(log n) lookup
operations instead of O(n) on large directories, and the use of tags
gives us O(1) determination of whether all entries have been seen or
not. This brings the "entry seen" algorithm scalability back to
O(n log n) and so is a major improvement for processing large
directories.

Given a filesystem with 10M empty files in a single directory, we see:

5.6.0:

  97.56%  xfs_repair    [.] dir_hash_add.lto_priv.0
   0.38%  xfs_repair    [.] avl_ino_start.lto_priv.0
   0.37%  libc-2.31.so  [.] malloc
   0.34%  xfs_repair    [.] longform_dir2_entry_check_data.lto_priv.0

Phase 6:        10/22 12:07:13  10/22 12:10:51  3 minutes, 38 seconds

Patched:

  97.11%  xfs_repair    [.] dir_hash_add
   0.38%  xfs_repair    [.] longform_dir2_entry_check_data
   0.34%  libc-2.31.so  [.] __libc_calloc
   0.32%  xfs_repair    [.] avl_ino_start

Phase 6:        10/22 12:11:40  10/22 12:14:28  2 minutes, 48 seconds

So there's some improvement, but we are clearly still CPU bound due to
the O(n^2) scalability of the duplicate name checking algorithm.

Signed-off-by: Dave Chinner
Reviewed-by: Darrick J. Wong
---
 libfrog/radix-tree.c |  46 +++++++++
 repair/phase6.c      | 222 ++++++++++++++++++++-----------------------
 2 files changed, 148 insertions(+), 120 deletions(-)

diff --git a/libfrog/radix-tree.c b/libfrog/radix-tree.c index c1c74876964c..261fc2487de9 100644 --- a/libfrog/radix-tree.c +++ b/libfrog/radix-tree.c @@ -312,6 +312,52 @@ void *radix_tree_lookup_first(struct radix_tree_root *root, unsigned long *index #ifdef RADIX_TREE_TAGS +/** + * radix_tree_tag_get - get a tag on a radix tree node + * @root: radix tree root + * @index: index key + * @tag: tag index (< RADIX_TREE_MAX_TAGS) + * + * Return values: + * + * 0: tag not present or not set + * 1: tag set + * + * Note that the return value of this function may not be relied on, even if + * the RCU lock is held, unless tag modification and node deletion are excluded + * from concurrency. + */ +int radix_tree_tag_get(struct radix_tree_root *root, + unsigned long index, unsigned int tag) +{ + unsigned int height, shift; + struct radix_tree_node *slot; + + height = root->height; + if (index > radix_tree_maxindex(height)) + return 0; + + shift = (height - 1) * RADIX_TREE_MAP_SHIFT; + slot = root->rnode; + + while (height > 0) { + int offset; + + if (slot == NULL) + return 0; + + offset = (index >> shift) & RADIX_TREE_MAP_MASK; + if (!tag_get(slot, tag, offset)) + return 0; + + slot = slot->slots[offset]; + ASSERT(slot != NULL); + shift -= RADIX_TREE_MAP_SHIFT; + height--; + } + return 1; +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root diff --git a/repair/phase6.c b/repair/phase6.c index 79c87495656f..21f49dd748e1 100644 --- a/repair/phase6.c +++ b/repair/phase6.c @@ -66,8 +66,7 @@ add_dotdot_update( * and whether their leaf entry has been seen. Also used for name * duplicate checking and rebuilding step if required.
*/ -typedef struct dir_hash_ent { - struct dir_hash_ent *nextbyaddr; /* next in addr bucket */ +struct dir_hash_ent { struct dir_hash_ent *nextbyhash; /* next in name bucket */ struct dir_hash_ent *nextbyorder; /* next in order added */ xfs_dahash_t hashval; /* hash value of name */ @@ -76,18 +75,19 @@ typedef struct dir_hash_ent { short junkit; /* name starts with / */ short seen; /* have seen leaf entry */ struct xfs_name name; -} dir_hash_ent_t; +}; -typedef struct dir_hash_tab { +struct dir_hash_tab { int size; /* size of hash tables */ - dir_hash_ent_t *first; /* ptr to first added entry */ - dir_hash_ent_t *last; /* ptr to last added entry */ - dir_hash_ent_t **byhash; /* ptr to name hash buckets */ - dir_hash_ent_t **byaddr; /* ptr to addr hash buckets */ -} dir_hash_tab_t; + struct dir_hash_ent *first; /* ptr to first added entry */ + struct dir_hash_ent *last; /* ptr to last added entry */ + struct dir_hash_ent **byhash; /* ptr to name hash buckets */ +#define HT_UNSEEN 1 + struct radix_tree_root byaddr; +}; #define DIR_HASH_TAB_SIZE(n) \ - (sizeof(dir_hash_tab_t) + (sizeof(dir_hash_ent_t *) * (n) * 2)) + (sizeof(struct dir_hash_tab) + (sizeof(struct dir_hash_ent *) * (n))) #define DIR_HASH_FUNC(t,a) ((a) % (t)->size) /* @@ -154,8 +154,8 @@ dir_read_buf( */ static int dir_hash_add( - xfs_mount_t *mp, - dir_hash_tab_t *hashtab, + struct xfs_mount *mp, + struct dir_hash_tab *hashtab, uint32_t addr, xfs_ino_t inum, int namelen, @@ -163,19 +163,18 @@ dir_hash_add( uint8_t ftype) { xfs_dahash_t hash = 0; - int byaddr; int byhash = 0; - dir_hash_ent_t *p; + struct dir_hash_ent *p; int dup; short junk; struct xfs_name xname; + int error; xname.name = name; xname.len = namelen; xname.type = ftype; junk = name[0] == '/'; - byaddr = DIR_HASH_FUNC(hashtab, addr); dup = 0; if (!junk) { @@ -205,8 +204,14 @@ dir_hash_add( do_error(_("malloc failed in dir_hash_add (%zu bytes)\n"), sizeof(*p)); - p->nextbyaddr = hashtab->byaddr[byaddr]; - hashtab->byaddr[byaddr] = p; + error = radix_tree_insert(&hashtab->byaddr, addr, p); + if (error == EEXIST) { + do_warn(_("duplicate addrs %u in directory!\n"), addr); + free(p); + return 0; + } + radix_tree_tag_set(&hashtab->byaddr, addr, HT_UNSEEN); + if (hashtab->last) hashtab->last->nextbyorder = p; else @@ -232,33 +237,14 @@ dir_hash_add( return !dup; } -/* - * checks to see if any data entries are not in the leaf blocks - */ -static int -dir_hash_unseen( - dir_hash_tab_t *hashtab) -{ - int i; - dir_hash_ent_t *p; - - for (i = 0; i < hashtab->size; i++) { - for (p = hashtab->byaddr[i]; p; p = p->nextbyaddr) { - if (p->seen == 0) - return 1; - } - } - return 0; -} - static int dir_hash_check( - dir_hash_tab_t *hashtab, - xfs_inode_t *ip, - int seeval) + struct dir_hash_tab *hashtab, + struct xfs_inode *ip, + int seeval) { - static char *seevalstr[DIR_HASH_CK_TOTAL]; - static int done; + static char *seevalstr[DIR_HASH_CK_TOTAL]; + static int done; if (!done) { seevalstr[DIR_HASH_CK_OK] = _("ok"); @@ -270,7 +256,8 @@ dir_hash_check( done = 1; } - if (seeval == DIR_HASH_CK_OK && dir_hash_unseen(hashtab)) + if (seeval == DIR_HASH_CK_OK && + radix_tree_tagged(&hashtab->byaddr, HT_UNSEEN)) seeval = DIR_HASH_CK_NOLEAF; if (seeval == DIR_HASH_CK_OK) return 0; @@ -285,27 +272,28 @@ dir_hash_check( static void dir_hash_done( - dir_hash_tab_t *hashtab) + struct dir_hash_tab *hashtab) { - int i; - dir_hash_ent_t *n; - dir_hash_ent_t *p; + int i; + struct dir_hash_ent *n; + struct dir_hash_ent *p; for (i = 0; i < hashtab->size; i++) { - for (p = hashtab->byaddr[i]; p; p 
= n) { - n = p->nextbyaddr; + for (p = hashtab->byhash[i]; p; p = n) { + n = p->nextbyhash; + radix_tree_delete(&hashtab->byaddr, p->address); free(p); } } free(hashtab); } -static dir_hash_tab_t * +static struct dir_hash_tab * dir_hash_init( - xfs_fsize_t size) + xfs_fsize_t size) { - dir_hash_tab_t *hashtab; - int hsize; + struct dir_hash_tab *hashtab; + int hsize; hsize = size / (16 * 4); if (hsize > 65536) @@ -315,51 +303,43 @@ dir_hash_init( if ((hashtab = calloc(DIR_HASH_TAB_SIZE(hsize), 1)) == NULL) do_error(_("calloc failed in dir_hash_init\n")); hashtab->size = hsize; - hashtab->byhash = (dir_hash_ent_t**)((char *)hashtab + - sizeof(dir_hash_tab_t)); - hashtab->byaddr = (dir_hash_ent_t**)((char *)hashtab + - sizeof(dir_hash_tab_t) + sizeof(dir_hash_ent_t*) * hsize); + hashtab->byhash = (struct dir_hash_ent **)((char *)hashtab + + sizeof(struct dir_hash_tab)); + INIT_RADIX_TREE(&hashtab->byaddr, 0); return hashtab; } static int dir_hash_see( - dir_hash_tab_t *hashtab, + struct dir_hash_tab *hashtab, xfs_dahash_t hash, xfs_dir2_dataptr_t addr) { - int i; - dir_hash_ent_t *p; + struct dir_hash_ent *p; - i = DIR_HASH_FUNC(hashtab, addr); - for (p = hashtab->byaddr[i]; p; p = p->nextbyaddr) { - if (p->address != addr) - continue; - if (p->seen) - return DIR_HASH_CK_DUPLEAF; - if (p->junkit == 0 && p->hashval != hash) - return DIR_HASH_CK_BADHASH; - p->seen = 1; - return DIR_HASH_CK_OK; - } - return DIR_HASH_CK_NODATA; + p = radix_tree_lookup(&hashtab->byaddr, addr); + if (!p) + return DIR_HASH_CK_NODATA; + if (!radix_tree_tag_get(&hashtab->byaddr, addr, HT_UNSEEN)) + return DIR_HASH_CK_DUPLEAF; + if (p->junkit == 0 && p->hashval != hash) + return DIR_HASH_CK_BADHASH; + radix_tree_tag_clear(&hashtab->byaddr, addr, HT_UNSEEN); + return DIR_HASH_CK_OK; } static void dir_hash_update_ftype( - dir_hash_tab_t *hashtab, + struct dir_hash_tab *hashtab, xfs_dir2_dataptr_t addr, uint8_t ftype) { - int i; - dir_hash_ent_t *p; + struct dir_hash_ent *p; - i = DIR_HASH_FUNC(hashtab, addr); - for (p = hashtab->byaddr[i]; p; p = p->nextbyaddr) { - if (p->address != addr) - continue; - p->name.type = ftype; - } + p = radix_tree_lookup(&hashtab->byaddr, addr); + if (!p) + return; + p->name.type = ftype; } /* @@ -368,7 +348,7 @@ dir_hash_update_ftype( */ static int dir_hash_see_all( - dir_hash_tab_t *hashtab, + struct dir_hash_tab *hashtab, xfs_dir2_leaf_entry_t *ents, int count, int stale) @@ -1226,19 +1206,19 @@ dir_binval( static void longform_dir2_rebuild( - xfs_mount_t *mp, + struct xfs_mount *mp, xfs_ino_t ino, - xfs_inode_t *ip, - ino_tree_node_t *irec, + struct xfs_inode *ip, + struct ino_tree_node *irec, int ino_offset, - dir_hash_tab_t *hashtab) + struct dir_hash_tab *hashtab) { int error; int nres; - xfs_trans_t *tp; + struct xfs_trans *tp; xfs_fileoff_t lastblock; - xfs_inode_t pip; - dir_hash_ent_t *p; + struct xfs_inode pip; + struct dir_hash_ent *p; int done = 0; /* @@ -1397,14 +1377,14 @@ _("directory shrink failed (%d)\n"), error); */ static void longform_dir2_entry_check_data( - xfs_mount_t *mp, - xfs_inode_t *ip, + struct xfs_mount *mp, + struct xfs_inode *ip, int *num_illegal, int *need_dot, - ino_tree_node_t *current_irec, + struct ino_tree_node *current_irec, int current_ino_offset, struct xfs_buf *bp, - dir_hash_tab_t *hashtab, + struct dir_hash_tab *hashtab, freetab_t **freetabp, xfs_dablk_t da_bno, int isblock) @@ -1931,10 +1911,10 @@ check_dir3_header( */ static int longform_dir2_check_leaf( - xfs_mount_t *mp, - xfs_inode_t *ip, - dir_hash_tab_t *hashtab, - freetab_t *freetab) + 
struct xfs_mount *mp, + struct xfs_inode *ip, + struct dir_hash_tab *hashtab, + struct freetab *freetab) { int badtail; __be16 *bestsp; @@ -2016,10 +1996,10 @@ longform_dir2_check_leaf( */ static int longform_dir2_check_node( - xfs_mount_t *mp, - xfs_inode_t *ip, - dir_hash_tab_t *hashtab, - freetab_t *freetab) + struct xfs_mount *mp, + struct xfs_inode *ip, + struct dir_hash_tab *hashtab, + struct freetab *freetab) { struct xfs_buf *bp; xfs_dablk_t da_bno; @@ -2191,14 +2171,15 @@ longform_dir2_check_node( * (ie. get libxfs to do all the grunt work) */ static void -longform_dir2_entry_check(xfs_mount_t *mp, - xfs_ino_t ino, - xfs_inode_t *ip, - int *num_illegal, - int *need_dot, - ino_tree_node_t *irec, - int ino_offset, - dir_hash_tab_t *hashtab) +longform_dir2_entry_check( + struct xfs_mount *mp, + xfs_ino_t ino, + struct xfs_inode *ip, + int *num_illegal, + int *need_dot, + struct ino_tree_node *irec, + int ino_offset, + struct dir_hash_tab *hashtab) { struct xfs_buf *bp; xfs_dablk_t da_bno; @@ -2401,13 +2382,14 @@ shortform_dir2_junk( } static void -shortform_dir2_entry_check(xfs_mount_t *mp, - xfs_ino_t ino, - xfs_inode_t *ip, - int *ino_dirty, - ino_tree_node_t *current_irec, - int current_ino_offset, - dir_hash_tab_t *hashtab) +shortform_dir2_entry_check( + struct xfs_mount *mp, + xfs_ino_t ino, + struct xfs_inode *ip, + int *ino_dirty, + struct ino_tree_node *current_irec, + int current_ino_offset, + struct dir_hash_tab *hashtab) { xfs_ino_t lino; xfs_ino_t parent; @@ -2749,15 +2731,15 @@ _("entry \"%s\" (ino %" PRIu64 ") in dir %" PRIu64 " is a duplicate name"), */ static void process_dir_inode( - xfs_mount_t *mp, + struct xfs_mount *mp, xfs_agnumber_t agno, - ino_tree_node_t *irec, + struct ino_tree_node *irec, int ino_offset) { xfs_ino_t ino; - xfs_inode_t *ip; - xfs_trans_t *tp; - dir_hash_tab_t *hashtab; + struct xfs_inode *ip; + struct xfs_trans *tp; + struct dir_hash_tab *hashtab; int need_dot; int dirty, num_illegal, error, nres;
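The seen/unseen bookkeeping the patch implements reduces to the following pattern (a sketch against the libfrog radix tree API used above; the header path is illustrative and error handling is trimmed):

#include <errno.h>
#include "radix-tree.h"		/* libfrog radix tree; path assumed */

#define HT_UNSEEN	1

/* Index entries by dirent address and tag each one "unseen" at insert
 * time; the tag is cleared when the matching leaf entry is seen. */
static int
track_entry(
	struct radix_tree_root	*byaddr,
	unsigned long		addr,
	void			*ent)
{
	if (radix_tree_insert(byaddr, addr, ent) == EEXIST)
		return -1;	/* duplicate address */
	radix_tree_tag_set(byaddr, addr, HT_UNSEEN);
	return 0;
}

static int
see_entry(
	struct radix_tree_root	*byaddr,
	unsigned long		addr)
{
	if (!radix_tree_lookup(byaddr, addr))
		return -1;	/* no data entry at this address */
	if (!radix_tree_tag_get(byaddr, addr, HT_UNSEEN))
		return 1;	/* seen twice: duplicate leaf entry */
	radix_tree_tag_clear(byaddr, addr, HT_UNSEEN);
	return 0;
}

/* O(1) "did the leaf index miss anything?" check: any entry still
 * tagged unseen means yes. */
static int
any_unseen(
	struct radix_tree_root	*byaddr)
{
	return radix_tree_tagged(byaddr, HT_UNSEEN);
}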
From patchwork Thu Oct 22 05:15:37 2020
X-Patchwork-Submitter: Dave Chinner
X-Patchwork-Id: 11850269
From: Dave Chinner
To: linux-xfs@vger.kernel.org
Subject: [PATCH 7/7] repair: scale duplicate name checking in phase 6
Date: Thu, 22 Oct 2020 16:15:37 +1100
Message-Id: <20201022051537.2286402-8-david@fromorbit.com>
In-Reply-To: <20201022051537.2286402-1-david@fromorbit.com>

Phase 6 on large directories is CPU bound on duplicate name checking
due to the algorithm having effectively O(n^2) scalability. When the
duplicate name hash table size is far smaller than the number of
directory entries, we end up with long hash chains that are searched
linearly on every new entry found in the directory during duplicate
detection.

The in-memory hash table size is limited to 64k entries. Hence when we
have millions of entries in a directory, duplicate entry lookups on
the hash table have substantial overhead. Scale this table out to
larger sizes so that we keep the chain lengths short and hence the
O(n^2) scalability impact is limited because N is always small.

For a 10M entry directory consuming 400MB of directory data, the hash
table now sizes at 6.4 million entries instead of ~64k - it is ~100x
larger. While the hash table now consumes ~50MB of RAM, the xfs_repair
footprint barely changes, as it is already consuming ~9GB of RAM at
this point in time. IOWs, the incremental memory usage change is
noise, but the directory checking time:

Unpatched:

  97.11%  xfs_repair    [.] dir_hash_add
   0.38%  xfs_repair    [.] longform_dir2_entry_check_data
   0.34%  libc-2.31.so  [.] __libc_calloc
   0.32%  xfs_repair    [.] avl_ino_start

Phase 6:        10/22 12:11:40  10/22 12:14:28  2 minutes, 48 seconds

Patched:

  46.74%  xfs_repair    [.] radix_tree_lookup
  32.13%  xfs_repair    [.] dir_hash_see_all
   7.70%  xfs_repair    [.] radix_tree_tag_get
   3.92%  xfs_repair    [.] dir_hash_add
   3.52%  xfs_repair    [.] radix_tree_tag_clear
   2.43%  xfs_repair    [.] crc32c_le

Phase 6:        10/22 13:11:01  10/22 13:11:18  17 seconds

has been reduced by an order of magnitude.

Signed-off-by: Dave Chinner
---
 repair/phase6.c | 30 ++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/repair/phase6.c b/repair/phase6.c index 21f49dd748e1..7dd6130056ee 100644 --- a/repair/phase6.c +++ b/repair/phase6.c @@ -288,19 +288,37 @@ dir_hash_done( free(hashtab); } +/* + * Create a directory hash index structure based on the size of the directory we + * are about to try to repair. The size passed in is the size of the data + * segment of the directory in bytes, so we don't really know exactly how many + * entries are in it. Hence assume an entry size of around 64 bytes - that's a + * name length of 40+ bytes so it should cover most situations with really + * large directories. + */ static struct dir_hash_tab * dir_hash_init( xfs_fsize_t size) { - struct dir_hash_tab *hashtab; + struct dir_hash_tab *hashtab = NULL; int hsize; - hsize = size / (16 * 4); - if (hsize > 65536) - hsize = 63336; - else if (hsize < 16) + hsize = size / 64; + if (hsize < 16) hsize = 16; - if ((hashtab = calloc(DIR_HASH_TAB_SIZE(hsize), 1)) == NULL) + + /* + * Try to allocate as large a hash table as possible.
Failure to + * allocate isn't fatal, it will just result in slower performance as we + * reduce the size of the table. + */ + while (hsize >= 16) { + hashtab = calloc(DIR_HASH_TAB_SIZE(hsize), 1); + if (hashtab) + break; + hsize /= 2; + } + if (!hashtab) do_error(_("calloc failed in dir_hash_init\n")); hashtab->size = hsize; hashtab->byhash = (struct dir_hash_ent **)((char *)hashtab +