Message ID: 20140115125144.GA2953@shrek.lan (mailing list archive)
State: New, archived
On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: > The following patches are reverted in this patch because these > patches caused regression in the unlink() calls. > > ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping > of dentry lock to ocfs2_wq > f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount > 5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in > ocfs2_kill_sb on a failed mount > > The regression is caused because these patches delay the iput() in case > of dentry unlocks. This also delays the unlocking of the open lockres. > The open lockresource is required to test if the inode can be wiped from > disk on not. When the deleting node does not get the open lock, it marks > it as orphan (even though it is not in use by another node/process) > and causes a journal checkpoint. This delays operations following the > inode eviction. This also moves the inode to the orphaned inode > which further causes more I/O and a lot of unneccessary orphans. > > As for the fix, I tried reproducing the problem by using quotas and enabling > lockdep, but the issue could not be reproduced. So I was thinking about this for a while trying to remember... What it really boils down to is whether it is OK to call ocfs2_delete_inode() from DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all kinds of interesting cluster locks meaning that the downconvert thread can block waiting to acquire some cluster lock. That seems like asking for trouble to me (i.e., what if the node holding the lock we need from downconvert thread needs a lock from us first?) but maybe it is OK these days. Honza > > Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com> > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> > > --- > diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c > index 0d3a97d..e2e05a1 100644 > --- a/fs/ocfs2/dcache.c > +++ b/fs/ocfs2/dcache.c > @@ -37,7 +37,6 @@ > #include "dlmglue.h" > #include "file.h" > #include "inode.h" > -#include "super.h" > #include "ocfs2_trace.h" > > void ocfs2_dentry_attach_gen(struct dentry *dentry) > @@ -346,52 +345,6 @@ out_attach: > return ret; > } > > -DEFINE_SPINLOCK(dentry_list_lock); > - > -/* We limit the number of dentry locks to drop in one go. We have > - * this limit so that we don't starve other users of ocfs2_wq. */ > -#define DL_INODE_DROP_COUNT 64 > - > -/* Drop inode references from dentry locks */ > -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) > -{ > - struct ocfs2_dentry_lock *dl; > - > - spin_lock(&dentry_list_lock); > - while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { > - dl = osb->dentry_lock_list; > - osb->dentry_lock_list = dl->dl_next; > - spin_unlock(&dentry_list_lock); > - iput(dl->dl_inode); > - kfree(dl); > - spin_lock(&dentry_list_lock); > - } > - spin_unlock(&dentry_list_lock); > -} > - > -void ocfs2_drop_dl_inodes(struct work_struct *work) > -{ > - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, > - dentry_lock_work); > - > - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); > - /* > - * Don't queue dropping if umount is in progress. 
We flush the > - * list in ocfs2_dismount_volume > - */ > - spin_lock(&dentry_list_lock); > - if (osb->dentry_lock_list && > - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) > - queue_work(ocfs2_wq, &osb->dentry_lock_work); > - spin_unlock(&dentry_list_lock); > -} > - > -/* Flush the whole work queue */ > -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) > -{ > - __ocfs2_drop_dl_inodes(osb, -1); > -} > - > /* > * ocfs2_dentry_iput() and friends. > * > @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) > static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, > struct ocfs2_dentry_lock *dl) > { > + iput(dl->dl_inode); > ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); > ocfs2_lock_res_free(&dl->dl_lockres); > - > - /* We leave dropping of inode reference to ocfs2_wq as that can > - * possibly lead to inode deletion which gets tricky */ > - spin_lock(&dentry_list_lock); > - if (!osb->dentry_lock_list && > - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) > - queue_work(ocfs2_wq, &osb->dentry_lock_work); > - dl->dl_next = osb->dentry_lock_list; > - osb->dentry_lock_list = dl; > - spin_unlock(&dentry_list_lock); > + kfree(dl); > } > > void ocfs2_dentry_lock_put(struct ocfs2_super *osb, > struct ocfs2_dentry_lock *dl) > { > - int unlock; > + int unlock = 0; > > BUG_ON(dl->dl_count == 0); > > diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h > index b79eff7..55f5889 100644 > --- a/fs/ocfs2/dcache.h > +++ b/fs/ocfs2/dcache.h > @@ -29,13 +29,8 @@ > extern const struct dentry_operations ocfs2_dentry_ops; > > struct ocfs2_dentry_lock { > - /* Use count of dentry lock */ > unsigned int dl_count; > - union { > - /* Linked list of dentry locks to release */ > - struct ocfs2_dentry_lock *dl_next; > - u64 dl_parent_blkno; > - }; > + u64 dl_parent_blkno; > > /* > * The ocfs2_dentry_lock keeps an inode reference until > @@ -49,14 +44,9 @@ struct ocfs2_dentry_lock { > int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, > u64 parent_blkno); > > -extern spinlock_t dentry_list_lock; > - > void ocfs2_dentry_lock_put(struct ocfs2_super *osb, > struct ocfs2_dentry_lock *dl); > > -void ocfs2_drop_dl_inodes(struct work_struct *work); > -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); > - > struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, > int skip_unhashed); > > diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h > index 553f53c..6be4e1d 100644 > --- a/fs/ocfs2/ocfs2.h > +++ b/fs/ocfs2/ocfs2.h > @@ -274,19 +274,16 @@ enum ocfs2_mount_options > OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ > }; > > -#define OCFS2_OSB_SOFT_RO 0x0001 > -#define OCFS2_OSB_HARD_RO 0x0002 > -#define OCFS2_OSB_ERROR_FS 0x0004 > -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 > - > -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 > +#define OCFS2_OSB_SOFT_RO 0x0001 > +#define OCFS2_OSB_HARD_RO 0x0002 > +#define OCFS2_OSB_ERROR_FS 0x0004 > +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 > > struct ocfs2_journal; > struct ocfs2_slot_info; > struct ocfs2_recovery_map; > struct ocfs2_replay_map; > struct ocfs2_quota_recovery; > -struct ocfs2_dentry_lock; > struct ocfs2_super > { > struct task_struct *commit_task; > @@ -414,11 +411,6 @@ struct ocfs2_super > struct list_head blocked_lock_list; > unsigned long blocked_lock_count; > > - /* List of dentry locks to release. 
Anyone can add locks to > - * the list, ocfs2_wq processes the list */ > - struct ocfs2_dentry_lock *dentry_lock_list; > - struct work_struct dentry_lock_work; > - > wait_queue_head_t osb_mount_event; > > /* Truncate log info */ > @@ -579,18 +571,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, > spin_unlock(&osb->osb_lock); > } > > - > -static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, > - unsigned long flag) > -{ > - unsigned long ret; > - > - spin_lock(&osb->osb_lock); > - ret = osb->osb_flags & flag; > - spin_unlock(&osb->osb_lock); > - return ret; > -} > - > static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, > int hard) > { > diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c > index 7d259ff..129b4db 100644 > --- a/fs/ocfs2/super.c > +++ b/fs/ocfs2/super.c > @@ -1238,30 +1238,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type, > return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); > } > > -static void ocfs2_kill_sb(struct super_block *sb) > -{ > - struct ocfs2_super *osb = OCFS2_SB(sb); > - > - /* Failed mount? */ > - if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) > - goto out; > - > - /* Prevent further queueing of inode drop events */ > - spin_lock(&dentry_list_lock); > - ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); > - spin_unlock(&dentry_list_lock); > - /* Wait for work to finish and/or remove it */ > - cancel_work_sync(&osb->dentry_lock_work); > -out: > - kill_block_super(sb); > -} > - > static struct file_system_type ocfs2_fs_type = { > .owner = THIS_MODULE, > .name = "ocfs2", > .mount = ocfs2_mount, > - .kill_sb = ocfs2_kill_sb, > - > + .kill_sb = kill_block_super, > .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, > .next = NULL > }; > @@ -1934,12 +1915,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) > > debugfs_remove(osb->osb_ctxt); > > - /* > - * Flush inode dropping work queue so that deletes are > - * performed while the filesystem is still working > - */ > - ocfs2_drop_all_dl_inodes(osb); > - > /* Orphan scan should be stopped as early as possible */ > ocfs2_orphan_scan_stop(osb); > > @@ -2274,9 +2249,6 @@ static int ocfs2_initialize_super(struct super_block *sb, > INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); > journal->j_state = OCFS2_JOURNAL_FREE; > > - INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); > - osb->dentry_lock_list = NULL; > - > /* get some pseudo constants for clustersize bits */ > osb->s_clustersize_bits = > le32_to_cpu(di->id2.i_super.s_clustersize_bits); > > -- > Goldwyn
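For readers following the wipe logic the commit message refers to: when a node is done with an unlinked inode, it takes the inode's open lock in try-lock fashion; success means no other node still holds the inode open, so it can be wiped immediately, while failure sends the inode through the orphan directory and a journal checkpoint. The reverted patches deferred the final iput() to ocfs2_wq, which kept the open lock held longer and made the deleting node's try-lock fail even when nobody was really using the inode. Below is a minimal userspace model of that decision, with plain pthread mutexes standing in for cluster locks and invented names throughout; it is not ocfs2 code. Build with cc -pthread.

#include <pthread.h>
#include <stdio.h>

/* Stand-in for the per-inode "open" cluster lock; purely illustrative. */
static pthread_mutex_t open_lock = PTHREAD_MUTEX_INITIALIZER;

/* Deleting node: the try-lock decides between "wipe now" and "orphan it". */
static void try_to_wipe(const char *situation)
{
	if (pthread_mutex_trylock(&open_lock) == 0) {
		printf("%s: got the open lock, wipe the inode now\n", situation);
		pthread_mutex_unlock(&open_lock);
	} else {
		printf("%s: open lock busy, inode goes to the orphan dir\n", situation);
	}
}

int main(void)
{
	/* Case 1: the last user released the open lock promptly. */
	try_to_wipe("prompt iput");

	/* Case 2: the open lock is still held (taken here in main to stand in
	 * for a node whose final iput, and hence lock release, was deferred);
	 * the try-lock fails and the inode is orphaned needlessly. */
	pthread_mutex_lock(&open_lock);
	try_to_wipe("deferred iput");
	pthread_mutex_unlock(&open_lock);
	return 0;
}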
On 01/15/2014 07:33 AM, Jan Kara wrote: > On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: >> The following patches are reverted in this patch because these >> patches caused regression in the unlink() calls. >> >> ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping >> of dentry lock to ocfs2_wq >> f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount >> 5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in >> ocfs2_kill_sb on a failed mount >> >> The regression is caused because these patches delay the iput() in case >> of dentry unlocks. This also delays the unlocking of the open lockres. >> The open lockresource is required to test if the inode can be wiped from >> disk on not. When the deleting node does not get the open lock, it marks >> it as orphan (even though it is not in use by another node/process) >> and causes a journal checkpoint. This delays operations following the >> inode eviction. This also moves the inode to the orphaned inode >> which further causes more I/O and a lot of unneccessary orphans. >> >> As for the fix, I tried reproducing the problem by using quotas and enabling >> lockdep, but the issue could not be reproduced. > So I was thinking about this for a while trying to remember... What it > really boils down to is whether it is OK to call ocfs2_delete_inode() from > DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all > kinds of interesting cluster locks meaning that the downconvert thread can > block waiting to acquire some cluster lock. That seems like asking for > trouble to me (i.e., what if the node holding the lock we need from > downconvert thread needs a lock from us first?) but maybe it is OK these > days. > The only lock it tries to take is the "inode lock" resource, which seems to be fine to take. It is taken to refresh the inode from disk. The other cluster lock is the "open lock", which is taken in a try-lock fashion and is not blocking. I don't see quota code using any cluster locking, but if there is anything interfering, I would like to know. > Honza > >> >> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com> >> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> >> >> --- >> diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c >> index 0d3a97d..e2e05a1 100644 >> --- a/fs/ocfs2/dcache.c >> +++ b/fs/ocfs2/dcache.c >> @@ -37,7 +37,6 @@ >> #include "dlmglue.h" >> #include "file.h" >> #include "inode.h" >> -#include "super.h" >> #include "ocfs2_trace.h" >> >> void ocfs2_dentry_attach_gen(struct dentry *dentry) >> @@ -346,52 +345,6 @@ out_attach: >> return ret; >> } >> >> -DEFINE_SPINLOCK(dentry_list_lock); >> - >> -/* We limit the number of dentry locks to drop in one go. We have >> - * this limit so that we don't starve other users of ocfs2_wq. 
*/ >> -#define DL_INODE_DROP_COUNT 64 >> - >> -/* Drop inode references from dentry locks */ >> -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) >> -{ >> - struct ocfs2_dentry_lock *dl; >> - >> - spin_lock(&dentry_list_lock); >> - while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { >> - dl = osb->dentry_lock_list; >> - osb->dentry_lock_list = dl->dl_next; >> - spin_unlock(&dentry_list_lock); >> - iput(dl->dl_inode); >> - kfree(dl); >> - spin_lock(&dentry_list_lock); >> - } >> - spin_unlock(&dentry_list_lock); >> -} >> - >> -void ocfs2_drop_dl_inodes(struct work_struct *work) >> -{ >> - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, >> - dentry_lock_work); >> - >> - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); >> - /* >> - * Don't queue dropping if umount is in progress. We flush the >> - * list in ocfs2_dismount_volume >> - */ >> - spin_lock(&dentry_list_lock); >> - if (osb->dentry_lock_list && >> - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) >> - queue_work(ocfs2_wq, &osb->dentry_lock_work); >> - spin_unlock(&dentry_list_lock); >> -} >> - >> -/* Flush the whole work queue */ >> -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) >> -{ >> - __ocfs2_drop_dl_inodes(osb, -1); >> -} >> - >> /* >> * ocfs2_dentry_iput() and friends. >> * >> @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) >> static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, >> struct ocfs2_dentry_lock *dl) >> { >> + iput(dl->dl_inode); >> ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); >> ocfs2_lock_res_free(&dl->dl_lockres); >> - >> - /* We leave dropping of inode reference to ocfs2_wq as that can >> - * possibly lead to inode deletion which gets tricky */ >> - spin_lock(&dentry_list_lock); >> - if (!osb->dentry_lock_list && >> - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) >> - queue_work(ocfs2_wq, &osb->dentry_lock_work); >> - dl->dl_next = osb->dentry_lock_list; >> - osb->dentry_lock_list = dl; >> - spin_unlock(&dentry_list_lock); >> + kfree(dl); >> } >> >> void ocfs2_dentry_lock_put(struct ocfs2_super *osb, >> struct ocfs2_dentry_lock *dl) >> { >> - int unlock; >> + int unlock = 0; >> >> BUG_ON(dl->dl_count == 0); >> >> diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h >> index b79eff7..55f5889 100644 >> --- a/fs/ocfs2/dcache.h >> +++ b/fs/ocfs2/dcache.h >> @@ -29,13 +29,8 @@ >> extern const struct dentry_operations ocfs2_dentry_ops; >> >> struct ocfs2_dentry_lock { >> - /* Use count of dentry lock */ >> unsigned int dl_count; >> - union { >> - /* Linked list of dentry locks to release */ >> - struct ocfs2_dentry_lock *dl_next; >> - u64 dl_parent_blkno; >> - }; >> + u64 dl_parent_blkno; >> >> /* >> * The ocfs2_dentry_lock keeps an inode reference until >> @@ -49,14 +44,9 @@ struct ocfs2_dentry_lock { >> int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, >> u64 parent_blkno); >> >> -extern spinlock_t dentry_list_lock; >> - >> void ocfs2_dentry_lock_put(struct ocfs2_super *osb, >> struct ocfs2_dentry_lock *dl); >> >> -void ocfs2_drop_dl_inodes(struct work_struct *work); >> -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); >> - >> struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, >> int skip_unhashed); >> >> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h >> index 553f53c..6be4e1d 100644 >> --- a/fs/ocfs2/ocfs2.h >> +++ b/fs/ocfs2/ocfs2.h >> @@ -274,19 +274,16 @@ enum ocfs2_mount_options >> OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* 
Global heartbeat */ >> }; >> >> -#define OCFS2_OSB_SOFT_RO 0x0001 >> -#define OCFS2_OSB_HARD_RO 0x0002 >> -#define OCFS2_OSB_ERROR_FS 0x0004 >> -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 >> - >> -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 >> +#define OCFS2_OSB_SOFT_RO 0x0001 >> +#define OCFS2_OSB_HARD_RO 0x0002 >> +#define OCFS2_OSB_ERROR_FS 0x0004 >> +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 >> >> struct ocfs2_journal; >> struct ocfs2_slot_info; >> struct ocfs2_recovery_map; >> struct ocfs2_replay_map; >> struct ocfs2_quota_recovery; >> -struct ocfs2_dentry_lock; >> struct ocfs2_super >> { >> struct task_struct *commit_task; >> @@ -414,11 +411,6 @@ struct ocfs2_super >> struct list_head blocked_lock_list; >> unsigned long blocked_lock_count; >> >> - /* List of dentry locks to release. Anyone can add locks to >> - * the list, ocfs2_wq processes the list */ >> - struct ocfs2_dentry_lock *dentry_lock_list; >> - struct work_struct dentry_lock_work; >> - >> wait_queue_head_t osb_mount_event; >> >> /* Truncate log info */ >> @@ -579,18 +571,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, >> spin_unlock(&osb->osb_lock); >> } >> >> - >> -static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, >> - unsigned long flag) >> -{ >> - unsigned long ret; >> - >> - spin_lock(&osb->osb_lock); >> - ret = osb->osb_flags & flag; >> - spin_unlock(&osb->osb_lock); >> - return ret; >> -} >> - >> static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, >> int hard) >> { >> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c >> index 7d259ff..129b4db 100644 >> --- a/fs/ocfs2/super.c >> +++ b/fs/ocfs2/super.c >> @@ -1238,30 +1238,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type, >> return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); >> } >> >> -static void ocfs2_kill_sb(struct super_block *sb) >> -{ >> - struct ocfs2_super *osb = OCFS2_SB(sb); >> - >> - /* Failed mount? */ >> - if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) >> - goto out; >> - >> - /* Prevent further queueing of inode drop events */ >> - spin_lock(&dentry_list_lock); >> - ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); >> - spin_unlock(&dentry_list_lock); >> - /* Wait for work to finish and/or remove it */ >> - cancel_work_sync(&osb->dentry_lock_work); >> -out: >> - kill_block_super(sb); >> -} >> - >> static struct file_system_type ocfs2_fs_type = { >> .owner = THIS_MODULE, >> .name = "ocfs2", >> .mount = ocfs2_mount, >> - .kill_sb = ocfs2_kill_sb, >> - >> + .kill_sb = kill_block_super, >> .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, >> .next = NULL >> }; >> @@ -1934,12 +1915,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) >> >> debugfs_remove(osb->osb_ctxt); >> >> - /* >> - * Flush inode dropping work queue so that deletes are >> - * performed while the filesystem is still working >> - */ >> - ocfs2_drop_all_dl_inodes(osb); >> - >> /* Orphan scan should be stopped as early as possible */ >> ocfs2_orphan_scan_stop(osb); >> >> @@ -2274,9 +2249,6 @@ static int ocfs2_initialize_super(struct super_block *sb, >> INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); >> journal->j_state = OCFS2_JOURNAL_FREE; >> >> - INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); >> - osb->dentry_lock_list = NULL; >> - >> /* get some pseudo constants for clustersize bits */ >> osb->s_clustersize_bits = >> le32_to_cpu(di->id2.i_super.s_clustersize_bits); >> >> -- >> Goldwyn
On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: > On 01/15/2014 07:33 AM, Jan Kara wrote: > >On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: > >>The following patches are reverted in this patch because these > >>patches caused regression in the unlink() calls. > >> > >>ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping > >>of dentry lock to ocfs2_wq > >>f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount > >>5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in > >>ocfs2_kill_sb on a failed mount > >> > >>The regression is caused because these patches delay the iput() in case > >>of dentry unlocks. This also delays the unlocking of the open lockres. > >>The open lockresource is required to test if the inode can be wiped from > >>disk on not. When the deleting node does not get the open lock, it marks > >>it as orphan (even though it is not in use by another node/process) > >>and causes a journal checkpoint. This delays operations following the > >>inode eviction. This also moves the inode to the orphaned inode > >>which further causes more I/O and a lot of unneccessary orphans. > >> > >>As for the fix, I tried reproducing the problem by using quotas and enabling > >>lockdep, but the issue could not be reproduced. > > So I was thinking about this for a while trying to remember... What it > >really boils down to is whether it is OK to call ocfs2_delete_inode() from > >DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all > >kinds of interesting cluster locks meaning that the downconvert thread can > >block waiting to acquire some cluster lock. That seems like asking for > >trouble to me (i.e., what if the node holding the lock we need from > >downconvert thread needs a lock from us first?) but maybe it is OK these > >days. > > > > The only lock it tries to take is the "inode lock" resource, which > seems to be fine to take. Why is it obviously fine? Some other node might be holding the inode lock and still write to the inode. If that needs a cluster lock for block allocation and that allocation cluster lock is currently hold by our node, we are screwed. Aren't we? > It is taken to refresh the inode from > disk. The other cluster lock is the "open lock", which is taken in a > try-lock fashion and is not blocking. > > I don't see quota code using any cluster locking, but if there is > anything interfering, I would like to know. Quota code can take e.g. inode lock on the quota file during dquot_initialize() call. It's not common (usually the quota structure is already cached in memory) but it can happen. Honza > >>Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com> > >>Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> > >> > >>--- > >>diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c > >>index 0d3a97d..e2e05a1 100644 > >>--- a/fs/ocfs2/dcache.c > >>+++ b/fs/ocfs2/dcache.c > >>@@ -37,7 +37,6 @@ > >> #include "dlmglue.h" > >> #include "file.h" > >> #include "inode.h" > >>-#include "super.h" > >> #include "ocfs2_trace.h" > >> > >> void ocfs2_dentry_attach_gen(struct dentry *dentry) > >>@@ -346,52 +345,6 @@ out_attach: > >> return ret; > >> } > >> > >>-DEFINE_SPINLOCK(dentry_list_lock); > >>- > >>-/* We limit the number of dentry locks to drop in one go. We have > >>- * this limit so that we don't starve other users of ocfs2_wq. 
*/ > >>-#define DL_INODE_DROP_COUNT 64 > >>- > >>-/* Drop inode references from dentry locks */ > >>-static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) > >>-{ > >>- struct ocfs2_dentry_lock *dl; > >>- > >>- spin_lock(&dentry_list_lock); > >>- while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { > >>- dl = osb->dentry_lock_list; > >>- osb->dentry_lock_list = dl->dl_next; > >>- spin_unlock(&dentry_list_lock); > >>- iput(dl->dl_inode); > >>- kfree(dl); > >>- spin_lock(&dentry_list_lock); > >>- } > >>- spin_unlock(&dentry_list_lock); > >>-} > >>- > >>-void ocfs2_drop_dl_inodes(struct work_struct *work) > >>-{ > >>- struct ocfs2_super *osb = container_of(work, struct ocfs2_super, > >>- dentry_lock_work); > >>- > >>- __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); > >>- /* > >>- * Don't queue dropping if umount is in progress. We flush the > >>- * list in ocfs2_dismount_volume > >>- */ > >>- spin_lock(&dentry_list_lock); > >>- if (osb->dentry_lock_list && > >>- !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) > >>- queue_work(ocfs2_wq, &osb->dentry_lock_work); > >>- spin_unlock(&dentry_list_lock); > >>-} > >>- > >>-/* Flush the whole work queue */ > >>-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) > >>-{ > >>- __ocfs2_drop_dl_inodes(osb, -1); > >>-} > >>- > >> /* > >> * ocfs2_dentry_iput() and friends. > >> * > >>@@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) > >> static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, > >> struct ocfs2_dentry_lock *dl) > >> { > >>+ iput(dl->dl_inode); > >> ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); > >> ocfs2_lock_res_free(&dl->dl_lockres); > >>- > >>- /* We leave dropping of inode reference to ocfs2_wq as that can > >>- * possibly lead to inode deletion which gets tricky */ > >>- spin_lock(&dentry_list_lock); > >>- if (!osb->dentry_lock_list && > >>- !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) > >>- queue_work(ocfs2_wq, &osb->dentry_lock_work); > >>- dl->dl_next = osb->dentry_lock_list; > >>- osb->dentry_lock_list = dl; > >>- spin_unlock(&dentry_list_lock); > >>+ kfree(dl); > >> } > >> > >> void ocfs2_dentry_lock_put(struct ocfs2_super *osb, > >> struct ocfs2_dentry_lock *dl) > >> { > >>- int unlock; > >>+ int unlock = 0; > >> > >> BUG_ON(dl->dl_count == 0); > >> > >>diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h > >>index b79eff7..55f5889 100644 > >>--- a/fs/ocfs2/dcache.h > >>+++ b/fs/ocfs2/dcache.h > >>@@ -29,13 +29,8 @@ > >> extern const struct dentry_operations ocfs2_dentry_ops; > >> > >> struct ocfs2_dentry_lock { > >>- /* Use count of dentry lock */ > >> unsigned int dl_count; > >>- union { > >>- /* Linked list of dentry locks to release */ > >>- struct ocfs2_dentry_lock *dl_next; > >>- u64 dl_parent_blkno; > >>- }; > >>+ u64 dl_parent_blkno; > >> > >> /* > >> * The ocfs2_dentry_lock keeps an inode reference until > >>@@ -49,14 +44,9 @@ struct ocfs2_dentry_lock { > >> int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, > >> u64 parent_blkno); > >> > >>-extern spinlock_t dentry_list_lock; > >>- > >> void ocfs2_dentry_lock_put(struct ocfs2_super *osb, > >> struct ocfs2_dentry_lock *dl); > >> > >>-void ocfs2_drop_dl_inodes(struct work_struct *work); > >>-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); > >>- > >> struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, > >> int skip_unhashed); > >> > >>diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h > >>index 553f53c..6be4e1d 100644 > 
>>--- a/fs/ocfs2/ocfs2.h > >>+++ b/fs/ocfs2/ocfs2.h > >>@@ -274,19 +274,16 @@ enum ocfs2_mount_options > >> OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ > >> }; > >> > >>-#define OCFS2_OSB_SOFT_RO 0x0001 > >>-#define OCFS2_OSB_HARD_RO 0x0002 > >>-#define OCFS2_OSB_ERROR_FS 0x0004 > >>-#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 > >>- > >>-#define OCFS2_DEFAULT_ATIME_QUANTUM 60 > >>+#define OCFS2_OSB_SOFT_RO 0x0001 > >>+#define OCFS2_OSB_HARD_RO 0x0002 > >>+#define OCFS2_OSB_ERROR_FS 0x0004 > >>+#define OCFS2_DEFAULT_ATIME_QUANTUM 60 > >> > >> struct ocfs2_journal; > >> struct ocfs2_slot_info; > >> struct ocfs2_recovery_map; > >> struct ocfs2_replay_map; > >> struct ocfs2_quota_recovery; > >>-struct ocfs2_dentry_lock; > >> struct ocfs2_super > >> { > >> struct task_struct *commit_task; > >>@@ -414,11 +411,6 @@ struct ocfs2_super > >> struct list_head blocked_lock_list; > >> unsigned long blocked_lock_count; > >> > >>- /* List of dentry locks to release. Anyone can add locks to > >>- * the list, ocfs2_wq processes the list */ > >>- struct ocfs2_dentry_lock *dentry_lock_list; > >>- struct work_struct dentry_lock_work; > >>- > >> wait_queue_head_t osb_mount_event; > >> > >> /* Truncate log info */ > >>@@ -579,18 +571,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, > >> spin_unlock(&osb->osb_lock); > >> } > >> > >>- > >>-static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, > >>- unsigned long flag) > >>-{ > >>- unsigned long ret; > >>- > >>- spin_lock(&osb->osb_lock); > >>- ret = osb->osb_flags & flag; > >>- spin_unlock(&osb->osb_lock); > >>- return ret; > >>-} > >>- > >> static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, > >> int hard) > >> { > >>diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c > >>index 7d259ff..129b4db 100644 > >>--- a/fs/ocfs2/super.c > >>+++ b/fs/ocfs2/super.c > >>@@ -1238,30 +1238,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type, > >> return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); > >> } > >> > >>-static void ocfs2_kill_sb(struct super_block *sb) > >>-{ > >>- struct ocfs2_super *osb = OCFS2_SB(sb); > >>- > >>- /* Failed mount? 
*/ > >>- if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) > >>- goto out; > >>- > >>- /* Prevent further queueing of inode drop events */ > >>- spin_lock(&dentry_list_lock); > >>- ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); > >>- spin_unlock(&dentry_list_lock); > >>- /* Wait for work to finish and/or remove it */ > >>- cancel_work_sync(&osb->dentry_lock_work); > >>-out: > >>- kill_block_super(sb); > >>-} > >>- > >> static struct file_system_type ocfs2_fs_type = { > >> .owner = THIS_MODULE, > >> .name = "ocfs2", > >> .mount = ocfs2_mount, > >>- .kill_sb = ocfs2_kill_sb, > >>- > >>+ .kill_sb = kill_block_super, > >> .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, > >> .next = NULL > >> }; > >>@@ -1934,12 +1915,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) > >> > >> debugfs_remove(osb->osb_ctxt); > >> > >>- /* > >>- * Flush inode dropping work queue so that deletes are > >>- * performed while the filesystem is still working > >>- */ > >>- ocfs2_drop_all_dl_inodes(osb); > >>- > >> /* Orphan scan should be stopped as early as possible */ > >> ocfs2_orphan_scan_stop(osb); > >> > >>@@ -2274,9 +2249,6 @@ static int ocfs2_initialize_super(struct super_block *sb, > >> INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); > >> journal->j_state = OCFS2_JOURNAL_FREE; > >> > >>- INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); > >>- osb->dentry_lock_list = NULL; > >>- > >> /* get some pseudo constants for clustersize bits */ > >> osb->s_clustersize_bits = > >> le32_to_cpu(di->id2.i_super.s_clustersize_bits); > >> > >>-- > >>Goldwyn > > > -- > Goldwyn
On 01/15/2014 09:53 AM, Jan Kara wrote: > On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: >> On 01/15/2014 07:33 AM, Jan Kara wrote: >>> On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: >>>> The following patches are reverted in this patch because these >>>> patches caused regression in the unlink() calls. >>>> >>>> ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping >>>> of dentry lock to ocfs2_wq >>>> f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount >>>> 5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in >>>> ocfs2_kill_sb on a failed mount >>>> >>>> The regression is caused because these patches delay the iput() in case >>>> of dentry unlocks. This also delays the unlocking of the open lockres. >>>> The open lockresource is required to test if the inode can be wiped from >>>> disk on not. When the deleting node does not get the open lock, it marks >>>> it as orphan (even though it is not in use by another node/process) >>>> and causes a journal checkpoint. This delays operations following the >>>> inode eviction. This also moves the inode to the orphaned inode >>>> which further causes more I/O and a lot of unneccessary orphans. >>>> >>>> As for the fix, I tried reproducing the problem by using quotas and enabling >>>> lockdep, but the issue could not be reproduced. >>> So I was thinking about this for a while trying to remember... What it >>> really boils down to is whether it is OK to call ocfs2_delete_inode() from >>> DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all >>> kinds of interesting cluster locks meaning that the downconvert thread can >>> block waiting to acquire some cluster lock. That seems like asking for >>> trouble to me (i.e., what if the node holding the lock we need from >>> downconvert thread needs a lock from us first?) but maybe it is OK these >>> days. >>> >> >> The only lock it tries to take is the "inode lock" resource, which >> seems to be fine to take. > Why is it obviously fine? Some other node might be holding the inode lock > and still write to the inode. If that needs a cluster lock for block > allocation and that allocation cluster lock is currently hold by our node, > we are screwed. Aren't we? > No. If another thread is using the allocation cluster lock implies we already have the inode lock. Cluster locks are also taken in a strict order in order to avoid ABBA locking. >> It is taken to refresh the inode from >> disk. The other cluster lock is the "open lock", which is taken in a >> try-lock fashion and is not blocking. >> >> I don't see quota code using any cluster locking, but if there is >> anything interfering, I would like to know. > Quota code can take e.g. inode lock on the quota file during > dquot_initialize() call. It's not common (usually the quota structure is > already cached in memory) but it can happen. This seems safe, because cluster lock granularity (for ocfs2) is node level. So, if the node already holds the lock, then taking it again would succeed and just increment the hold count. 
> > Honza > >>>> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com> >>>> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com> >>>> >>>> --- >>>> diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c >>>> index 0d3a97d..e2e05a1 100644 >>>> --- a/fs/ocfs2/dcache.c >>>> +++ b/fs/ocfs2/dcache.c >>>> @@ -37,7 +37,6 @@ >>>> #include "dlmglue.h" >>>> #include "file.h" >>>> #include "inode.h" >>>> -#include "super.h" >>>> #include "ocfs2_trace.h" >>>> >>>> void ocfs2_dentry_attach_gen(struct dentry *dentry) >>>> @@ -346,52 +345,6 @@ out_attach: >>>> return ret; >>>> } >>>> >>>> -DEFINE_SPINLOCK(dentry_list_lock); >>>> - >>>> -/* We limit the number of dentry locks to drop in one go. We have >>>> - * this limit so that we don't starve other users of ocfs2_wq. */ >>>> -#define DL_INODE_DROP_COUNT 64 >>>> - >>>> -/* Drop inode references from dentry locks */ >>>> -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) >>>> -{ >>>> - struct ocfs2_dentry_lock *dl; >>>> - >>>> - spin_lock(&dentry_list_lock); >>>> - while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { >>>> - dl = osb->dentry_lock_list; >>>> - osb->dentry_lock_list = dl->dl_next; >>>> - spin_unlock(&dentry_list_lock); >>>> - iput(dl->dl_inode); >>>> - kfree(dl); >>>> - spin_lock(&dentry_list_lock); >>>> - } >>>> - spin_unlock(&dentry_list_lock); >>>> -} >>>> - >>>> -void ocfs2_drop_dl_inodes(struct work_struct *work) >>>> -{ >>>> - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, >>>> - dentry_lock_work); >>>> - >>>> - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); >>>> - /* >>>> - * Don't queue dropping if umount is in progress. We flush the >>>> - * list in ocfs2_dismount_volume >>>> - */ >>>> - spin_lock(&dentry_list_lock); >>>> - if (osb->dentry_lock_list && >>>> - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) >>>> - queue_work(ocfs2_wq, &osb->dentry_lock_work); >>>> - spin_unlock(&dentry_list_lock); >>>> -} >>>> - >>>> -/* Flush the whole work queue */ >>>> -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) >>>> -{ >>>> - __ocfs2_drop_dl_inodes(osb, -1); >>>> -} >>>> - >>>> /* >>>> * ocfs2_dentry_iput() and friends. 
>>>> * >>>> @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) >>>> static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, >>>> struct ocfs2_dentry_lock *dl) >>>> { >>>> + iput(dl->dl_inode); >>>> ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); >>>> ocfs2_lock_res_free(&dl->dl_lockres); >>>> - >>>> - /* We leave dropping of inode reference to ocfs2_wq as that can >>>> - * possibly lead to inode deletion which gets tricky */ >>>> - spin_lock(&dentry_list_lock); >>>> - if (!osb->dentry_lock_list && >>>> - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) >>>> - queue_work(ocfs2_wq, &osb->dentry_lock_work); >>>> - dl->dl_next = osb->dentry_lock_list; >>>> - osb->dentry_lock_list = dl; >>>> - spin_unlock(&dentry_list_lock); >>>> + kfree(dl); >>>> } >>>> >>>> void ocfs2_dentry_lock_put(struct ocfs2_super *osb, >>>> struct ocfs2_dentry_lock *dl) >>>> { >>>> - int unlock; >>>> + int unlock = 0; >>>> >>>> BUG_ON(dl->dl_count == 0); >>>> >>>> diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h >>>> index b79eff7..55f5889 100644 >>>> --- a/fs/ocfs2/dcache.h >>>> +++ b/fs/ocfs2/dcache.h >>>> @@ -29,13 +29,8 @@ >>>> extern const struct dentry_operations ocfs2_dentry_ops; >>>> >>>> struct ocfs2_dentry_lock { >>>> - /* Use count of dentry lock */ >>>> unsigned int dl_count; >>>> - union { >>>> - /* Linked list of dentry locks to release */ >>>> - struct ocfs2_dentry_lock *dl_next; >>>> - u64 dl_parent_blkno; >>>> - }; >>>> + u64 dl_parent_blkno; >>>> >>>> /* >>>> * The ocfs2_dentry_lock keeps an inode reference until >>>> @@ -49,14 +44,9 @@ struct ocfs2_dentry_lock { >>>> int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, >>>> u64 parent_blkno); >>>> >>>> -extern spinlock_t dentry_list_lock; >>>> - >>>> void ocfs2_dentry_lock_put(struct ocfs2_super *osb, >>>> struct ocfs2_dentry_lock *dl); >>>> >>>> -void ocfs2_drop_dl_inodes(struct work_struct *work); >>>> -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); >>>> - >>>> struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, >>>> int skip_unhashed); >>>> >>>> diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h >>>> index 553f53c..6be4e1d 100644 >>>> --- a/fs/ocfs2/ocfs2.h >>>> +++ b/fs/ocfs2/ocfs2.h >>>> @@ -274,19 +274,16 @@ enum ocfs2_mount_options >>>> OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ >>>> }; >>>> >>>> -#define OCFS2_OSB_SOFT_RO 0x0001 >>>> -#define OCFS2_OSB_HARD_RO 0x0002 >>>> -#define OCFS2_OSB_ERROR_FS 0x0004 >>>> -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 >>>> - >>>> -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 >>>> +#define OCFS2_OSB_SOFT_RO 0x0001 >>>> +#define OCFS2_OSB_HARD_RO 0x0002 >>>> +#define OCFS2_OSB_ERROR_FS 0x0004 >>>> +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 >>>> >>>> struct ocfs2_journal; >>>> struct ocfs2_slot_info; >>>> struct ocfs2_recovery_map; >>>> struct ocfs2_replay_map; >>>> struct ocfs2_quota_recovery; >>>> -struct ocfs2_dentry_lock; >>>> struct ocfs2_super >>>> { >>>> struct task_struct *commit_task; >>>> @@ -414,11 +411,6 @@ struct ocfs2_super >>>> struct list_head blocked_lock_list; >>>> unsigned long blocked_lock_count; >>>> >>>> - /* List of dentry locks to release. 
Anyone can add locks to >>>> - * the list, ocfs2_wq processes the list */ >>>> - struct ocfs2_dentry_lock *dentry_lock_list; >>>> - struct work_struct dentry_lock_work; >>>> - >>>> wait_queue_head_t osb_mount_event; >>>> >>>> /* Truncate log info */ >>>> @@ -579,18 +571,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, >>>> spin_unlock(&osb->osb_lock); >>>> } >>>> >>>> - >>>> -static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, >>>> - unsigned long flag) >>>> -{ >>>> - unsigned long ret; >>>> - >>>> - spin_lock(&osb->osb_lock); >>>> - ret = osb->osb_flags & flag; >>>> - spin_unlock(&osb->osb_lock); >>>> - return ret; >>>> -} >>>> - >>>> static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, >>>> int hard) >>>> { >>>> diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c >>>> index 7d259ff..129b4db 100644 >>>> --- a/fs/ocfs2/super.c >>>> +++ b/fs/ocfs2/super.c >>>> @@ -1238,30 +1238,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type, >>>> return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); >>>> } >>>> >>>> -static void ocfs2_kill_sb(struct super_block *sb) >>>> -{ >>>> - struct ocfs2_super *osb = OCFS2_SB(sb); >>>> - >>>> - /* Failed mount? */ >>>> - if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) >>>> - goto out; >>>> - >>>> - /* Prevent further queueing of inode drop events */ >>>> - spin_lock(&dentry_list_lock); >>>> - ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); >>>> - spin_unlock(&dentry_list_lock); >>>> - /* Wait for work to finish and/or remove it */ >>>> - cancel_work_sync(&osb->dentry_lock_work); >>>> -out: >>>> - kill_block_super(sb); >>>> -} >>>> - >>>> static struct file_system_type ocfs2_fs_type = { >>>> .owner = THIS_MODULE, >>>> .name = "ocfs2", >>>> .mount = ocfs2_mount, >>>> - .kill_sb = ocfs2_kill_sb, >>>> - >>>> + .kill_sb = kill_block_super, >>>> .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, >>>> .next = NULL >>>> }; >>>> @@ -1934,12 +1915,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) >>>> >>>> debugfs_remove(osb->osb_ctxt); >>>> >>>> - /* >>>> - * Flush inode dropping work queue so that deletes are >>>> - * performed while the filesystem is still working >>>> - */ >>>> - ocfs2_drop_all_dl_inodes(osb); >>>> - >>>> /* Orphan scan should be stopped as early as possible */ >>>> ocfs2_orphan_scan_stop(osb); >>>> >>>> @@ -2274,9 +2249,6 @@ static int ocfs2_initialize_super(struct super_block *sb, >>>> INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); >>>> journal->j_state = OCFS2_JOURNAL_FREE; >>>> >>>> - INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); >>>> - osb->dentry_lock_list = NULL; >>>> - >>>> /* get some pseudo constants for clustersize bits */ >>>> osb->s_clustersize_bits = >>>> le32_to_cpu(di->id2.i_super.s_clustersize_bits); >>>> >>>> -- >>>> Goldwyn >> >> >> -- >> Goldwyn
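Goldwyn's argument rests on the usual discipline that cluster locks are always acquired in one fixed global order, which rules out the classic ABBA deadlock between two plain lock acquisitions. A generic illustration of that discipline is below, with invented lock names and nothing ocfs2-specific (build with cc -pthread). Jan's reply that follows shows why ordering alone does not settle the question here: the single downconvert thread itself becomes a resource in the dependency cycle.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Every thread honours the same order, A before B, so the ABBA pattern
 * (one thread holding A waiting for B while another holds B waiting
 * for A) cannot form. */
static void *worker(void *name)
{
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	printf("%s: holds A and B\n", (const char *)name);
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;
	pthread_create(&t1, NULL, worker, (void *)"thread 1");
	pthread_create(&t2, NULL, worker, (void *)"thread 2");
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}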
On Wed 15-01-14 17:17:55, Goldwyn Rodrigues wrote: > On 01/15/2014 09:53 AM, Jan Kara wrote: > >On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: > >>On 01/15/2014 07:33 AM, Jan Kara wrote: > >>>On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: > >>>>The following patches are reverted in this patch because these > >>>>patches caused regression in the unlink() calls. > >>>> > >>>>ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping > >>>>of dentry lock to ocfs2_wq > >>>>f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount > >>>>5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in > >>>>ocfs2_kill_sb on a failed mount > >>>> > >>>>The regression is caused because these patches delay the iput() in case > >>>>of dentry unlocks. This also delays the unlocking of the open lockres. > >>>>The open lockresource is required to test if the inode can be wiped from > >>>>disk on not. When the deleting node does not get the open lock, it marks > >>>>it as orphan (even though it is not in use by another node/process) > >>>>and causes a journal checkpoint. This delays operations following the > >>>>inode eviction. This also moves the inode to the orphaned inode > >>>>which further causes more I/O and a lot of unneccessary orphans. > >>>> > >>>>As for the fix, I tried reproducing the problem by using quotas and enabling > >>>>lockdep, but the issue could not be reproduced. > >>> So I was thinking about this for a while trying to remember... What it > >>>really boils down to is whether it is OK to call ocfs2_delete_inode() from > >>>DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all > >>>kinds of interesting cluster locks meaning that the downconvert thread can > >>>block waiting to acquire some cluster lock. That seems like asking for > >>>trouble to me (i.e., what if the node holding the lock we need from > >>>downconvert thread needs a lock from us first?) but maybe it is OK these > >>>days. > >>> > >> > >>The only lock it tries to take is the "inode lock" resource, which > >>seems to be fine to take. > > Why is it obviously fine? Some other node might be holding the inode lock > >and still write to the inode. If that needs a cluster lock for block > >allocation and that allocation cluster lock is currently hold by our node, > >we are screwed. Aren't we? > > > > No. If another thread is using the allocation cluster lock implies > we already have the inode lock. Cluster locks are also taken in a > strict order in order to avoid ABBA locking. I agree with what you wrote above but it's not exactly what I'm talking about. What seems to be possible is: NODE 1 NODE2 holds dentry lock for 'foo' holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE dquot_initialize(bar) ocfs2_dquot_acquire() ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) ... downconvert thread (triggered from another node or a different process from NODE2) ocfs2_dentry_post_unlock() ... iput(foo) ocfs2_evict_inode(foo) ocfs2_clear_inode(foo) dquot_drop(inode) ... ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) - blocks finds we need more space in quota file ... ocfs2_extend_no_holes() ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) - deadlocks waiting for downconvert thread However the positive part of this is that the race I was originally afraid of - when ocfs2_delete_inode() would be triggered from ocfs2_dentry_post_unlock() - isn't possible because ocfs2 seems to keep invariant that node can cache dentry for 'foo' without holding inode lock for 'foo' only if foo->i_nlink > 0. 
Thus iput() from downconvert thread should never go into ocfs2_delete_inode() (however it would be nice to assert this in ocfs2_dentry_post_unlock() for a peace of mind). Since the deadlock seems to be quota specific, it should be possible postpone just the quota processing for the workqueue. It isn't completely trivial because we still have to cleanup inode quota pointers but it should be doable. I'll try to have a look at this tomorrow. Honza
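The cycle Jan lays out above can be modelled in userspace with two mutexes, one standing in for the quota file's cluster lock (USER_QUOTA_SYSTEM_INODE) and one for the global bitmap's (GLOBAL_BITMAP_SYSTEM_INODE), together with the rule that only the single downconvert thread ever gives the bitmap lock back. This is a deliberately crude sketch, not ocfs2 code; all names are invented, the sleeps only sequence the demo, and running it (cc -pthread) simply hangs, which is the point.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t quota_file_lock    = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t global_bitmap_lock = PTHREAD_MUTEX_INITIALIZER;

/* NODE1's lone downconvert thread: it is the only context that can give
 * the bitmap lock back, but the deferred iput() work it runs first needs
 * the quota file lock (dquot_drop). */
static void *downconvert_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&global_bitmap_lock);	/* lock cached on NODE1 */
	sleep(1);					/* crude sequencing */
	printf("ocfs2dc: iput() needs the quota file lock...\n");
	pthread_mutex_lock(&quota_file_lock);		/* blocks: NODE2 holds it */
	/* never reached: only here would the bitmap lock be released */
	pthread_mutex_unlock(&quota_file_lock);
	pthread_mutex_unlock(&global_bitmap_lock);
	return NULL;
}

/* NODE2's dquot_acquire() path: holds the quota file lock, then needs the
 * global bitmap to extend the quota file. */
static void *quota_user_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&quota_file_lock);
	sleep(2);					/* crude sequencing */
	printf("NODE2: extending quota file, needs the global bitmap...\n");
	pthread_mutex_lock(&global_bitmap_lock);	/* blocks: only ocfs2dc can release it */
	pthread_mutex_unlock(&global_bitmap_lock);
	pthread_mutex_unlock(&quota_file_lock);
	return NULL;
}

int main(void)
{
	pthread_t dc, qu;
	pthread_create(&dc, NULL, downconvert_thread, NULL);
	pthread_create(&qu, NULL, quota_user_thread, NULL);
	/* Neither join returns: each side waits for a lock the other will
	 * only release after getting the lock it already holds. */
	pthread_join(dc, NULL);
	pthread_join(qu, NULL);
	return 0;
}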
On 01/15/2014 08:47 PM, Jan Kara wrote: > On Wed 15-01-14 17:17:55, Goldwyn Rodrigues wrote: >> On 01/15/2014 09:53 AM, Jan Kara wrote: >>> On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: >>>> On 01/15/2014 07:33 AM, Jan Kara wrote: >>>>> On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: >>>>>> The following patches are reverted in this patch because these >>>>>> patches caused regression in the unlink() calls. >>>>>> >>>>>> ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping >>>>>> of dentry lock to ocfs2_wq >>>>>> f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount >>>>>> 5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in >>>>>> ocfs2_kill_sb on a failed mount >>>>>> >>>>>> The regression is caused because these patches delay the iput() in case >>>>>> of dentry unlocks. This also delays the unlocking of the open lockres. >>>>>> The open lockresource is required to test if the inode can be wiped from >>>>>> disk on not. When the deleting node does not get the open lock, it marks >>>>>> it as orphan (even though it is not in use by another node/process) >>>>>> and causes a journal checkpoint. This delays operations following the >>>>>> inode eviction. This also moves the inode to the orphaned inode >>>>>> which further causes more I/O and a lot of unneccessary orphans. >>>>>> >>>>>> As for the fix, I tried reproducing the problem by using quotas and enabling >>>>>> lockdep, but the issue could not be reproduced. >>>>> So I was thinking about this for a while trying to remember... What it >>>>> really boils down to is whether it is OK to call ocfs2_delete_inode() from >>>>> DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all >>>>> kinds of interesting cluster locks meaning that the downconvert thread can >>>>> block waiting to acquire some cluster lock. That seems like asking for >>>>> trouble to me (i.e., what if the node holding the lock we need from >>>>> downconvert thread needs a lock from us first?) but maybe it is OK these >>>>> days. >>>>> >>>> >>>> The only lock it tries to take is the "inode lock" resource, which >>>> seems to be fine to take. >>> Why is it obviously fine? Some other node might be holding the inode lock >>> and still write to the inode. If that needs a cluster lock for block >>> allocation and that allocation cluster lock is currently hold by our node, >>> we are screwed. Aren't we? >>> >> >> No. If another thread is using the allocation cluster lock implies >> we already have the inode lock. Cluster locks are also taken in a >> strict order in order to avoid ABBA locking. > I agree with what you wrote above but it's not exactly what I'm talking > about. What seems to be possible is: > NODE 1 NODE2 > holds dentry lock for 'foo' > holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE > dquot_initialize(bar) > ocfs2_dquot_acquire() > ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > ... > downconvert thread (triggered from another > node or a different process from NODE2) > ocfs2_dentry_post_unlock() > ... > iput(foo) > ocfs2_evict_inode(foo) > ocfs2_clear_inode(foo) > dquot_drop(inode) > ... > ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > - blocks > finds we need more space in > quota file > ... > ocfs2_extend_no_holes() > ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) > - deadlocks waiting for > downconvert thread > Thanks. This explains the situation much better. Whats stopping NODE1 from releasing the GLOBAL_BITMAP_SYSTEM_INODE? 
(to break from hold and wait condition) > However the positive part of this is that the race I was originally afraid > of - when ocfs2_delete_inode() would be triggered from > ocfs2_dentry_post_unlock() - isn't possible because ocfs2 seems to keep > invariant that node can cache dentry for 'foo' without holding inode lock > for 'foo' only if foo->i_nlink > 0. Thus iput() from downconvert thread > should never go into ocfs2_delete_inode() (however it would be nice to > assert this in ocfs2_dentry_post_unlock() for a peace of mind). No, ocfs2_dentry_convert_worker sets OCFS2_INODE_MAYBE_ORPHANED. So, ocfs2_delete_inode() is called even if i_nlink > 0. > > Since the deadlock seems to be quota specific, it should be possible > postpone just the quota processing for the workqueue. It isn't completely > trivial because we still have to cleanup inode quota pointers but it should > be doable. I'll try to have a look at this tomorrow. Thanks! We need to work out a different way in order to fix this so that open locks are not delayed and does not hurt unlink performance.
On Thu 16-01-14 07:35:58, Goldwyn Rodrigues wrote: > On 01/15/2014 08:47 PM, Jan Kara wrote: > >On Wed 15-01-14 17:17:55, Goldwyn Rodrigues wrote: > >>On 01/15/2014 09:53 AM, Jan Kara wrote: > >>>On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: > >>>>On 01/15/2014 07:33 AM, Jan Kara wrote: > >>>>>On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: > >>>>>>The following patches are reverted in this patch because these > >>>>>>patches caused regression in the unlink() calls. > >>>>>> > >>>>>>ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping > >>>>>>of dentry lock to ocfs2_wq > >>>>>>f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount > >>>>>>5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in > >>>>>>ocfs2_kill_sb on a failed mount > >>>>>> > >>>>>>The regression is caused because these patches delay the iput() in case > >>>>>>of dentry unlocks. This also delays the unlocking of the open lockres. > >>>>>>The open lockresource is required to test if the inode can be wiped from > >>>>>>disk on not. When the deleting node does not get the open lock, it marks > >>>>>>it as orphan (even though it is not in use by another node/process) > >>>>>>and causes a journal checkpoint. This delays operations following the > >>>>>>inode eviction. This also moves the inode to the orphaned inode > >>>>>>which further causes more I/O and a lot of unneccessary orphans. > >>>>>> > >>>>>>As for the fix, I tried reproducing the problem by using quotas and enabling > >>>>>>lockdep, but the issue could not be reproduced. > >>>>> So I was thinking about this for a while trying to remember... What it > >>>>>really boils down to is whether it is OK to call ocfs2_delete_inode() from > >>>>>DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all > >>>>>kinds of interesting cluster locks meaning that the downconvert thread can > >>>>>block waiting to acquire some cluster lock. That seems like asking for > >>>>>trouble to me (i.e., what if the node holding the lock we need from > >>>>>downconvert thread needs a lock from us first?) but maybe it is OK these > >>>>>days. > >>>>> > >>>> > >>>>The only lock it tries to take is the "inode lock" resource, which > >>>>seems to be fine to take. > >>> Why is it obviously fine? Some other node might be holding the inode lock > >>>and still write to the inode. If that needs a cluster lock for block > >>>allocation and that allocation cluster lock is currently hold by our node, > >>>we are screwed. Aren't we? > >>> > >> > >>No. If another thread is using the allocation cluster lock implies > >>we already have the inode lock. Cluster locks are also taken in a > >>strict order in order to avoid ABBA locking. > > I agree with what you wrote above but it's not exactly what I'm talking > >about. What seems to be possible is: > >NODE 1 NODE2 > >holds dentry lock for 'foo' > >holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE > > dquot_initialize(bar) > > ocfs2_dquot_acquire() > > ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > > ... > >downconvert thread (triggered from another > >node or a different process from NODE2) > > ocfs2_dentry_post_unlock() > > ... > > iput(foo) > > ocfs2_evict_inode(foo) > > ocfs2_clear_inode(foo) > > dquot_drop(inode) > > ... > > ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > > - blocks > > finds we need more space in > > quota file > > ... > > ocfs2_extend_no_holes() > > ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) > > - deadlocks waiting for > > downconvert thread > > > > Thanks. 
This explains the situation much better. Whats stopping > NODE1 from releasing the GLOBAL_BITMAP_SYSTEM_INODE? (to break from > hold and wait condition) The fact that to release the lock, it has to be downconverted. And there is only one downconvert thread (ocfs2dc) and that is busy waiting for USER_QUOTA_SYSTEM_INODE lock... > >However the positive part of this is that the race I was originally afraid > >of - when ocfs2_delete_inode() would be triggered from > >ocfs2_dentry_post_unlock() - isn't possible because ocfs2 seems to keep > >invariant that node can cache dentry for 'foo' without holding inode lock > >for 'foo' only if foo->i_nlink > 0. Thus iput() from downconvert thread > >should never go into ocfs2_delete_inode() (however it would be nice to > >assert this in ocfs2_dentry_post_unlock() for a peace of mind). > > No, ocfs2_dentry_convert_worker sets OCFS2_INODE_MAYBE_ORPHANED. So, > ocfs2_delete_inode() is called even if i_nlink > 0. Ah, too bad :(. Do you know why we do that? I have to admit inode deletion locking was always one of the trickier part of ocfs2 locking and I'm not sure I've fully appreciated all its details. > >Since the deadlock seems to be quota specific, it should be possible > >postpone just the quota processing for the workqueue. It isn't completely > >trivial because we still have to cleanup inode quota pointers but it should > >be doable. I'll try to have a look at this tomorrow. > > Thanks! We need to work out a different way in order to fix this so > that open locks are not delayed and does not hurt unlink > performance. Sure, I'm all for fixing this deadlock in a better way. Honza
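For the direction Jan mentioned earlier, keeping the immediate iput() but pushing only the quota cleanup out of the downconvert thread, one possible shape is sketched below. This is speculation, not the actual fix: the struct members and helper (dquot_drop_list, dquot_drop_lock, dquot_drop_work, ocfs2_drop_dquots_worker, the item struct) are invented for illustration, and as Jan notes a real patch would still have to clear the inode's dquot pointers and cope with umount ordering.

/* Speculative sketch only.  The idea: the downconvert-thread iput() path
 * detaches the inode's dquot references and queues them on a per-osb
 * list; ocfs2_wq later does the dqput(), which is the step that may take
 * the quota file cluster lock. */
struct ocfs2_dquot_drop_item {
	struct list_head	list;
	struct dquot		*usr_dquot;
	struct dquot		*grp_dquot;
};

static void ocfs2_drop_dquots_worker(struct work_struct *work)
{
	struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
					       dquot_drop_work);
	struct ocfs2_dquot_drop_item *item, *tmp;
	LIST_HEAD(drop_list);

	spin_lock(&osb->dquot_drop_lock);
	list_splice_init(&osb->dquot_drop_list, &drop_list);
	spin_unlock(&osb->dquot_drop_lock);

	list_for_each_entry_safe(item, tmp, &drop_list, list) {
		/* May block on USER_QUOTA_SYSTEM_INODE, but this runs in
		 * ocfs2_wq, not in the ocfs2dc thread. */
		dqput(item->usr_dquot);
		dqput(item->grp_dquot);
		kfree(item);
	}
}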
On 01/16/2014 08:02 AM, Jan Kara wrote: > On Thu 16-01-14 07:35:58, Goldwyn Rodrigues wrote: >> On 01/15/2014 08:47 PM, Jan Kara wrote: >>> On Wed 15-01-14 17:17:55, Goldwyn Rodrigues wrote: >>>> On 01/15/2014 09:53 AM, Jan Kara wrote: >>>>> On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: >>>>>> On 01/15/2014 07:33 AM, Jan Kara wrote: >>>>>>> On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: >>>>>>>> The following patches are reverted in this patch because these >>>>>>>> patches caused regression in the unlink() calls. >>>>>>>> >>>>>>>> ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping >>>>>>>> of dentry lock to ocfs2_wq >>>>>>>> f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount >>>>>>>> 5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in >>>>>>>> ocfs2_kill_sb on a failed mount >>>>>>>> >>>>>>>> The regression is caused because these patches delay the iput() in case >>>>>>>> of dentry unlocks. This also delays the unlocking of the open lockres. >>>>>>>> The open lockresource is required to test if the inode can be wiped from >>>>>>>> disk on not. When the deleting node does not get the open lock, it marks >>>>>>>> it as orphan (even though it is not in use by another node/process) >>>>>>>> and causes a journal checkpoint. This delays operations following the >>>>>>>> inode eviction. This also moves the inode to the orphaned inode >>>>>>>> which further causes more I/O and a lot of unneccessary orphans. >>>>>>>> >>>>>>>> As for the fix, I tried reproducing the problem by using quotas and enabling >>>>>>>> lockdep, but the issue could not be reproduced. >>>>>>> So I was thinking about this for a while trying to remember... What it >>>>>>> really boils down to is whether it is OK to call ocfs2_delete_inode() from >>>>>>> DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all >>>>>>> kinds of interesting cluster locks meaning that the downconvert thread can >>>>>>> block waiting to acquire some cluster lock. That seems like asking for >>>>>>> trouble to me (i.e., what if the node holding the lock we need from >>>>>>> downconvert thread needs a lock from us first?) but maybe it is OK these >>>>>>> days. >>>>>>> >>>>>> >>>>>> The only lock it tries to take is the "inode lock" resource, which >>>>>> seems to be fine to take. >>>>> Why is it obviously fine? Some other node might be holding the inode lock >>>>> and still write to the inode. If that needs a cluster lock for block >>>>> allocation and that allocation cluster lock is currently hold by our node, >>>>> we are screwed. Aren't we? >>>>> >>>> >>>> No. If another thread is using the allocation cluster lock implies >>>> we already have the inode lock. Cluster locks are also taken in a >>>> strict order in order to avoid ABBA locking. >>> I agree with what you wrote above but it's not exactly what I'm talking >>> about. What seems to be possible is: >>> NODE 1 NODE2 >>> holds dentry lock for 'foo' >>> holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE >>> dquot_initialize(bar) >>> ocfs2_dquot_acquire() >>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) >>> ... >>> downconvert thread (triggered from another >>> node or a different process from NODE2) >>> ocfs2_dentry_post_unlock() >>> ... >>> iput(foo) >>> ocfs2_evict_inode(foo) >>> ocfs2_clear_inode(foo) >>> dquot_drop(inode) >>> ... >>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) >>> - blocks >>> finds we need more space in >>> quota file >>> ... 
>>> ocfs2_extend_no_holes() >>> ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) >>> - deadlocks waiting for >>> downconvert thread >>> >> >> Thanks. This explains the situation much better. Whats stopping >> NODE1 from releasing the GLOBAL_BITMAP_SYSTEM_INODE? (to break from >> hold and wait condition) > The fact that to release the lock, it has to be downconverted. And there > is only one downconvert thread (ocfs2dc) and that is busy waiting for > USER_QUOTA_SYSTEM_INODE lock... > >>> However the positive part of this is that the race I was originally afraid >>> of - when ocfs2_delete_inode() would be triggered from >>> ocfs2_dentry_post_unlock() - isn't possible because ocfs2 seems to keep >>> invariant that node can cache dentry for 'foo' without holding inode lock >>> for 'foo' only if foo->i_nlink > 0. Thus iput() from downconvert thread >>> should never go into ocfs2_delete_inode() (however it would be nice to >>> assert this in ocfs2_dentry_post_unlock() for a peace of mind). >> >> No, ocfs2_dentry_convert_worker sets OCFS2_INODE_MAYBE_ORPHANED. So, >> ocfs2_delete_inode() is called even if i_nlink > 0. > Ah, too bad :(. Do you know why we do that? I have to admit inode > deletion locking was always one of the trickier part of ocfs2 locking and > I'm not sure I've fully appreciated all its details. > Well, The file could be unlinked on another node while it is open on this node. inode->i_nlink is a node-local value (and is refreshed only later). >>> Since the deadlock seems to be quota specific, it should be possible >>> postpone just the quota processing for the workqueue. It isn't completely >>> trivial because we still have to cleanup inode quota pointers but it should >>> be doable. I'll try to have a look at this tomorrow. >> >> Thanks! We need to work out a different way in order to fix this so >> that open locks are not delayed and does not hurt unlink >> performance. > Sure, I'm all for fixing this deadlock in a better way.
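To make the shape of this deadlock concrete, here is a small, self-contained toy program (plain C with pthreads, not ocfs2 code) that reproduces the same hold-and-wait cycle in a single process. The two mutexes stand in for the GLOBAL_BITMAP_SYSTEM_INODE and USER_QUOTA_SYSTEM_INODE cluster locks, and the single "downconvert" thread models ocfs2dc being the only context that can release a cached cluster lock; everything else about it is an illustrative assumption.

/* toy_deadlock.c: build with  cc -pthread toy_deadlock.c  */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t bitmap_lock = PTHREAD_MUTEX_INITIALIZER; /* GLOBAL_BITMAP stand-in */
static pthread_mutex_t quota_lock  = PTHREAD_MUTEX_INITIALIZER; /* USER_QUOTA stand-in    */

/* "NODE2": takes the quota lock, then discovers it needs the global
 * bitmap to extend the quota file.  The bitmap is cached by "NODE1"
 * (the main thread) and only the downconvert worker can release it. */
static void *node2_quota_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&quota_lock);
	printf("node2: holds quota lock, needs to extend quota file\n");
	sleep(1);
	pthread_mutex_lock(&bitmap_lock);       /* blocks forever */
	printf("node2: got bitmap lock\n");     /* never reached  */
	return NULL;
}

/* "NODE1" downconvert worker: while processing a dentry-lock
 * downconvert it ends up in iput() -> dquot_drop(), which needs the
 * quota lock held by NODE2, so it never gets to release the bitmap. */
static void *node1_downconvert(void *arg)
{
	(void)arg;
	sleep(1);
	printf("dc: dentry downconvert -> iput() -> dquot_drop()\n");
	pthread_mutex_lock(&quota_lock);        /* blocks forever */
	pthread_mutex_unlock(&bitmap_lock);     /* the release node2 waits for */
	pthread_mutex_unlock(&quota_lock);
	return NULL;
}

int main(void)
{
	pthread_t dc, node2;

	pthread_mutex_lock(&bitmap_lock);       /* NODE1 caches the bitmap lock */
	pthread_create(&node2, NULL, node2_quota_path, NULL);
	pthread_create(&dc, NULL, node1_downconvert, NULL);
	pthread_join(dc, NULL);                 /* never returns: deadlock */
	pthread_join(node2, NULL);
	return 0;
}

The point of the toy is that adding more ordinary threads does not help: the only thread allowed to release bitmap_lock is itself blocked, which is exactly the single-ocfs2dc problem described above.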
On Thu 16-01-14 09:49:46, Goldwyn Rodrigues wrote: > On 01/16/2014 08:02 AM, Jan Kara wrote: > >On Thu 16-01-14 07:35:58, Goldwyn Rodrigues wrote: > >>On 01/15/2014 08:47 PM, Jan Kara wrote: > >>>On Wed 15-01-14 17:17:55, Goldwyn Rodrigues wrote: > >>>>On 01/15/2014 09:53 AM, Jan Kara wrote: > >>>>>On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: > >>>>>>On 01/15/2014 07:33 AM, Jan Kara wrote: > >>>>>>>On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: > >>>>>>>>The following patches are reverted in this patch because these > >>>>>>>>patches caused regression in the unlink() calls. > >>>>>>>> > >>>>>>>>ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping > >>>>>>>>of dentry lock to ocfs2_wq > >>>>>>>>f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount > >>>>>>>>5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in > >>>>>>>>ocfs2_kill_sb on a failed mount > >>>>>>>> > >>>>>>>>The regression is caused because these patches delay the iput() in case > >>>>>>>>of dentry unlocks. This also delays the unlocking of the open lockres. > >>>>>>>>The open lockresource is required to test if the inode can be wiped from > >>>>>>>>disk on not. When the deleting node does not get the open lock, it marks > >>>>>>>>it as orphan (even though it is not in use by another node/process) > >>>>>>>>and causes a journal checkpoint. This delays operations following the > >>>>>>>>inode eviction. This also moves the inode to the orphaned inode > >>>>>>>>which further causes more I/O and a lot of unneccessary orphans. > >>>>>>>> > >>>>>>>>As for the fix, I tried reproducing the problem by using quotas and enabling > >>>>>>>>lockdep, but the issue could not be reproduced. > >>>>>>> So I was thinking about this for a while trying to remember... What it > >>>>>>>really boils down to is whether it is OK to call ocfs2_delete_inode() from > >>>>>>>DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all > >>>>>>>kinds of interesting cluster locks meaning that the downconvert thread can > >>>>>>>block waiting to acquire some cluster lock. That seems like asking for > >>>>>>>trouble to me (i.e., what if the node holding the lock we need from > >>>>>>>downconvert thread needs a lock from us first?) but maybe it is OK these > >>>>>>>days. > >>>>>>> > >>>>>> > >>>>>>The only lock it tries to take is the "inode lock" resource, which > >>>>>>seems to be fine to take. > >>>>> Why is it obviously fine? Some other node might be holding the inode lock > >>>>>and still write to the inode. If that needs a cluster lock for block > >>>>>allocation and that allocation cluster lock is currently hold by our node, > >>>>>we are screwed. Aren't we? > >>>>> > >>>> > >>>>No. If another thread is using the allocation cluster lock implies > >>>>we already have the inode lock. Cluster locks are also taken in a > >>>>strict order in order to avoid ABBA locking. > >>> I agree with what you wrote above but it's not exactly what I'm talking > >>>about. What seems to be possible is: > >>>NODE 1 NODE2 > >>>holds dentry lock for 'foo' > >>>holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE > >>> dquot_initialize(bar) > >>> ocfs2_dquot_acquire() > >>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > >>> ... > >>>downconvert thread (triggered from another > >>>node or a different process from NODE2) > >>> ocfs2_dentry_post_unlock() > >>> ... > >>> iput(foo) > >>> ocfs2_evict_inode(foo) > >>> ocfs2_clear_inode(foo) > >>> dquot_drop(inode) > >>> ... 
> >>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > >>> - blocks > >>> finds we need more space in > >>> quota file > >>> ... > >>> ocfs2_extend_no_holes() > >>> ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) > >>> - deadlocks waiting for > >>> downconvert thread > >>> > >> > >>Thanks. This explains the situation much better. Whats stopping > >>NODE1 from releasing the GLOBAL_BITMAP_SYSTEM_INODE? (to break from > >>hold and wait condition) > > The fact that to release the lock, it has to be downconverted. And there > >is only one downconvert thread (ocfs2dc) and that is busy waiting for > >USER_QUOTA_SYSTEM_INODE lock... > > > >>>However the positive part of this is that the race I was originally afraid > >>>of - when ocfs2_delete_inode() would be triggered from > >>>ocfs2_dentry_post_unlock() - isn't possible because ocfs2 seems to keep > >>>invariant that node can cache dentry for 'foo' without holding inode lock > >>>for 'foo' only if foo->i_nlink > 0. Thus iput() from downconvert thread > >>>should never go into ocfs2_delete_inode() (however it would be nice to > >>>assert this in ocfs2_dentry_post_unlock() for a peace of mind). > >> > >>No, ocfs2_dentry_convert_worker sets OCFS2_INODE_MAYBE_ORPHANED. So, > >>ocfs2_delete_inode() is called even if i_nlink > 0. > > Ah, too bad :(. Do you know why we do that? I have to admit inode > >deletion locking was always one of the trickier part of ocfs2 locking and > >I'm not sure I've fully appreciated all its details. > > > > Well, The file could be unlinked on another node while it is open on > this node. inode->i_nlink is a node-local value (and is refreshed > only later). OK, but ocfs2_inode_is_valid_to_delete() will skip deleting inode if called from downconvert thread anyway (see the current == osb->dc_task check). So setting OCFS2_INODE_MAYBE_ORPHANED in ocfs2_dentry_convert_worker() seems to be futile? Also the node doing unlink fails to delete the inode only if ocfs2_query_inode_wipe() fails to upconvert the open lock. That means some other node is still holding open lock for the inode. Since ocfs2_query_inode_wipe() is called after unlink is finished and thus all dentry locks are dropped, we can be sure that dropping the dentry lock will never drop the last inode reference for an unlinked inode. So it seems to be safe to just remove OCFS2_INODE_MAYBE_ORPHANED from ocfs2_dentry_convert_worker()? Honza
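For readers without the tree at hand, the check Jan cites has roughly this shape (an abridged paraphrase of ocfs2_inode_is_valid_to_delete(), not the verbatim kernel source; the other sanity checks are elided):

static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
{
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);

	/*
	 * Never wipe an inode from the downconvert thread itself: the
	 * delete path takes further cluster locks and there is only
	 * one downconvert thread to service downconverts.
	 */
	if (current == osb->dc_task)
		return 0;	/* "not valid to delete from this context" */

	/* ... remaining checks elided ... */
	return 1;
}

A zero return makes ocfs2_delete_inode() skip the wipe, which is what prompts the question of whether setting OCFS2_INODE_MAYBE_ORPHANED in the convert worker buys anything.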
On 01/16/2014 11:13 AM, Jan Kara wrote: > On Thu 16-01-14 09:49:46, Goldwyn Rodrigues wrote: >> On 01/16/2014 08:02 AM, Jan Kara wrote: >>> On Thu 16-01-14 07:35:58, Goldwyn Rodrigues wrote: >>>> On 01/15/2014 08:47 PM, Jan Kara wrote: >>>>> On Wed 15-01-14 17:17:55, Goldwyn Rodrigues wrote: >>>>>> On 01/15/2014 09:53 AM, Jan Kara wrote: >>>>>>> On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: >>>>>>>> On 01/15/2014 07:33 AM, Jan Kara wrote: >>>>>>>>> On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: >>>>>>>>>> The following patches are reverted in this patch because these >>>>>>>>>> patches caused regression in the unlink() calls. >>>>>>>>>> >>>>>>>>>> ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping >>>>>>>>>> of dentry lock to ocfs2_wq >>>>>>>>>> f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount >>>>>>>>>> 5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in >>>>>>>>>> ocfs2_kill_sb on a failed mount >>>>>>>>>> >>>>>>>>>> The regression is caused because these patches delay the iput() in case >>>>>>>>>> of dentry unlocks. This also delays the unlocking of the open lockres. >>>>>>>>>> The open lockresource is required to test if the inode can be wiped from >>>>>>>>>> disk on not. When the deleting node does not get the open lock, it marks >>>>>>>>>> it as orphan (even though it is not in use by another node/process) >>>>>>>>>> and causes a journal checkpoint. This delays operations following the >>>>>>>>>> inode eviction. This also moves the inode to the orphaned inode >>>>>>>>>> which further causes more I/O and a lot of unneccessary orphans. >>>>>>>>>> >>>>>>>>>> As for the fix, I tried reproducing the problem by using quotas and enabling >>>>>>>>>> lockdep, but the issue could not be reproduced. >>>>>>>>> So I was thinking about this for a while trying to remember... What it >>>>>>>>> really boils down to is whether it is OK to call ocfs2_delete_inode() from >>>>>>>>> DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all >>>>>>>>> kinds of interesting cluster locks meaning that the downconvert thread can >>>>>>>>> block waiting to acquire some cluster lock. That seems like asking for >>>>>>>>> trouble to me (i.e., what if the node holding the lock we need from >>>>>>>>> downconvert thread needs a lock from us first?) but maybe it is OK these >>>>>>>>> days. >>>>>>>>> >>>>>>>> >>>>>>>> The only lock it tries to take is the "inode lock" resource, which >>>>>>>> seems to be fine to take. >>>>>>> Why is it obviously fine? Some other node might be holding the inode lock >>>>>>> and still write to the inode. If that needs a cluster lock for block >>>>>>> allocation and that allocation cluster lock is currently hold by our node, >>>>>>> we are screwed. Aren't we? >>>>>>> >>>>>> >>>>>> No. If another thread is using the allocation cluster lock implies >>>>>> we already have the inode lock. Cluster locks are also taken in a >>>>>> strict order in order to avoid ABBA locking. >>>>> I agree with what you wrote above but it's not exactly what I'm talking >>>>> about. What seems to be possible is: >>>>> NODE 1 NODE2 >>>>> holds dentry lock for 'foo' >>>>> holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE >>>>> dquot_initialize(bar) >>>>> ocfs2_dquot_acquire() >>>>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) >>>>> ... >>>>> downconvert thread (triggered from another >>>>> node or a different process from NODE2) >>>>> ocfs2_dentry_post_unlock() >>>>> ... 
>>>>> iput(foo) >>>>> ocfs2_evict_inode(foo) >>>>> ocfs2_clear_inode(foo) >>>>> dquot_drop(inode) >>>>> ... >>>>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) >>>>> - blocks >>>>> finds we need more space in >>>>> quota file >>>>> ... >>>>> ocfs2_extend_no_holes() >>>>> ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) >>>>> - deadlocks waiting for >>>>> downconvert thread >>>>> >>>> >>>> Thanks. This explains the situation much better. Whats stopping >>>> NODE1 from releasing the GLOBAL_BITMAP_SYSTEM_INODE? (to break from >>>> hold and wait condition) >>> The fact that to release the lock, it has to be downconverted. And there >>> is only one downconvert thread (ocfs2dc) and that is busy waiting for >>> USER_QUOTA_SYSTEM_INODE lock... >>> >>>>> However the positive part of this is that the race I was originally afraid >>>>> of - when ocfs2_delete_inode() would be triggered from >>>>> ocfs2_dentry_post_unlock() - isn't possible because ocfs2 seems to keep >>>>> invariant that node can cache dentry for 'foo' without holding inode lock >>>>> for 'foo' only if foo->i_nlink > 0. Thus iput() from downconvert thread >>>>> should never go into ocfs2_delete_inode() (however it would be nice to >>>>> assert this in ocfs2_dentry_post_unlock() for a peace of mind). >>>> >>>> No, ocfs2_dentry_convert_worker sets OCFS2_INODE_MAYBE_ORPHANED. So, >>>> ocfs2_delete_inode() is called even if i_nlink > 0. >>> Ah, too bad :(. Do you know why we do that? I have to admit inode >>> deletion locking was always one of the trickier part of ocfs2 locking and >>> I'm not sure I've fully appreciated all its details. >>> >> >> Well, The file could be unlinked on another node while it is open on >> this node. inode->i_nlink is a node-local value (and is refreshed >> only later). > OK, but ocfs2_inode_is_valid_to_delete() will skip deleting inode if > called from downconvert thread anyway (see the current == osb->dc_task > check). So setting OCFS2_INODE_MAYBE_ORPHANED in > ocfs2_dentry_convert_worker() seems to be futile? > > Also the node doing unlink fails to delete the inode only if > ocfs2_query_inode_wipe() fails to upconvert the open lock. That means some > other node is still holding open lock for the inode. Since > ocfs2_query_inode_wipe() is called after unlink is finished and thus all > dentry locks are dropped, we can be sure that dropping the dentry lock > will never drop the last inode reference for an unlinked inode. > > So it seems to be safe to just remove OCFS2_INODE_MAYBE_ORPHANED from > ocfs2_dentry_convert_worker()? > I just walked this path and got badgered ;) https://oss.oracle.com/pipermail/ocfs2-devel/2014-January/009509.html The problem is that if this node has the file open, and another node initiates the unlink, deleting the inode becomes the responsibility of the current node which has the file open after the last close. However, from the VFS point of view, would an inode be evicted if the file is open?
Hi Jan, thanks a lot for explaining the problem. Please see my comment below. On 01/16/2014 06:02 AM, Jan Kara wrote: > On Thu 16-01-14 07:35:58, Goldwyn Rodrigues wrote: >> On 01/15/2014 08:47 PM, Jan Kara wrote: >>> On Wed 15-01-14 17:17:55, Goldwyn Rodrigues wrote: >>>> On 01/15/2014 09:53 AM, Jan Kara wrote: >>>>> On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: >>>>>> On 01/15/2014 07:33 AM, Jan Kara wrote: >>>>>>> On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: >>>>>>>> The following patches are reverted in this patch because these >>>>>>>> patches caused regression in the unlink() calls. >>>>>>>> >>>>>>>> ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping >>>>>>>> of dentry lock to ocfs2_wq >>>>>>>> f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount >>>>>>>> 5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in >>>>>>>> ocfs2_kill_sb on a failed mount >>>>>>>> >>>>>>>> The regression is caused because these patches delay the iput() in case >>>>>>>> of dentry unlocks. This also delays the unlocking of the open lockres. >>>>>>>> The open lockresource is required to test if the inode can be wiped from >>>>>>>> disk on not. When the deleting node does not get the open lock, it marks >>>>>>>> it as orphan (even though it is not in use by another node/process) >>>>>>>> and causes a journal checkpoint. This delays operations following the >>>>>>>> inode eviction. This also moves the inode to the orphaned inode >>>>>>>> which further causes more I/O and a lot of unneccessary orphans. >>>>>>>> >>>>>>>> As for the fix, I tried reproducing the problem by using quotas and enabling >>>>>>>> lockdep, but the issue could not be reproduced. >>>>>>> So I was thinking about this for a while trying to remember... What it >>>>>>> really boils down to is whether it is OK to call ocfs2_delete_inode() from >>>>>>> DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all >>>>>>> kinds of interesting cluster locks meaning that the downconvert thread can >>>>>>> block waiting to acquire some cluster lock. That seems like asking for >>>>>>> trouble to me (i.e., what if the node holding the lock we need from >>>>>>> downconvert thread needs a lock from us first?) but maybe it is OK these >>>>>>> days. >>>>>>> >>>>>> The only lock it tries to take is the "inode lock" resource, which >>>>>> seems to be fine to take. >>>>> Why is it obviously fine? Some other node might be holding the inode lock >>>>> and still write to the inode. If that needs a cluster lock for block >>>>> allocation and that allocation cluster lock is currently hold by our node, >>>>> we are screwed. Aren't we? >>>>> >>>> No. If another thread is using the allocation cluster lock implies >>>> we already have the inode lock. Cluster locks are also taken in a >>>> strict order in order to avoid ABBA locking. >>> I agree with what you wrote above but it's not exactly what I'm talking >>> about. What seems to be possible is: >>> NODE 1 NODE2 >>> holds dentry lock for 'foo' >>> holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE >>> dquot_initialize(bar) >>> ocfs2_dquot_acquire() >>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) >>> ... >>> downconvert thread (triggered from another >>> node or a different process from NODE2) >>> ocfs2_dentry_post_unlock() >>> ... >>> iput(foo) >>> ocfs2_evict_inode(foo) >>> ocfs2_clear_inode(foo) >>> dquot_drop(inode) >>> ... >>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) >>> - blocks >>> finds we need more space in >>> quota file >>> ... 
>>> ocfs2_extend_no_holes() >>> ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) >>> - deadlocks waiting for >>> downconvert thread >>> >> Thanks. This explains the situation much better. Whats stopping >> NODE1 from releasing the GLOBAL_BITMAP_SYSTEM_INODE? (to break from >> hold and wait condition) > The fact that to release the lock, it has to be downconverted. And there > is only one downconvert thread (ocfs2dc) and that is busy waiting for > USER_QUOTA_SYSTEM_INODE lock... Yes that indeed seems to be a problem, thanks a lot for explaining :). I do not know much about quota's code but wondering if we can fix this problem by enforcing the lock ordering ? Can NODE2 check first if it needs space before getting lock on USER_QUOTA_SYSTEM_INODE. If it does then it should acquire GLOBAL_BITMAP_SYSTEM_INODE before acquiring lock on USER_QUOTA_SYSTEM_INODE ? (Basically we enforce GLOBAL_BITMAP_SYSTEM_INODE cannot be taken while node has USER_QUOTA_SYSTEM_INODE lock) > >>> However the positive part of this is that the race I was originally afraid >>> of - when ocfs2_delete_inode() would be triggered from >>> ocfs2_dentry_post_unlock() - isn't possible because ocfs2 seems to keep >>> invariant that node can cache dentry for 'foo' without holding inode lock >>> for 'foo' only if foo->i_nlink > 0. Thus iput() from downconvert thread >>> should never go into ocfs2_delete_inode() (however it would be nice to >>> assert this in ocfs2_dentry_post_unlock() for a peace of mind). >> No, ocfs2_dentry_convert_worker sets OCFS2_INODE_MAYBE_ORPHANED. So, >> ocfs2_delete_inode() is called even if i_nlink > 0. > Ah, too bad :(. Do you know why we do that? I have to admit inode > deletion locking was always one of the trickier part of ocfs2 locking and > I'm not sure I've fully appreciated all its details. > >>> Since the deadlock seems to be quota specific, it should be possible >>> postpone just the quota processing for the workqueue. It isn't completely >>> trivial because we still have to cleanup inode quota pointers but it should >>> be doable. I'll try to have a look at this tomorrow. >> Thanks! We need to work out a different way in order to fix this so >> that open locks are not delayed and does not hurt unlink >> performance. > Sure, I'm all for fixing this deadlock in a better way. > > Honza
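In code terms, the ordering Srinivas is suggesting would look something like the sketch below (purely illustrative: quota_file_may_need_space(), lock_global_bitmap() and friends are hypothetical placeholders, and the real acquire path in the ocfs2 quota code is structured differently):

static int ocfs2_acquire_quota_entry_proposed(struct super_block *sb)
{
	/*
	 * Proposed invariant: the GLOBAL_BITMAP_SYSTEM_INODE lock is
	 * never requested while the USER_QUOTA_SYSTEM_INODE lock is
	 * held.  Decide up front whether the quota file may need to
	 * grow and, if so, take the bitmap lock first.
	 */
	int may_grow = quota_file_may_need_space(sb);	/* hypothetical */

	if (may_grow)
		lock_global_bitmap(sb);			/* hypothetical */
	lock_user_quota(sb);				/* hypothetical */

	/* ... find or allocate the quota structure ... */

	unlock_user_quota(sb);				/* hypothetical */
	if (may_grow)
		unlock_global_bitmap(sb);		/* hypothetical */
	return 0;
}

As Jan notes in his reply further down, the difficulty is the "decide up front" step: the generic allocation path has no such hook, so quota would have to be special-cased in the allocator.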
On Thu 16-01-14 11:47:49, Goldwyn Rodrigues wrote: > On 01/16/2014 11:13 AM, Jan Kara wrote: > >On Thu 16-01-14 09:49:46, Goldwyn Rodrigues wrote: > >>On 01/16/2014 08:02 AM, Jan Kara wrote: > >>>On Thu 16-01-14 07:35:58, Goldwyn Rodrigues wrote: > >>>>On 01/15/2014 08:47 PM, Jan Kara wrote: > >>>>>On Wed 15-01-14 17:17:55, Goldwyn Rodrigues wrote: > >>>>>>On 01/15/2014 09:53 AM, Jan Kara wrote: > >>>>>>>On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: > >>>>>>>>On 01/15/2014 07:33 AM, Jan Kara wrote: > >>>>>>>>>On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: > >>>>>>>>>>The following patches are reverted in this patch because these > >>>>>>>>>>patches caused regression in the unlink() calls. > >>>>>>>>>> > >>>>>>>>>>ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping > >>>>>>>>>>of dentry lock to ocfs2_wq > >>>>>>>>>>f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount > >>>>>>>>>>5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in > >>>>>>>>>>ocfs2_kill_sb on a failed mount > >>>>>>>>>> > >>>>>>>>>>The regression is caused because these patches delay the iput() in case > >>>>>>>>>>of dentry unlocks. This also delays the unlocking of the open lockres. > >>>>>>>>>>The open lockresource is required to test if the inode can be wiped from > >>>>>>>>>>disk on not. When the deleting node does not get the open lock, it marks > >>>>>>>>>>it as orphan (even though it is not in use by another node/process) > >>>>>>>>>>and causes a journal checkpoint. This delays operations following the > >>>>>>>>>>inode eviction. This also moves the inode to the orphaned inode > >>>>>>>>>>which further causes more I/O and a lot of unneccessary orphans. > >>>>>>>>>> > >>>>>>>>>>As for the fix, I tried reproducing the problem by using quotas and enabling > >>>>>>>>>>lockdep, but the issue could not be reproduced. > >>>>>>>>> So I was thinking about this for a while trying to remember... What it > >>>>>>>>>really boils down to is whether it is OK to call ocfs2_delete_inode() from > >>>>>>>>>DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all > >>>>>>>>>kinds of interesting cluster locks meaning that the downconvert thread can > >>>>>>>>>block waiting to acquire some cluster lock. That seems like asking for > >>>>>>>>>trouble to me (i.e., what if the node holding the lock we need from > >>>>>>>>>downconvert thread needs a lock from us first?) but maybe it is OK these > >>>>>>>>>days. > >>>>>>>>> > >>>>>>>> > >>>>>>>>The only lock it tries to take is the "inode lock" resource, which > >>>>>>>>seems to be fine to take. > >>>>>>> Why is it obviously fine? Some other node might be holding the inode lock > >>>>>>>and still write to the inode. If that needs a cluster lock for block > >>>>>>>allocation and that allocation cluster lock is currently hold by our node, > >>>>>>>we are screwed. Aren't we? > >>>>>>> > >>>>>> > >>>>>>No. If another thread is using the allocation cluster lock implies > >>>>>>we already have the inode lock. Cluster locks are also taken in a > >>>>>>strict order in order to avoid ABBA locking. > >>>>> I agree with what you wrote above but it's not exactly what I'm talking > >>>>>about. What seems to be possible is: > >>>>>NODE 1 NODE2 > >>>>>holds dentry lock for 'foo' > >>>>>holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE > >>>>> dquot_initialize(bar) > >>>>> ocfs2_dquot_acquire() > >>>>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > >>>>> ... 
> >>>>>downconvert thread (triggered from another > >>>>>node or a different process from NODE2) > >>>>> ocfs2_dentry_post_unlock() > >>>>> ... > >>>>> iput(foo) > >>>>> ocfs2_evict_inode(foo) > >>>>> ocfs2_clear_inode(foo) > >>>>> dquot_drop(inode) > >>>>> ... > >>>>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > >>>>> - blocks > >>>>> finds we need more space in > >>>>> quota file > >>>>> ... > >>>>> ocfs2_extend_no_holes() > >>>>> ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) > >>>>> - deadlocks waiting for > >>>>> downconvert thread > >>>>> > >>>> > >>>>Thanks. This explains the situation much better. Whats stopping > >>>>NODE1 from releasing the GLOBAL_BITMAP_SYSTEM_INODE? (to break from > >>>>hold and wait condition) > >>> The fact that to release the lock, it has to be downconverted. And there > >>>is only one downconvert thread (ocfs2dc) and that is busy waiting for > >>>USER_QUOTA_SYSTEM_INODE lock... > >>> > >>>>>However the positive part of this is that the race I was originally afraid > >>>>>of - when ocfs2_delete_inode() would be triggered from > >>>>>ocfs2_dentry_post_unlock() - isn't possible because ocfs2 seems to keep > >>>>>invariant that node can cache dentry for 'foo' without holding inode lock > >>>>>for 'foo' only if foo->i_nlink > 0. Thus iput() from downconvert thread > >>>>>should never go into ocfs2_delete_inode() (however it would be nice to > >>>>>assert this in ocfs2_dentry_post_unlock() for a peace of mind). > >>>> > >>>>No, ocfs2_dentry_convert_worker sets OCFS2_INODE_MAYBE_ORPHANED. So, > >>>>ocfs2_delete_inode() is called even if i_nlink > 0. > >>> Ah, too bad :(. Do you know why we do that? I have to admit inode > >>>deletion locking was always one of the trickier part of ocfs2 locking and > >>>I'm not sure I've fully appreciated all its details. > >>> > >> > >>Well, The file could be unlinked on another node while it is open on > >>this node. inode->i_nlink is a node-local value (and is refreshed > >>only later). > > OK, but ocfs2_inode_is_valid_to_delete() will skip deleting inode if > >called from downconvert thread anyway (see the current == osb->dc_task > >check). So setting OCFS2_INODE_MAYBE_ORPHANED in > >ocfs2_dentry_convert_worker() seems to be futile? > > > >Also the node doing unlink fails to delete the inode only if > >ocfs2_query_inode_wipe() fails to upconvert the open lock. That means some > >other node is still holding open lock for the inode. Since > >ocfs2_query_inode_wipe() is called after unlink is finished and thus all > >dentry locks are dropped, we can be sure that dropping the dentry lock > >will never drop the last inode reference for an unlinked inode. > > > >So it seems to be safe to just remove OCFS2_INODE_MAYBE_ORPHANED from > >ocfs2_dentry_convert_worker()? > > > > I just walked this path and got badgered ;) > https://oss.oracle.com/pipermail/ocfs2-devel/2014-January/009509.html > > The problem is that if this node has the file open, and another node > initiates the unlink, deleting the inode becomes the responsibility > of the current node which has the file open after the last close. Yes, in the mean time I read more code around that flag and understood the logic - either we know the inode cannot be deleted because we have a dentry lock on some dentry pointing to it, or we flag the inode with OCFS2_INODE_MAYBE_ORPHANED to check during inode removal from memory whether it doesn't need to be deleted. > However, from the VFS point of view, would an inode be evicted if > the file is open? Certainly not. 
An inode is evicted only when the last reference to it is dropped; in particular, that means only after all file pointers to it have been closed. I wrote some patches to see how things would work out, and it seems postponing just the quota cleanup should be doable. Honza
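In VFS terms, the answer above amounts to the following (a simplified paraphrase of iput() from fs/inode.c, not the verbatim source): every open struct file holds a reference on its inode, so the eviction path can only run once the last of those references is put.

void iput(struct inode *inode)
{
	if (!inode)
		return;
	/*
	 * i_count cannot reach zero while a struct file still
	 * references the inode; only the final iput() falls through
	 * to eviction.
	 */
	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
		iput_final(inode);	/* -> evict() -> sb->s_op->evict_inode() */
}

So for ocfs2, ->evict_inode() (and hence ocfs2_delete_inode()) for an inode that is open locally runs only after the last local close, which is why deletion responsibility can land on the node that had the file open.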
On Thu 16-01-14 11:19:47, Srinivas Eeda wrote: > Hi Jan, > thanks a lot for explaining the problem. Please see my comment below. > > On 01/16/2014 06:02 AM, Jan Kara wrote: > >On Thu 16-01-14 07:35:58, Goldwyn Rodrigues wrote: > >>On 01/15/2014 08:47 PM, Jan Kara wrote: > >>>On Wed 15-01-14 17:17:55, Goldwyn Rodrigues wrote: > >>>>On 01/15/2014 09:53 AM, Jan Kara wrote: > >>>>>On Wed 15-01-14 09:32:39, Goldwyn Rodrigues wrote: > >>>>>>On 01/15/2014 07:33 AM, Jan Kara wrote: > >>>>>>>On Wed 15-01-14 06:51:51, Goldwyn Rodrigues wrote: > >>>>>>>>The following patches are reverted in this patch because these > >>>>>>>>patches caused regression in the unlink() calls. > >>>>>>>> > >>>>>>>>ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping > >>>>>>>>of dentry lock to ocfs2_wq > >>>>>>>>f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount > >>>>>>>>5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in > >>>>>>>>ocfs2_kill_sb on a failed mount > >>>>>>>> > >>>>>>>>The regression is caused because these patches delay the iput() in case > >>>>>>>>of dentry unlocks. This also delays the unlocking of the open lockres. > >>>>>>>>The open lockresource is required to test if the inode can be wiped from > >>>>>>>>disk on not. When the deleting node does not get the open lock, it marks > >>>>>>>>it as orphan (even though it is not in use by another node/process) > >>>>>>>>and causes a journal checkpoint. This delays operations following the > >>>>>>>>inode eviction. This also moves the inode to the orphaned inode > >>>>>>>>which further causes more I/O and a lot of unneccessary orphans. > >>>>>>>> > >>>>>>>>As for the fix, I tried reproducing the problem by using quotas and enabling > >>>>>>>>lockdep, but the issue could not be reproduced. > >>>>>>> So I was thinking about this for a while trying to remember... What it > >>>>>>>really boils down to is whether it is OK to call ocfs2_delete_inode() from > >>>>>>>DLM downconvert thread. Bear in mind that ocfs2_delete_inode() takes all > >>>>>>>kinds of interesting cluster locks meaning that the downconvert thread can > >>>>>>>block waiting to acquire some cluster lock. That seems like asking for > >>>>>>>trouble to me (i.e., what if the node holding the lock we need from > >>>>>>>downconvert thread needs a lock from us first?) but maybe it is OK these > >>>>>>>days. > >>>>>>> > >>>>>>The only lock it tries to take is the "inode lock" resource, which > >>>>>>seems to be fine to take. > >>>>> Why is it obviously fine? Some other node might be holding the inode lock > >>>>>and still write to the inode. If that needs a cluster lock for block > >>>>>allocation and that allocation cluster lock is currently hold by our node, > >>>>>we are screwed. Aren't we? > >>>>> > >>>>No. If another thread is using the allocation cluster lock implies > >>>>we already have the inode lock. Cluster locks are also taken in a > >>>>strict order in order to avoid ABBA locking. > >>> I agree with what you wrote above but it's not exactly what I'm talking > >>>about. What seems to be possible is: > >>>NODE 1 NODE2 > >>>holds dentry lock for 'foo' > >>>holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE > >>> dquot_initialize(bar) > >>> ocfs2_dquot_acquire() > >>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > >>> ... > >>>downconvert thread (triggered from another > >>>node or a different process from NODE2) > >>> ocfs2_dentry_post_unlock() > >>> ... 
> >>> iput(foo) > >>> ocfs2_evict_inode(foo) > >>> ocfs2_clear_inode(foo) > >>> dquot_drop(inode) > >>> ... > >>> ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) > >>> - blocks > >>> finds we need more space in > >>> quota file > >>> ... > >>> ocfs2_extend_no_holes() > >>> ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) > >>> - deadlocks waiting for > >>> downconvert thread > >>> > >>Thanks. This explains the situation much better. Whats stopping > >>NODE1 from releasing the GLOBAL_BITMAP_SYSTEM_INODE? (to break from > >>hold and wait condition) > > The fact that to release the lock, it has to be downconverted. And there > >is only one downconvert thread (ocfs2dc) and that is busy waiting for > >USER_QUOTA_SYSTEM_INODE lock... > Yes that indeed seems to be a problem, thanks a lot for explaining > :). I do not know much about quota's code but wondering if we can > fix this problem by enforcing the lock ordering ? Can NODE2 check > first if it needs space before getting lock on > USER_QUOTA_SYSTEM_INODE. If it does then it should acquire > GLOBAL_BITMAP_SYSTEM_INODE before acquiring lock on > USER_QUOTA_SYSTEM_INODE ? (Basically we enforce > GLOBAL_BITMAP_SYSTEM_INODE cannot be taken while node has > USER_QUOTA_SYSTEM_INODE lock) Well, that is impractical to say the least - we would have to rewrite the allocation path to handle specifically quota code which would get the GLOBAL_BITMAP_SYSTEM_INODE lock at a different place than any other code. I think it is better to postpone dropping quota references - that leaves the pain localized into the code handling inode eviction. Honza
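For orientation, the kind of change Jan is describing (postponing just the quota part of inode teardown) could take roughly the following shape; this is an illustrative sketch only, not Jan's actual patches, and ocfs2_defer_dquot_drop() is a hypothetical helper that would queue the cluster-locking part of the drop to ocfs2_wq:

	/* in ocfs2_clear_inode(), instead of an unconditional dquot_drop(): */
	if (current == osb->dc_task)
		/*
		 * The downconvert thread must never block on the
		 * USER_QUOTA_SYSTEM_INODE cluster lock, so hand the
		 * actual dquot release off to the workqueue.
		 */
		ocfs2_defer_dquot_drop(inode);	/* hypothetical */
	else
		dquot_drop(inode);

The rest of the eviction work can then proceed in the downconvert thread without ever taking the quota cluster lock, which keeps the pain localized to the inode eviction path as Jan suggests.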
--- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -37,7 +37,6 @@ #include "dlmglue.h" #include "file.h" #include "inode.h" -#include "super.h" #include "ocfs2_trace.h" void ocfs2_dentry_attach_gen(struct dentry *dentry) @@ -346,52 +345,6 @@ out_attach: return ret; } -DEFINE_SPINLOCK(dentry_list_lock); - -/* We limit the number of dentry locks to drop in one go. We have - * this limit so that we don't starve other users of ocfs2_wq. */ -#define DL_INODE_DROP_COUNT 64 - -/* Drop inode references from dentry locks */ -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) -{ - struct ocfs2_dentry_lock *dl; - - spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { - dl = osb->dentry_lock_list; - osb->dentry_lock_list = dl->dl_next; - spin_unlock(&dentry_list_lock); - iput(dl->dl_inode); - kfree(dl); - spin_lock(&dentry_list_lock); - } - spin_unlock(&dentry_list_lock); -} - -void ocfs2_drop_dl_inodes(struct work_struct *work) -{ - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); - - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); - /* - * Don't queue dropping if umount is in progress. We flush the - * list in ocfs2_dismount_volume - */ - spin_lock(&dentry_list_lock); - if (osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - spin_unlock(&dentry_list_lock); -} - -/* Flush the whole work queue */ -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) -{ - __ocfs2_drop_dl_inodes(osb, -1); -} - /* * ocfs2_dentry_iput() and friends. * @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { + iput(dl->dl_inode); ocfs2_simple_drop_lockres(osb, &dl->dl_lockres); ocfs2_lock_res_free(&dl->dl_lockres); - - /* We leave dropping of inode reference to ocfs2_wq as that can - * possibly lead to inode deletion which gets tricky */ - spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list && - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, &osb->dentry_lock_work); - dl->dl_next = osb->dentry_lock_list; - osb->dentry_lock_list = dl; - spin_unlock(&dentry_list_lock); + kfree(dl); } void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { - int unlock; + int unlock = 0; BUG_ON(dl->dl_count == 0); diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index b79eff7..55f5889 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -29,13 +29,8 @@ extern const struct dentry_operations ocfs2_dentry_ops; struct ocfs2_dentry_lock { - /* Use count of dentry lock */ unsigned int dl_count; - union { - /* Linked list of dentry locks to release */ - struct ocfs2_dentry_lock *dl_next; - u64 dl_parent_blkno; - }; + u64 dl_parent_blkno; /* * The ocfs2_dentry_lock keeps an inode reference until @@ -49,14 +44,9 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); -extern spinlock_t dentry_list_lock; - void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); -void ocfs2_drop_dl_inodes(struct work_struct *work); -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); - struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 553f53c..6be4e1d 100644 --- a/fs/ocfs2/ocfs2.h +++ 
b/fs/ocfs2/ocfs2.h @@ -274,19 +274,16 @@ enum ocfs2_mount_options OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 - -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; struct ocfs2_replay_map; struct ocfs2_quota_recovery; -struct ocfs2_dentry_lock; struct ocfs2_super { struct task_struct *commit_task; @@ -414,11 +411,6 @@ struct ocfs2_super struct list_head blocked_lock_list; unsigned long blocked_lock_count; - /* List of dentry locks to release. Anyone can add locks to - * the list, ocfs2_wq processes the list */ - struct ocfs2_dentry_lock *dentry_lock_list; - struct work_struct dentry_lock_work; - wait_queue_head_t osb_mount_event; /* Truncate log info */ @@ -579,18 +571,6 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } - -static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, - unsigned long flag) -{ - unsigned long ret; - - spin_lock(&osb->osb_lock); - ret = osb->osb_flags & flag; - spin_unlock(&osb->osb_lock); - return ret; -} - static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7d259ff..129b4db 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1238,30 +1238,11 @@ static struct dentry *ocfs2_mount(struct file_system_type *fs_type, return mount_bdev(fs_type, flags, dev_name, data, ocfs2_fill_super); } -static void ocfs2_kill_sb(struct super_block *sb) -{ - struct ocfs2_super *osb = OCFS2_SB(sb); - - /* Failed mount? */ - if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) - goto out; - - /* Prevent further queueing of inode drop events */ - spin_lock(&dentry_list_lock); - ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); - spin_unlock(&dentry_list_lock); - /* Wait for work to finish and/or remove it */ - cancel_work_sync(&osb->dentry_lock_work); -out: - kill_block_super(sb); -} - static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .mount = ocfs2_mount, - .kill_sb = ocfs2_kill_sb, - + .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1934,12 +1915,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); - /* - * Flush inode dropping work queue so that deletes are - * performed while the filesystem is still working - */ - ocfs2_drop_all_dl_inodes(osb); - /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); @@ -2274,9 +2249,6 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); journal->j_state = OCFS2_JOURNAL_FREE; - INIT_WORK(&osb->dentry_lock_work, ocfs2_drop_dl_inodes); - osb->dentry_lock_list = NULL; - /* get some pseudo constants for clustersize bits */ osb->s_clustersize_bits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);