From patchwork Wed Dec 15 18:31:59 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Benny Halevy X-Patchwork-Id: 414061 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter1.kernel.org (8.14.4/8.14.3) with ESMTP id oBFIWR3H025714 for ; Wed, 15 Dec 2010 18:32:28 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1754810Ab0LOScI (ORCPT ); Wed, 15 Dec 2010 13:32:08 -0500 Received: from daytona.panasas.com ([67.152.220.89]:47286 "EHLO daytona.panasas.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751114Ab0LOScH (ORCPT ); Wed, 15 Dec 2010 13:32:07 -0500 Received: from fs1.bhalevy.com ([172.17.33.34]) by daytona.panasas.com with Microsoft SMTPSVC(6.0.3790.4675); Wed, 15 Dec 2010 13:32:05 -0500 From: Benny Halevy To: linux-nfs@vger.kernel.org Subject: [PATCH 6/9] Revert "pnfs-submit: wave2: remove cl_layoutrecalls list" Date: Wed, 15 Dec 2010 20:31:59 +0200 Message-Id: <1292437919-21862-1-git-send-email-bhalevy@panasas.com> X-Mailer: git-send-email 1.7.2.3 In-Reply-To: <4D0908F9.4060208@panasas.com> References: <4D0908F9.4060208@panasas.com> X-OriginalArrivalTime: 15 Dec 2010 18:32:05.0598 (UTC) FILETIME=[5FBD3FE0:01CB9C86] Sender: linux-nfs-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-nfs@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter1.kernel.org [140.211.167.41]); Wed, 15 Dec 2010 18:32:29 +0000 (UTC) diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h index b8cafb5..7f55c7e 100644 --- a/fs/nfs/callback.h +++ b/fs/nfs/callback.h @@ -154,7 +154,6 @@ struct cb_layoutrecallargs { union { struct { struct nfs_fh cbl_fh; - struct inode *cbl_inode; struct pnfs_layout_range cbl_range; nfs4_stateid cbl_stateid; }; @@ -165,6 +164,9 @@ struct cb_layoutrecallargs { extern unsigned nfs4_callback_layoutrecall( struct cb_layoutrecallargs *args, void 
*dummy, struct cb_process_state *cps); +extern bool matches_outstanding_recall(struct inode *ino, + struct pnfs_layout_range *range); +extern void notify_drained(struct nfs_client *clp, u64 mask); static inline void put_session_client(struct nfs4_session *session) { diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index bc532b0..a9d162f 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -120,16 +120,82 @@ int nfs4_validate_delegation_stateid(struct nfs_delegation *delegation, const nf #if defined(CONFIG_NFS_V4_1) -static int initiate_layout_draining(struct nfs_client *clp, - struct cb_layoutrecallargs *args) +static bool +_recall_matches_lget(struct pnfs_cb_lrecall_info *cb_info, + struct inode *ino, struct pnfs_layout_range *range) +{ + struct cb_layoutrecallargs *cb_args = &cb_info->pcl_args; + + switch (cb_args->cbl_recall_type) { + case RETURN_ALL: + return true; + case RETURN_FSID: + return !memcmp(&NFS_SERVER(ino)->fsid, &cb_args->cbl_fsid, + sizeof(struct nfs_fsid)); + case RETURN_FILE: + return (ino == cb_info->pcl_ino) && + should_free_lseg(range, &cb_args->cbl_range); + default: + /* Should never hit here, as decode_layoutrecall_args() + * will verify cb_info from server. 
+ */ + BUG(); + } +} + +bool +matches_outstanding_recall(struct inode *ino, struct pnfs_layout_range *range) { + struct nfs_client *clp = NFS_SERVER(ino)->nfs_client; + struct pnfs_cb_lrecall_info *cb_info; + bool rv = false; + + assert_spin_locked(&clp->cl_lock); + list_for_each_entry(cb_info, &clp->cl_layoutrecalls, pcl_list) { + if (_recall_matches_lget(cb_info, ino, range)) { + rv = true; + break; + } + } + return rv; +} + +void notify_drained(struct nfs_client *clp, u64 mask) +{ + atomic_t **ptr = clp->cl_drain_notification; + + /* clp lock not needed except to remove used up entries */ + /* Should probably use functions defined in bitmap.h */ + while (mask) { + if ((mask & 1) && (atomic_dec_and_test(*ptr))) { + struct pnfs_cb_lrecall_info *cb_info; + + cb_info = container_of(*ptr, + struct pnfs_cb_lrecall_info, + pcl_count); + spin_lock(&clp->cl_lock); + /* Removing from the list unblocks LAYOUTGETs */ + list_del(&cb_info->pcl_list); + clp->cl_cb_lrecall_count--; + clp->cl_drain_notification[cb_info->pcl_notify_bit] = NULL; /* slot == bit number */ + spin_unlock(&clp->cl_lock); + kfree(cb_info); + } + mask >>= 1; + ptr++; + } +} + +static int initiate_layout_draining(struct pnfs_cb_lrecall_info *cb_info) +{ + struct nfs_client *clp = cb_info->pcl_clp; struct pnfs_layout_hdr *lo; int rv = NFS4ERR_NOMATCHING_LAYOUT; + struct cb_layoutrecallargs *args = &cb_info->pcl_args; if (args->cbl_recall_type == RETURN_FILE) { LIST_HEAD(free_me_list); - args->cbl_inode = NULL; spin_lock(&clp->cl_lock); list_for_each_entry(lo, &clp->cl_layouts, layouts) { if (nfs_compare_fh(&args->cbl_fh, @@ -138,12 +204,16 @@ static int initiate_layout_draining(struct nfs_client *clp, if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) rv = NFS4ERR_DELAY; else { + /* FIXME I need to better understand igrab and + * does having a layout ref keep ino around? + * It should. + */ /* Without this, layout can be freed as soon * as we release cl_lock. Matched in * do_callback_layoutrecall. 
*/ get_layout_hdr(lo); - args->cbl_inode = lo->inode; + cb_info->pcl_ino = lo->inode; rv = NFS4_OK; } break; @@ -154,6 +224,8 @@ static int initiate_layout_draining(struct nfs_client *clp, if (rv == NFS4_OK) { lo->plh_block_lgets++; nfs4_asynch_forget_layouts(lo, &args->cbl_range, + cb_info->pcl_notify_bit, + &cb_info->pcl_count, &free_me_list); } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); @@ -170,12 +242,18 @@ static int initiate_layout_draining(struct nfs_client *clp, }; spin_lock(&clp->cl_lock); + /* Per RFC 5661, 12.5.5.2.1.5, bulk recall must be serialized */ + if (!list_is_singular(&clp->cl_layoutrecalls)) { + spin_unlock(&clp->cl_lock); + return NFS4ERR_DELAY; + } list_for_each_entry(lo, &clp->cl_layouts, layouts) { if ((args->cbl_recall_type == RETURN_FSID) && memcmp(&NFS_SERVER(lo->inode)->fsid, &args->cbl_fsid, sizeof(struct nfs_fsid))) continue; get_layout_hdr(lo); + /* We could list_del(&lo->layouts) here */ BUG_ON(!list_empty(&lo->plh_bulk_recall)); list_add(&lo->plh_bulk_recall, &recall_list); } @@ -184,7 +262,10 @@ static int initiate_layout_draining(struct nfs_client *clp, &recall_list, plh_bulk_recall) { spin_lock(&lo->inode->i_lock); set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - nfs4_asynch_forget_layouts(lo, &range, &free_me_list); + nfs4_asynch_forget_layouts(lo, &range, + cb_info->pcl_notify_bit, + &cb_info->pcl_count, + &free_me_list); list_del_init(&lo->plh_bulk_recall); spin_unlock(&lo->inode->i_lock); put_layout_hdr(lo); @@ -198,29 +279,69 @@ static int initiate_layout_draining(struct nfs_client *clp, static u32 do_callback_layoutrecall(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - u32 status, res = NFS4ERR_DELAY; + struct pnfs_cb_lrecall_info *new; + atomic_t **ptr; + int bit_num; + u32 res; dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); - if (test_and_set_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state)) + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) { + res = NFS4ERR_DELAY; goto out; - 
- atomic_inc(&clp->cl_recall_count); - status = initiate_layout_draining(clp, args); - if (atomic_dec_and_test(&clp->cl_recall_count)) - res = NFS4ERR_NOMATCHING_LAYOUT; - else + } + memcpy(&new->pcl_args, args, sizeof(*args)); + atomic_set(&new->pcl_count, 1); + new->pcl_clp = clp; + new->pcl_ino = NULL; + spin_lock(&clp->cl_lock); + if (clp->cl_cb_lrecall_count >= PNFS_MAX_CB_LRECALLS) { + kfree(new); res = NFS4ERR_DELAY; - if (status) - res = status; - else if (args->cbl_recall_type == RETURN_FILE) { - struct pnfs_layout_hdr *lo; + spin_unlock(&clp->cl_lock); + goto out; + } + clp->cl_cb_lrecall_count++; + /* Adding to the list will block conflicting LGET activity */ + list_add_tail(&new->pcl_list, &clp->cl_layoutrecalls); + for (bit_num = 0, ptr = clp->cl_drain_notification; *ptr; ptr++) + bit_num++; + *ptr = &new->pcl_count; + new->pcl_notify_bit = bit_num; + spin_unlock(&clp->cl_lock); + res = initiate_layout_draining(new); + if (res || atomic_dec_and_test(&new->pcl_count)) { + spin_lock(&clp->cl_lock); + list_del(&new->pcl_list); + clp->cl_cb_lrecall_count--; + clp->cl_drain_notification[bit_num] = NULL; /* slot == bit number */ + spin_unlock(&clp->cl_lock); + if (res == NFS4_OK) { + if (args->cbl_recall_type == RETURN_FILE) { + struct pnfs_layout_hdr *lo; + + lo = NFS_I(new->pcl_ino)->layout; + spin_lock(&lo->inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&lo->inode->i_lock); + put_layout_hdr(lo); + } + res = NFS4ERR_NOMATCHING_LAYOUT; + } + kfree(new); + } else { + /* We are currently using a referenced layout */ + if (args->cbl_recall_type == RETURN_FILE) { + struct pnfs_layout_hdr *lo; - lo = NFS_I(args->cbl_inode)->layout; - spin_lock(&lo->inode->i_lock); - lo->plh_block_lgets--; - spin_unlock(&lo->inode->i_lock); - put_layout_hdr(lo); + lo = NFS_I(new->pcl_ino)->layout; + spin_lock(&lo->inode->i_lock); + lo->plh_block_lgets--; + spin_unlock(&lo->inode->i_lock); + put_layout_hdr(lo); + } + res = NFS4ERR_DELAY; } - clear_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state); 
out: dprintk("%s returning %i\n", __func__, res); return res; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 9042a7a..f8e712f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -158,6 +158,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_machine_cred = cred; #if defined(CONFIG_NFS_V4_1) INIT_LIST_HEAD(&clp->cl_layouts); + INIT_LIST_HEAD(&clp->cl_layoutrecalls); #endif nfs_fscache_get_client_cookie(clp); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a917872..15fea61 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -44,7 +44,6 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_REBOOT, NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, - NFS4CLNT_LAYOUTRECALL, NFS4CLNT_SESSION_RESET, NFS4CLNT_RECALL_SLOT, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5331f28..1c79c09 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5378,9 +5378,14 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) dprintk("--> %s\n", __func__); - if (!nfs4_sequence_done(task, &lgp->res.seq_res)) + if (!nfs4_sequence_done(task, &lgp->res.seq_res)) { + /* layout code relies on fact that in this case + * code falls back to tk_action=call_start, but not + * back to rpc_prepare_task, to keep plh_outstanding + * correct. 
+ */ return; - + } switch (task->tk_status) { case 0: break; @@ -5402,6 +5407,7 @@ static void nfs4_layoutget_release(void *calldata) struct nfs4_layoutget *lgp = calldata; dprintk("--> %s\n", __func__); + put_layout_hdr(NFS_I(lgp->args.inode)->layout); if (lgp->res.layout.buf != NULL) free_page((unsigned long) lgp->res.layout.buf); put_nfs_open_context(lgp->args.ctx); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 1899dc6..8b44c41 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -277,17 +277,17 @@ init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) smp_mb(); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->layout = lo; - lseg->pls_recall_count = 0; + lseg->pls_notify_mask = 0; } static void free_lseg(struct pnfs_layout_segment *lseg) { struct inode *ino = lseg->layout->inode; - int count = lseg->pls_recall_count; + u64 mask = lseg->pls_notify_mask; BUG_ON(atomic_read(&lseg->pls_refcount) != 0); NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); - atomic_sub(count, &NFS_SERVER(ino)->nfs_client->cl_recall_count); + notify_drained(NFS_SERVER(ino)->nfs_client, mask); /* Matched by get_layout_hdr_locked in pnfs_insert_layout */ put_layout_hdr(NFS_I(ino)->layout); } @@ -544,8 +544,10 @@ send_layoutget(struct pnfs_layout_hdr *lo, BUG_ON(ctx == NULL); lgp = kzalloc(sizeof(*lgp), GFP_KERNEL); - if (lgp == NULL) + if (lgp == NULL) { + put_layout_hdr(lo); return NULL; + } lgp->args.minlength = NFS4_MAX_UINT64; lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; lgp->args.range.iomode = range->iomode; @@ -569,6 +571,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range, + int notify_bit, atomic_t *notify_count, struct list_head *tmp_list) { struct pnfs_layout_segment *lseg, *tmp; @@ -576,8 +579,8 @@ void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo, assert_spin_locked(&lo->inode->i_lock); list_for_each_entry_safe(lseg, tmp, &lo->segs, fi_list) if 
(should_free_lseg(&lseg->range, range)) { - lseg->pls_recall_count++; - atomic_inc(&NFS_SERVER(lo->inode)->nfs_client->cl_recall_count); + lseg->pls_notify_mask |= (1ULL << notify_bit); /* 64-bit mask */ + atomic_inc(notify_count); mark_lseg_invalid(lseg, tmp_list); } } @@ -833,6 +836,13 @@ pnfs_update_layout(struct inode *ino, if (!pnfs_enabled_sb(NFS_SERVER(ino))) return NULL; + spin_lock(&clp->cl_lock); + if (matches_outstanding_recall(ino, &arg)) { + dprintk("%s matches recall, use MDS\n", __func__); + spin_unlock(&clp->cl_lock); + return NULL; + } + spin_unlock(&clp->cl_lock); spin_lock(&ino->i_lock); lo = pnfs_find_alloc_layout(ino); if (lo == NULL) { @@ -840,12 +850,6 @@ pnfs_update_layout(struct inode *ino, goto out_unlock; } - /* Do we even need to bother with this? */ - if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { - dprintk("%s matches recall, use MDS\n", __func__); - goto out_unlock; - } /* Check to see if the layout for the given range already exists */ lseg = pnfs_find_lseg(lo, &arg); if (lseg) @@ -883,7 +887,7 @@ pnfs_update_layout(struct inode *ino, spin_unlock(&ino->i_lock); } atomic_dec(&lo->plh_outstanding); - put_layout_hdr(lo); + spin_unlock(&ino->i_lock); out: dprintk("%s end, state 0x%lx lseg %p\n", __func__, nfsi->layout->plh_flags, lseg); @@ -927,11 +931,14 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } spin_lock(&ino->i_lock); - if (test_bit(NFS4CLNT_LAYOUTRECALL, &clp->cl_state) || - test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + /* decrement needs to be done before call to pnfs_layoutgets_blocked */ + spin_lock(&clp->cl_lock); + if (matches_outstanding_recall(ino, &res->range)) { + spin_unlock(&clp->cl_lock); dprintk("%s forget reply due to recall\n", __func__); goto out_forget_reply; } + spin_unlock(&clp->cl_lock); if (pnfs_layoutgets_blocked(lo, &res->stateid, 1)) { dprintk("%s forget reply due to state\n", __func__); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 59eb0e8..b011b3c 
100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -31,6 +31,7 @@ #define FS_NFS_PNFS_H #include +#include "callback.h" enum { NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ @@ -42,7 +43,7 @@ struct pnfs_layout_segment { atomic_t pls_refcount; unsigned long pls_flags; struct pnfs_layout_hdr *layout; - int pls_recall_count; + u64 pls_notify_mask; }; enum pnfs_try_status { @@ -126,6 +127,15 @@ struct pnfs_device { unsigned int pglen; }; +struct pnfs_cb_lrecall_info { + struct list_head pcl_list; /* hook into cl_layoutrecalls list */ + atomic_t pcl_count; + int pcl_notify_bit; + struct nfs_client *pcl_clp; + struct inode *pcl_ino; + struct cb_layoutrecallargs pcl_args; +}; + /* * Device ID RCU cache. A device ID is unique per client ID and layout type. */ @@ -221,6 +231,7 @@ int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct nfs4_state *open_state); void nfs4_asynch_forget_layouts(struct pnfs_layout_hdr *lo, struct pnfs_layout_range *range, + int notify_bit, atomic_t *notify_count, struct list_head *tmp_list); static inline bool diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 5e97d69..b02f486 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -84,6 +84,10 @@ struct nfs_client { struct nfs4_session *cl_session; /* sharred session */ struct list_head cl_layouts; atomic_t cl_recall_count; /* no. of lsegs in recall */ + struct list_head cl_layoutrecalls; + unsigned long cl_cb_lrecall_count; +#define PNFS_MAX_CB_LRECALLS (64) + atomic_t *cl_drain_notification[PNFS_MAX_CB_LRECALLS]; struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */ #endif /* CONFIG_NFS_V4_1 */