Message ID | 20190107063145.9239-1-zyan@redhat.com (mailing list archive)
---|---
State | New, archived
Series | [v2] ceph: map snapid to anonymous bdev ID
On Mon, 2019-01-07 at 14:31 +0800, Yan, Zheng wrote: > ceph_getattr() return zero dev ID for head inodes and set dev ID to > snapid directly for snaphost inodes. This is not good because userspace > utilities may consider device ID of 0 as invalid, snapid may conflict > with other device's ID. > > This patch introduces "snapids to anonymous bdev IDs" map. we create a > new mapping when we see a snapid for the first time. we trim unused > mapping after it is ilde for 5 minutes. > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com> > --- > fs/ceph/inode.c | 37 +++++++---- > fs/ceph/mds_client.c | 8 +++ > fs/ceph/mds_client.h | 13 ++++ > fs/ceph/snap.c | 154 +++++++++++++++++++++++++++++++++++++++++++ > fs/ceph/super.h | 13 +++- > 5 files changed, 210 insertions(+), 15 deletions(-) > > Changes since v1 > change the code that handles leaked snap map on umount > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > index c080b133c08b..7f82ceff510a 100644 > --- a/fs/ceph/inode.c > +++ b/fs/ceph/inode.c > @@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode) > */ > if (ci->i_snap_realm) { > struct ceph_mds_client *mdsc = > - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; > - struct ceph_snap_realm *realm = ci->i_snap_realm; > - > - dout(" dropping residual ref to snap realm %p\n", realm); > - spin_lock(&realm->inodes_with_caps_lock); > - list_del_init(&ci->i_snap_realm_item); > - ci->i_snap_realm = NULL; > - if (realm->ino == ci->i_vino.ino) > - realm->inode = NULL; > - spin_unlock(&realm->inodes_with_caps_lock); > - ceph_put_snap_realm(mdsc, realm); > + ceph_inode_to_client(inode)->mdsc; > + if (ceph_snap(inode) == CEPH_NOSNAP) { > + struct ceph_snap_realm *realm = ci->i_snap_realm; > + dout(" dropping residual ref to snap realm %p\n", > + realm); > + spin_lock(&realm->inodes_with_caps_lock); > + list_del_init(&ci->i_snap_realm_item); > + ci->i_snap_realm = NULL; > + if (realm->ino == ci->i_vino.ino) > + realm->inode = NULL; > + spin_unlock(&realm->inodes_with_caps_lock); > + ceph_put_snap_realm(mdsc, realm); > + } else { > + ceph_put_snapid_map(mdsc, ci->i_snapid_map); > + ci->i_snap_realm = NULL; > + } > } > > kfree(ci->i_symlink); > @@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, > pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, > iinfo->pool_ns_len); > > + if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) > + ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); > + > spin_lock(&ci->i_ceph_lock); > > /* > @@ -2260,10 +2268,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat, > if (!err) { > generic_fillattr(inode, stat); > stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); > - if (ceph_snap(inode) != CEPH_NOSNAP) > - stat->dev = ceph_snap(inode); > + if (ceph_snap(inode) == CEPH_NOSNAP) > + stat->dev = inode->i_sb->s_dev; > else > - stat->dev = 0; > + stat->dev = ci->i_snapid_map ? 
ci->i_snapid_map->dev : 0; > + > if (S_ISDIR(inode->i_mode)) { > if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), > RBYTES)) > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 32f627517fd0..1ff490c2d38e 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -3789,6 +3789,8 @@ static void delayed_work(struct work_struct *work) > dout("mdsc delayed_work\n"); > ceph_check_delayed_caps(mdsc); > > + ceph_trim_snapid_map(mdsc); > + > mutex_lock(&mdsc->mutex); > renew_interval = mdsc->mdsmap->m_session_timeout >> 2; > renew_caps = time_after_eq(jiffies, HZ*renew_interval + > @@ -3891,6 +3893,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) > ceph_caps_init(mdsc); > ceph_adjust_min_caps(mdsc, fsc->min_caps); > > + spin_lock_init(&mdsc->snapid_map_lock); > + mdsc->snapid_map_tree = RB_ROOT; > + INIT_LIST_HEAD(&mdsc->snapid_map_lru); > + > init_rwsem(&mdsc->pool_perm_rwsem); > mdsc->pool_perm_tree = RB_ROOT; > > @@ -4084,6 +4090,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) > WARN_ON(!list_empty(&mdsc->cap_delay_list)); > mutex_unlock(&mdsc->mutex); > > + ceph_cleanup_snapid_map(mdsc); > + > ceph_cleanup_empty_realms(mdsc); > > cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > index 4f962642fee4..d3a5c4046316 100644 > --- a/fs/ceph/mds_client.h > +++ b/fs/ceph/mds_client.h > @@ -313,6 +313,15 @@ struct ceph_pool_perm { > char pool_ns[]; > }; > > +struct ceph_snapid_map { > + struct rb_node node; > + struct list_head lru; > + atomic_t ref; > + u64 snap; > + dev_t dev; > + unsigned long last_used; > +}; > + > /* > * mds client state > */ > @@ -390,6 +399,10 @@ struct ceph_mds_client { > struct list_head dentry_lru; > int num_dentry; > > + spinlock_t snapid_map_lock; > + struct rb_root snapid_map_tree; > + struct list_head snapid_map_lru; > + > struct rw_semaphore pool_perm_rwsem; > struct rb_root pool_perm_tree; > > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c > index d7359628fd1f..c3fd66d6e03a 100644 > --- a/fs/ceph/snap.c > +++ b/fs/ceph/snap.c > @@ -988,3 +988,157 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, > up_write(&mdsc->snap_rwsem); > return; > } > + > +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, > + u64 snap) > +{ > + struct ceph_snapid_map *sm, *exist; > + struct rb_node **p, *parent; > + int ret; > + > + exist = NULL; > + spin_lock(&mdsc->snapid_map_lock); > + p = &mdsc->snapid_map_tree.rb_node; > + while (*p) { > + exist = rb_entry(*p, struct ceph_snapid_map, node); > + if (snap > exist->snap) { > + p = &(*p)->rb_left; > + } else if (snap < exist->snap) { > + p = &(*p)->rb_right; > + } else { > + if (atomic_inc_return(&exist->ref) == 1) > + list_del_init(&exist->lru); > + break; > + } > + exist = NULL; > + } > + spin_unlock(&mdsc->snapid_map_lock); > + if (exist) { > + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); > + return exist; > + } > + > + sm = kmalloc(sizeof(*sm), GFP_NOFS); > + if (!sm) > + return NULL; > + > + ret = get_anon_bdev(&sm->dev); > + if (ret < 0) { > + kfree(sm); > + return NULL; > + } > + > + INIT_LIST_HEAD(&sm->lru); > + atomic_set(&sm->ref, 1); > + sm->snap = snap; > + > + exist = NULL; > + parent = NULL; > + p = &mdsc->snapid_map_tree.rb_node; > + spin_lock(&mdsc->snapid_map_lock); > + while (*p) { > + parent = *p; > + exist = rb_entry(*p, struct ceph_snapid_map, node); > + if (snap > exist->snap) > + p = &(*p)->rb_left; > + else if (snap < exist->snap) > + p = 
&(*p)->rb_right; > + else > + break; > + exist = NULL; > + } > + if (exist) { > + if (atomic_inc_return(&exist->ref) == 1) > + list_del_init(&exist->lru); > + } else { > + rb_link_node(&sm->node, parent, p); > + rb_insert_color(&sm->node, &mdsc->snapid_map_tree); > + } > + spin_unlock(&mdsc->snapid_map_lock); > + if (exist) { > + free_anon_bdev(sm->dev); > + kfree(sm); > + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); > + return exist; > + } > + > + dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); > + return sm; > +} > + > +void ceph_put_snapid_map(struct ceph_mds_client* mdsc, > + struct ceph_snapid_map *sm) > +{ > + if (!sm) > + return; > + if (!atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { > + if (!RB_EMPTY_NODE(&sm->node)) { > + sm->last_used = jiffies; > + list_add_tail(&sm->lru, &mdsc->snapid_map_lru); > + spin_unlock(&mdsc->snapid_map_lock); > + } else { > + /* already cleaned up by > + * ceph_cleanup_snapid_map() */ > + spin_unlock(&mdsc->snapid_map_lock); > + kfree(sm); > + } > + } > +} > + > +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) > +{ > + struct ceph_snapid_map *sm; > + unsigned long now; > + LIST_HEAD(to_free); > + > + /* unused map expires after 5 minutes */ > + static unsigned timeout = 5 * 60 * HZ; > + No need for static constant here. This could just be something like: #define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) ...particularly if you don't care about being able to set this at runtime. > + spin_lock(&mdsc->snapid_map_lock); > + now = jiffies; > + > + while (!list_empty(&mdsc->snapid_map_lru)) { > + sm = list_first_entry(&mdsc->snapid_map_lru, > + struct ceph_snapid_map, lru); > + if (time_after(sm->last_used + timeout, now)) > + break; > + > + rb_erase(&sm->node, &mdsc->snapid_map_tree); > + list_move(&sm->lru, &to_free); > + } > + spin_unlock(&mdsc->snapid_map_lock); > + > + while (!list_empty(&to_free)) { > + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); > + list_del(&sm->lru); > + dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); > + free_anon_bdev(sm->dev); > + kfree(sm); > + } > +} > + > +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) > +{ > + struct ceph_snapid_map *sm; > + struct rb_node *p; > + LIST_HEAD(to_free); > + > + spin_lock(&mdsc->snapid_map_lock); > + while ((p = rb_first(&mdsc->snapid_map_tree))) { > + sm = rb_entry(p, struct ceph_snapid_map, node); > + rb_erase(p, &mdsc->snapid_map_tree); > + RB_CLEAR_NODE(p); > + list_move(&sm->lru, &to_free); > + } > + spin_unlock(&mdsc->snapid_map_lock); > + > + while (!list_empty(&to_free)) { > + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); > + list_del(&sm->lru); > + free_anon_bdev(sm->dev); > + if (WARN_ON_ONCE(atomic_read(&sm->ref))) { > + pr_err("snapid map %llx -> %x still in use\n", > + sm->snap, sm->dev); > + } > + } > +} > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > index 79a265ba9200..7a74b5652a0c 100644 > --- a/fs/ceph/super.h > +++ b/fs/ceph/super.h > @@ -370,7 +370,10 @@ struct ceph_inode_info { > struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ > spinlock_t i_unsafe_lock; > > - struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ > + union { > + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ > + struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ > + }; > int i_snap_realm_counter; /* snap realm (if caps) */ > struct list_head i_snap_realm_item; > struct list_head i_snap_flush_item; > @@ -837,6 +840,14 @@ extern int 
__ceph_finish_cap_snap(struct ceph_inode_info *ci, > struct ceph_cap_snap *capsnap); > extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); > > +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, > + u64 snap); > +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, > + struct ceph_snapid_map *sm); > +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); > +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); > + > + > /* > * a cap_snap is "pending" if it is still awaiting an in-progress > * sync write (that may/may not still update size, mtime, etc.). I think this looks reasonable overall. It's definitely less confusing for userland than showing two different dentries as having the same ino/dev tuple without them being hardlinked (particularly if they are directories). Acked-by: Jeff Layton <jlayton@redhat.com>
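For reference, this is a sketch of what ceph_trim_snapid_map() would look like with that suggestion applied; the CEPH_SNAPID_MAP_TIMEOUT name is only illustrative and not taken from the posted patch:

#define CEPH_SNAPID_MAP_TIMEOUT	(5 * 60 * HZ)	/* unused map expires after 5 minutes */

void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
{
	struct ceph_snapid_map *sm;
	unsigned long now;
	LIST_HEAD(to_free);

	spin_lock(&mdsc->snapid_map_lock);
	now = jiffies;

	while (!list_empty(&mdsc->snapid_map_lru)) {
		sm = list_first_entry(&mdsc->snapid_map_lru,
				      struct ceph_snapid_map, lru);
		/* LRU order: stop at the first entry that is still fresh */
		if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
			break;

		rb_erase(&sm->node, &mdsc->snapid_map_tree);
		list_move(&sm->lru, &to_free);
	}
	spin_unlock(&mdsc->snapid_map_lock);

	/* free the expired entries outside the spinlock */
	while (!list_empty(&to_free)) {
		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
		list_del(&sm->lru);
		dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
		free_anon_bdev(sm->dev);
		kfree(sm);
	}
}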
On Mon, Jan 7, 2019 at 2:05 PM Jeff Layton <jlayton@redhat.com> wrote: > > On Mon, 2019-01-07 at 14:31 +0800, Yan, Zheng wrote: > > ceph_getattr() return zero dev ID for head inodes and set dev ID to > > snapid directly for snaphost inodes. This is not good because userspace > > utilities may consider device ID of 0 as invalid, snapid may conflict > > with other device's ID. > > > > This patch introduces "snapids to anonymous bdev IDs" map. we create a > > new mapping when we see a snapid for the first time. we trim unused > > mapping after it is ilde for 5 minutes. > > > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com> > > --- > > fs/ceph/inode.c | 37 +++++++---- > > fs/ceph/mds_client.c | 8 +++ > > fs/ceph/mds_client.h | 13 ++++ > > fs/ceph/snap.c | 154 +++++++++++++++++++++++++++++++++++++++++++ > > fs/ceph/super.h | 13 +++- > > 5 files changed, 210 insertions(+), 15 deletions(-) > > > > Changes since v1 > > change the code that handles leaked snap map on umount > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > > index c080b133c08b..7f82ceff510a 100644 > > --- a/fs/ceph/inode.c > > +++ b/fs/ceph/inode.c > > @@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode) > > */ > > if (ci->i_snap_realm) { > > struct ceph_mds_client *mdsc = > > - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; > > - struct ceph_snap_realm *realm = ci->i_snap_realm; > > - > > - dout(" dropping residual ref to snap realm %p\n", realm); > > - spin_lock(&realm->inodes_with_caps_lock); > > - list_del_init(&ci->i_snap_realm_item); > > - ci->i_snap_realm = NULL; > > - if (realm->ino == ci->i_vino.ino) > > - realm->inode = NULL; > > - spin_unlock(&realm->inodes_with_caps_lock); > > - ceph_put_snap_realm(mdsc, realm); > > + ceph_inode_to_client(inode)->mdsc; > > + if (ceph_snap(inode) == CEPH_NOSNAP) { > > + struct ceph_snap_realm *realm = ci->i_snap_realm; > > + dout(" dropping residual ref to snap realm %p\n", > > + realm); > > + spin_lock(&realm->inodes_with_caps_lock); > > + list_del_init(&ci->i_snap_realm_item); > > + ci->i_snap_realm = NULL; > > + if (realm->ino == ci->i_vino.ino) > > + realm->inode = NULL; > > + spin_unlock(&realm->inodes_with_caps_lock); > > + ceph_put_snap_realm(mdsc, realm); > > + } else { > > + ceph_put_snapid_map(mdsc, ci->i_snapid_map); > > + ci->i_snap_realm = NULL; > > + } > > } > > > > kfree(ci->i_symlink); > > @@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, > > pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, > > iinfo->pool_ns_len); > > > > + if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) > > + ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); > > + > > spin_lock(&ci->i_ceph_lock); > > > > /* > > @@ -2260,10 +2268,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat, > > if (!err) { > > generic_fillattr(inode, stat); > > stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); > > - if (ceph_snap(inode) != CEPH_NOSNAP) > > - stat->dev = ceph_snap(inode); > > + if (ceph_snap(inode) == CEPH_NOSNAP) > > + stat->dev = inode->i_sb->s_dev; > > else > > - stat->dev = 0; > > + stat->dev = ci->i_snapid_map ? 
ci->i_snapid_map->dev : 0; > > + > > if (S_ISDIR(inode->i_mode)) { > > if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), > > RBYTES)) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > index 32f627517fd0..1ff490c2d38e 100644 > > --- a/fs/ceph/mds_client.c > > +++ b/fs/ceph/mds_client.c > > @@ -3789,6 +3789,8 @@ static void delayed_work(struct work_struct *work) > > dout("mdsc delayed_work\n"); > > ceph_check_delayed_caps(mdsc); > > > > + ceph_trim_snapid_map(mdsc); > > + > > mutex_lock(&mdsc->mutex); > > renew_interval = mdsc->mdsmap->m_session_timeout >> 2; > > renew_caps = time_after_eq(jiffies, HZ*renew_interval + > > @@ -3891,6 +3893,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) > > ceph_caps_init(mdsc); > > ceph_adjust_min_caps(mdsc, fsc->min_caps); > > > > + spin_lock_init(&mdsc->snapid_map_lock); > > + mdsc->snapid_map_tree = RB_ROOT; > > + INIT_LIST_HEAD(&mdsc->snapid_map_lru); > > + > > init_rwsem(&mdsc->pool_perm_rwsem); > > mdsc->pool_perm_tree = RB_ROOT; > > > > @@ -4084,6 +4090,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) > > WARN_ON(!list_empty(&mdsc->cap_delay_list)); > > mutex_unlock(&mdsc->mutex); > > > > + ceph_cleanup_snapid_map(mdsc); > > + > > ceph_cleanup_empty_realms(mdsc); > > > > cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > > index 4f962642fee4..d3a5c4046316 100644 > > --- a/fs/ceph/mds_client.h > > +++ b/fs/ceph/mds_client.h > > @@ -313,6 +313,15 @@ struct ceph_pool_perm { > > char pool_ns[]; > > }; > > > > +struct ceph_snapid_map { > > + struct rb_node node; > > + struct list_head lru; > > + atomic_t ref; > > + u64 snap; > > + dev_t dev; > > + unsigned long last_used; > > +}; > > + > > /* > > * mds client state > > */ > > @@ -390,6 +399,10 @@ struct ceph_mds_client { > > struct list_head dentry_lru; > > int num_dentry; > > > > + spinlock_t snapid_map_lock; > > + struct rb_root snapid_map_tree; > > + struct list_head snapid_map_lru; > > + > > struct rw_semaphore pool_perm_rwsem; > > struct rb_root pool_perm_tree; > > > > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c > > index d7359628fd1f..c3fd66d6e03a 100644 > > --- a/fs/ceph/snap.c > > +++ b/fs/ceph/snap.c > > @@ -988,3 +988,157 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, > > up_write(&mdsc->snap_rwsem); > > return; > > } > > + > > +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, > > + u64 snap) > > +{ > > + struct ceph_snapid_map *sm, *exist; > > + struct rb_node **p, *parent; > > + int ret; > > + > > + exist = NULL; > > + spin_lock(&mdsc->snapid_map_lock); > > + p = &mdsc->snapid_map_tree.rb_node; > > + while (*p) { > > + exist = rb_entry(*p, struct ceph_snapid_map, node); > > + if (snap > exist->snap) { > > + p = &(*p)->rb_left; > > + } else if (snap < exist->snap) { > > + p = &(*p)->rb_right; > > + } else { > > + if (atomic_inc_return(&exist->ref) == 1) > > + list_del_init(&exist->lru); > > + break; > > + } > > + exist = NULL; > > + } > > + spin_unlock(&mdsc->snapid_map_lock); > > + if (exist) { > > + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); > > + return exist; > > + } > > + > > + sm = kmalloc(sizeof(*sm), GFP_NOFS); > > + if (!sm) > > + return NULL; > > + > > + ret = get_anon_bdev(&sm->dev); > > + if (ret < 0) { > > + kfree(sm); > > + return NULL; > > + } > > + > > + INIT_LIST_HEAD(&sm->lru); > > + atomic_set(&sm->ref, 1); > > + sm->snap = snap; > > + > > + exist = NULL; > > + parent = NULL; > > + p = 
&mdsc->snapid_map_tree.rb_node; > > + spin_lock(&mdsc->snapid_map_lock); > > + while (*p) { > > + parent = *p; > > + exist = rb_entry(*p, struct ceph_snapid_map, node); > > + if (snap > exist->snap) > > + p = &(*p)->rb_left; > > + else if (snap < exist->snap) > > + p = &(*p)->rb_right; > > + else > > + break; > > + exist = NULL; > > + } > > + if (exist) { > > + if (atomic_inc_return(&exist->ref) == 1) > > + list_del_init(&exist->lru); > > + } else { > > + rb_link_node(&sm->node, parent, p); > > + rb_insert_color(&sm->node, &mdsc->snapid_map_tree); > > + } > > + spin_unlock(&mdsc->snapid_map_lock); > > + if (exist) { > > + free_anon_bdev(sm->dev); > > + kfree(sm); > > + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); > > + return exist; > > + } > > + > > + dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); > > + return sm; > > +} > > + > > +void ceph_put_snapid_map(struct ceph_mds_client* mdsc, > > + struct ceph_snapid_map *sm) > > +{ > > + if (!sm) > > + return; > > + if (!atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { > > + if (!RB_EMPTY_NODE(&sm->node)) { > > + sm->last_used = jiffies; > > + list_add_tail(&sm->lru, &mdsc->snapid_map_lru); > > + spin_unlock(&mdsc->snapid_map_lock); > > + } else { > > + /* already cleaned up by > > + * ceph_cleanup_snapid_map() */ > > + spin_unlock(&mdsc->snapid_map_lock); > > + kfree(sm); > > + } > > + } > > +} > > + > > +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) > > +{ > > + struct ceph_snapid_map *sm; > > + unsigned long now; > > + LIST_HEAD(to_free); > > + > > + /* unused map expires after 5 minutes */ > > + static unsigned timeout = 5 * 60 * HZ; > > + > > No need for static constant here. This could just be something like: > > #define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) > > ...particularly if you don't care about being able to set this at > runtime. 
> > > + spin_lock(&mdsc->snapid_map_lock); > > + now = jiffies; > > + > > + while (!list_empty(&mdsc->snapid_map_lru)) { > > + sm = list_first_entry(&mdsc->snapid_map_lru, > > + struct ceph_snapid_map, lru); > > + if (time_after(sm->last_used + timeout, now)) > > + break; > > + > > + rb_erase(&sm->node, &mdsc->snapid_map_tree); > > + list_move(&sm->lru, &to_free); > > + } > > + spin_unlock(&mdsc->snapid_map_lock); > > + > > + while (!list_empty(&to_free)) { > > + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); > > + list_del(&sm->lru); > > + dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); > > + free_anon_bdev(sm->dev); > > + kfree(sm); > > + } > > +} > > + > > +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) > > +{ > > + struct ceph_snapid_map *sm; > > + struct rb_node *p; > > + LIST_HEAD(to_free); > > + > > + spin_lock(&mdsc->snapid_map_lock); > > + while ((p = rb_first(&mdsc->snapid_map_tree))) { > > + sm = rb_entry(p, struct ceph_snapid_map, node); > > + rb_erase(p, &mdsc->snapid_map_tree); > > + RB_CLEAR_NODE(p); > > + list_move(&sm->lru, &to_free); > > + } > > + spin_unlock(&mdsc->snapid_map_lock); > > + > > + while (!list_empty(&to_free)) { > > + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); > > + list_del(&sm->lru); > > + free_anon_bdev(sm->dev); > > + if (WARN_ON_ONCE(atomic_read(&sm->ref))) { > > + pr_err("snapid map %llx -> %x still in use\n", > > + sm->snap, sm->dev); > > + } > > + } > > +} > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > > index 79a265ba9200..7a74b5652a0c 100644 > > --- a/fs/ceph/super.h > > +++ b/fs/ceph/super.h > > @@ -370,7 +370,10 @@ struct ceph_inode_info { > > struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ > > spinlock_t i_unsafe_lock; > > > > - struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ > > + union { > > + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ > > + struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ > > + }; > > int i_snap_realm_counter; /* snap realm (if caps) */ > > struct list_head i_snap_realm_item; > > struct list_head i_snap_flush_item; > > @@ -837,6 +840,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, > > struct ceph_cap_snap *capsnap); > > extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); > > > > +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, > > + u64 snap); > > +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, > > + struct ceph_snapid_map *sm); > > +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); > > +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); > > + > > + > > /* > > * a cap_snap is "pending" if it is still awaiting an in-progress > > * sync write (that may/may not still update size, mtime, etc.). > > > I think this looks reasonable overall. It's definitely less confusing > for userland than showing two different dentries as having the same > ino/dev tuple without them being hardlinked (particularly if they are > directories). This can result in the same snapshot dentry having different dev_t values over time, right? Do we care about that? Zheng, please include the link to the tracker ticket. #22353? Thanks, Ilya
On Mon, 2019-01-07 at 15:18 +0100, Ilya Dryomov wrote: > On Mon, Jan 7, 2019 at 2:05 PM Jeff Layton <jlayton@redhat.com> wrote: > > On Mon, 2019-01-07 at 14:31 +0800, Yan, Zheng wrote: > > > ceph_getattr() return zero dev ID for head inodes and set dev ID to > > > snapid directly for snaphost inodes. This is not good because userspace > > > utilities may consider device ID of 0 as invalid, snapid may conflict > > > with other device's ID. > > > > > > This patch introduces "snapids to anonymous bdev IDs" map. we create a > > > new mapping when we see a snapid for the first time. we trim unused > > > mapping after it is ilde for 5 minutes. > > > > > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com> > > > --- > > > fs/ceph/inode.c | 37 +++++++---- > > > fs/ceph/mds_client.c | 8 +++ > > > fs/ceph/mds_client.h | 13 ++++ > > > fs/ceph/snap.c | 154 +++++++++++++++++++++++++++++++++++++++++++ > > > fs/ceph/super.h | 13 +++- > > > 5 files changed, 210 insertions(+), 15 deletions(-) > > > > > > Changes since v1 > > > change the code that handles leaked snap map on umount > > > > > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > > > index c080b133c08b..7f82ceff510a 100644 > > > --- a/fs/ceph/inode.c > > > +++ b/fs/ceph/inode.c > > > @@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode) > > > */ > > > if (ci->i_snap_realm) { > > > struct ceph_mds_client *mdsc = > > > - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; > > > - struct ceph_snap_realm *realm = ci->i_snap_realm; > > > - > > > - dout(" dropping residual ref to snap realm %p\n", realm); > > > - spin_lock(&realm->inodes_with_caps_lock); > > > - list_del_init(&ci->i_snap_realm_item); > > > - ci->i_snap_realm = NULL; > > > - if (realm->ino == ci->i_vino.ino) > > > - realm->inode = NULL; > > > - spin_unlock(&realm->inodes_with_caps_lock); > > > - ceph_put_snap_realm(mdsc, realm); > > > + ceph_inode_to_client(inode)->mdsc; > > > + if (ceph_snap(inode) == CEPH_NOSNAP) { > > > + struct ceph_snap_realm *realm = ci->i_snap_realm; > > > + dout(" dropping residual ref to snap realm %p\n", > > > + realm); > > > + spin_lock(&realm->inodes_with_caps_lock); > > > + list_del_init(&ci->i_snap_realm_item); > > > + ci->i_snap_realm = NULL; > > > + if (realm->ino == ci->i_vino.ino) > > > + realm->inode = NULL; > > > + spin_unlock(&realm->inodes_with_caps_lock); > > > + ceph_put_snap_realm(mdsc, realm); > > > + } else { > > > + ceph_put_snapid_map(mdsc, ci->i_snapid_map); > > > + ci->i_snap_realm = NULL; > > > + } > > > } > > > > > > kfree(ci->i_symlink); > > > @@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, > > > pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, > > > iinfo->pool_ns_len); > > > > > > + if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) > > > + ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); > > > + > > > spin_lock(&ci->i_ceph_lock); > > > > > > /* > > > @@ -2260,10 +2268,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat, > > > if (!err) { > > > generic_fillattr(inode, stat); > > > stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); > > > - if (ceph_snap(inode) != CEPH_NOSNAP) > > > - stat->dev = ceph_snap(inode); > > > + if (ceph_snap(inode) == CEPH_NOSNAP) > > > + stat->dev = inode->i_sb->s_dev; > > > else > > > - stat->dev = 0; > > > + stat->dev = ci->i_snapid_map ? 
ci->i_snapid_map->dev : 0; > > > + > > > if (S_ISDIR(inode->i_mode)) { > > > if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), > > > RBYTES)) > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > > > index 32f627517fd0..1ff490c2d38e 100644 > > > --- a/fs/ceph/mds_client.c > > > +++ b/fs/ceph/mds_client.c > > > @@ -3789,6 +3789,8 @@ static void delayed_work(struct work_struct *work) > > > dout("mdsc delayed_work\n"); > > > ceph_check_delayed_caps(mdsc); > > > > > > + ceph_trim_snapid_map(mdsc); > > > + > > > mutex_lock(&mdsc->mutex); > > > renew_interval = mdsc->mdsmap->m_session_timeout >> 2; > > > renew_caps = time_after_eq(jiffies, HZ*renew_interval + > > > @@ -3891,6 +3893,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) > > > ceph_caps_init(mdsc); > > > ceph_adjust_min_caps(mdsc, fsc->min_caps); > > > > > > + spin_lock_init(&mdsc->snapid_map_lock); > > > + mdsc->snapid_map_tree = RB_ROOT; > > > + INIT_LIST_HEAD(&mdsc->snapid_map_lru); > > > + > > > init_rwsem(&mdsc->pool_perm_rwsem); > > > mdsc->pool_perm_tree = RB_ROOT; > > > > > > @@ -4084,6 +4090,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) > > > WARN_ON(!list_empty(&mdsc->cap_delay_list)); > > > mutex_unlock(&mdsc->mutex); > > > > > > + ceph_cleanup_snapid_map(mdsc); > > > + > > > ceph_cleanup_empty_realms(mdsc); > > > > > > cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > > > index 4f962642fee4..d3a5c4046316 100644 > > > --- a/fs/ceph/mds_client.h > > > +++ b/fs/ceph/mds_client.h > > > @@ -313,6 +313,15 @@ struct ceph_pool_perm { > > > char pool_ns[]; > > > }; > > > > > > +struct ceph_snapid_map { > > > + struct rb_node node; > > > + struct list_head lru; > > > + atomic_t ref; > > > + u64 snap; > > > + dev_t dev; > > > + unsigned long last_used; > > > +}; > > > + > > > /* > > > * mds client state > > > */ > > > @@ -390,6 +399,10 @@ struct ceph_mds_client { > > > struct list_head dentry_lru; > > > int num_dentry; > > > > > > + spinlock_t snapid_map_lock; > > > + struct rb_root snapid_map_tree; > > > + struct list_head snapid_map_lru; > > > + > > > struct rw_semaphore pool_perm_rwsem; > > > struct rb_root pool_perm_tree; > > > > > > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c > > > index d7359628fd1f..c3fd66d6e03a 100644 > > > --- a/fs/ceph/snap.c > > > +++ b/fs/ceph/snap.c > > > @@ -988,3 +988,157 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, > > > up_write(&mdsc->snap_rwsem); > > > return; > > > } > > > + > > > +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, > > > + u64 snap) > > > +{ > > > + struct ceph_snapid_map *sm, *exist; > > > + struct rb_node **p, *parent; > > > + int ret; > > > + > > > + exist = NULL; > > > + spin_lock(&mdsc->snapid_map_lock); > > > + p = &mdsc->snapid_map_tree.rb_node; > > > + while (*p) { > > > + exist = rb_entry(*p, struct ceph_snapid_map, node); > > > + if (snap > exist->snap) { > > > + p = &(*p)->rb_left; > > > + } else if (snap < exist->snap) { > > > + p = &(*p)->rb_right; > > > + } else { > > > + if (atomic_inc_return(&exist->ref) == 1) > > > + list_del_init(&exist->lru); > > > + break; > > > + } > > > + exist = NULL; > > > + } > > > + spin_unlock(&mdsc->snapid_map_lock); > > > + if (exist) { > > > + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); > > > + return exist; > > > + } > > > + > > > + sm = kmalloc(sizeof(*sm), GFP_NOFS); > > > + if (!sm) > > > + return NULL; > > > + > > > + ret = get_anon_bdev(&sm->dev); > > > + if 
(ret < 0) { > > > + kfree(sm); > > > + return NULL; > > > + } > > > + > > > + INIT_LIST_HEAD(&sm->lru); > > > + atomic_set(&sm->ref, 1); > > > + sm->snap = snap; > > > + > > > + exist = NULL; > > > + parent = NULL; > > > + p = &mdsc->snapid_map_tree.rb_node; > > > + spin_lock(&mdsc->snapid_map_lock); > > > + while (*p) { > > > + parent = *p; > > > + exist = rb_entry(*p, struct ceph_snapid_map, node); > > > + if (snap > exist->snap) > > > + p = &(*p)->rb_left; > > > + else if (snap < exist->snap) > > > + p = &(*p)->rb_right; > > > + else > > > + break; > > > + exist = NULL; > > > + } > > > + if (exist) { > > > + if (atomic_inc_return(&exist->ref) == 1) > > > + list_del_init(&exist->lru); > > > + } else { > > > + rb_link_node(&sm->node, parent, p); > > > + rb_insert_color(&sm->node, &mdsc->snapid_map_tree); > > > + } > > > + spin_unlock(&mdsc->snapid_map_lock); > > > + if (exist) { > > > + free_anon_bdev(sm->dev); > > > + kfree(sm); > > > + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); > > > + return exist; > > > + } > > > + > > > + dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); > > > + return sm; > > > +} > > > + > > > +void ceph_put_snapid_map(struct ceph_mds_client* mdsc, > > > + struct ceph_snapid_map *sm) > > > +{ > > > + if (!sm) > > > + return; > > > + if (!atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { > > > + if (!RB_EMPTY_NODE(&sm->node)) { > > > + sm->last_used = jiffies; > > > + list_add_tail(&sm->lru, &mdsc->snapid_map_lru); > > > + spin_unlock(&mdsc->snapid_map_lock); > > > + } else { > > > + /* already cleaned up by > > > + * ceph_cleanup_snapid_map() */ > > > + spin_unlock(&mdsc->snapid_map_lock); > > > + kfree(sm); > > > + } > > > + } > > > +} > > > + > > > +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) > > > +{ > > > + struct ceph_snapid_map *sm; > > > + unsigned long now; > > > + LIST_HEAD(to_free); > > > + > > > + /* unused map expires after 5 minutes */ > > > + static unsigned timeout = 5 * 60 * HZ; > > > + > > > > No need for static constant here. This could just be something like: > > > > #define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ) > > > > ...particularly if you don't care about being able to set this at > > runtime. 
> > > > > + spin_lock(&mdsc->snapid_map_lock); > > > + now = jiffies; > > > + > > > + while (!list_empty(&mdsc->snapid_map_lru)) { > > > + sm = list_first_entry(&mdsc->snapid_map_lru, > > > + struct ceph_snapid_map, lru); > > > + if (time_after(sm->last_used + timeout, now)) > > > + break; > > > + > > > + rb_erase(&sm->node, &mdsc->snapid_map_tree); > > > + list_move(&sm->lru, &to_free); > > > + } > > > + spin_unlock(&mdsc->snapid_map_lock); > > > + > > > + while (!list_empty(&to_free)) { > > > + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); > > > + list_del(&sm->lru); > > > + dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); > > > + free_anon_bdev(sm->dev); > > > + kfree(sm); > > > + } > > > +} > > > + > > > +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) > > > +{ > > > + struct ceph_snapid_map *sm; > > > + struct rb_node *p; > > > + LIST_HEAD(to_free); > > > + > > > + spin_lock(&mdsc->snapid_map_lock); > > > + while ((p = rb_first(&mdsc->snapid_map_tree))) { > > > + sm = rb_entry(p, struct ceph_snapid_map, node); > > > + rb_erase(p, &mdsc->snapid_map_tree); > > > + RB_CLEAR_NODE(p); > > > + list_move(&sm->lru, &to_free); > > > + } > > > + spin_unlock(&mdsc->snapid_map_lock); > > > + > > > + while (!list_empty(&to_free)) { > > > + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); > > > + list_del(&sm->lru); > > > + free_anon_bdev(sm->dev); > > > + if (WARN_ON_ONCE(atomic_read(&sm->ref))) { > > > + pr_err("snapid map %llx -> %x still in use\n", > > > + sm->snap, sm->dev); > > > + } > > > + } > > > +} > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > > > index 79a265ba9200..7a74b5652a0c 100644 > > > --- a/fs/ceph/super.h > > > +++ b/fs/ceph/super.h > > > @@ -370,7 +370,10 @@ struct ceph_inode_info { > > > struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ > > > spinlock_t i_unsafe_lock; > > > > > > - struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ > > > + union { > > > + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ > > > + struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ > > > + }; > > > int i_snap_realm_counter; /* snap realm (if caps) */ > > > struct list_head i_snap_realm_item; > > > struct list_head i_snap_flush_item; > > > @@ -837,6 +840,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, > > > struct ceph_cap_snap *capsnap); > > > extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); > > > > > > +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, > > > + u64 snap); > > > +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, > > > + struct ceph_snapid_map *sm); > > > +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); > > > +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); > > > + > > > + > > > /* > > > * a cap_snap is "pending" if it is still awaiting an in-progress > > > * sync write (that may/may not still update size, mtime, etc.). > > > > I think this looks reasonable overall. It's definitely less confusing > > for userland than showing two different dentries as having the same > > ino/dev tuple without them being hardlinked (particularly if they are > > directories). > > This can result in the same snapshot dentry having different dev_t > values over time, right? Do we care about that? > > Zheng, please include the link to the tracker ticket. #22353? > > Thanks, > I don't think it really matters. FWIW, the behavior is somewhat similar on NFS. 
On an NFS mount, you get a dynamically-allocated device number as well, so if your mount gets unmounted (e.g. automounted and shrunk due to lack of activity) then you can easily get a different dev_t on a subsequent stat() call.
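To see the value being discussed from userspace, a small stat()-based check like the one below (the mount path is hypothetical) prints the device number of a snapshot file; run twice, more than five idle minutes apart, it may well print two different dev_t values, much like the NFS case above:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int main(int argc, char **argv)
{
	struct stat st;
	const char *path = argc > 1 ? argv[1] : "/mnt/cephfs/.snap/snap1/file";

	if (stat(path, &st) != 0) {
		perror("stat");
		return 1;
	}
	/* with this patch, st_dev for a snapshot inode is an anonymous bdev ID */
	printf("%s: dev %u:%u ino %llu\n", path,
	       major(st.st_dev), minor(st.st_dev),
	       (unsigned long long)st.st_ino);
	return 0;
}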
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index c080b133c08b..7f82ceff510a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode) */ if (ci->i_snap_realm) { struct ceph_mds_client *mdsc = - ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; - struct ceph_snap_realm *realm = ci->i_snap_realm; - - dout(" dropping residual ref to snap realm %p\n", realm); - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm = NULL; - if (realm->ino == ci->i_vino.ino) - realm->inode = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); + ceph_inode_to_client(inode)->mdsc; + if (ceph_snap(inode) == CEPH_NOSNAP) { + struct ceph_snap_realm *realm = ci->i_snap_realm; + dout(" dropping residual ref to snap realm %p\n", + realm); + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm = NULL; + if (realm->ino == ci->i_vino.ino) + realm->inode = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(mdsc, realm); + } else { + ceph_put_snapid_map(mdsc, ci->i_snapid_map); + ci->i_snap_realm = NULL; + } } kfree(ci->i_symlink); @@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, iinfo->pool_ns_len); + if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map) + ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode)); + spin_lock(&ci->i_ceph_lock); /* @@ -2260,10 +2268,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat, if (!err) { generic_fillattr(inode, stat); stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); - if (ceph_snap(inode) != CEPH_NOSNAP) - stat->dev = ceph_snap(inode); + if (ceph_snap(inode) == CEPH_NOSNAP) + stat->dev = inode->i_sb->s_dev; else - stat->dev = 0; + stat->dev = ci->i_snapid_map ? 
ci->i_snapid_map->dev : 0; + if (S_ISDIR(inode->i_mode)) { if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), RBYTES)) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 32f627517fd0..1ff490c2d38e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3789,6 +3789,8 @@ static void delayed_work(struct work_struct *work) dout("mdsc delayed_work\n"); ceph_check_delayed_caps(mdsc); + ceph_trim_snapid_map(mdsc); + mutex_lock(&mdsc->mutex); renew_interval = mdsc->mdsmap->m_session_timeout >> 2; renew_caps = time_after_eq(jiffies, HZ*renew_interval + @@ -3891,6 +3893,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) ceph_caps_init(mdsc); ceph_adjust_min_caps(mdsc, fsc->min_caps); + spin_lock_init(&mdsc->snapid_map_lock); + mdsc->snapid_map_tree = RB_ROOT; + INIT_LIST_HEAD(&mdsc->snapid_map_lru); + init_rwsem(&mdsc->pool_perm_rwsem); mdsc->pool_perm_tree = RB_ROOT; @@ -4084,6 +4090,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) WARN_ON(!list_empty(&mdsc->cap_delay_list)); mutex_unlock(&mdsc->mutex); + ceph_cleanup_snapid_map(mdsc); + ceph_cleanup_empty_realms(mdsc); cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 4f962642fee4..d3a5c4046316 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -313,6 +313,15 @@ struct ceph_pool_perm { char pool_ns[]; }; +struct ceph_snapid_map { + struct rb_node node; + struct list_head lru; + atomic_t ref; + u64 snap; + dev_t dev; + unsigned long last_used; +}; + /* * mds client state */ @@ -390,6 +399,10 @@ struct ceph_mds_client { struct list_head dentry_lru; int num_dentry; + spinlock_t snapid_map_lock; + struct rb_root snapid_map_tree; + struct list_head snapid_map_lru; + struct rw_semaphore pool_perm_rwsem; struct rb_root pool_perm_tree; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index d7359628fd1f..c3fd66d6e03a 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -988,3 +988,157 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, up_write(&mdsc->snap_rwsem); return; } + +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap) +{ + struct ceph_snapid_map *sm, *exist; + struct rb_node **p, *parent; + int ret; + + exist = NULL; + spin_lock(&mdsc->snapid_map_lock); + p = &mdsc->snapid_map_tree.rb_node; + while (*p) { + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) { + p = &(*p)->rb_left; + } else if (snap < exist->snap) { + p = &(*p)->rb_right; + } else { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + break; + } + exist = NULL; + } + spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + sm = kmalloc(sizeof(*sm), GFP_NOFS); + if (!sm) + return NULL; + + ret = get_anon_bdev(&sm->dev); + if (ret < 0) { + kfree(sm); + return NULL; + } + + INIT_LIST_HEAD(&sm->lru); + atomic_set(&sm->ref, 1); + sm->snap = snap; + + exist = NULL; + parent = NULL; + p = &mdsc->snapid_map_tree.rb_node; + spin_lock(&mdsc->snapid_map_lock); + while (*p) { + parent = *p; + exist = rb_entry(*p, struct ceph_snapid_map, node); + if (snap > exist->snap) + p = &(*p)->rb_left; + else if (snap < exist->snap) + p = &(*p)->rb_right; + else + break; + exist = NULL; + } + if (exist) { + if (atomic_inc_return(&exist->ref) == 1) + list_del_init(&exist->lru); + } else { + rb_link_node(&sm->node, parent, p); + rb_insert_color(&sm->node, &mdsc->snapid_map_tree); + } + 
spin_unlock(&mdsc->snapid_map_lock); + if (exist) { + free_anon_bdev(sm->dev); + kfree(sm); + dout("found snapid map %llx -> %x\n", exist->snap, exist->dev); + return exist; + } + + dout("create snapid map %llx -> %x\n", sm->snap, sm->dev); + return sm; +} + +void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm) +{ + if (!sm) + return; + if (!atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) { + if (!RB_EMPTY_NODE(&sm->node)) { + sm->last_used = jiffies; + list_add_tail(&sm->lru, &mdsc->snapid_map_lru); + spin_unlock(&mdsc->snapid_map_lock); + } else { + /* already cleaned up by + * ceph_cleanup_snapid_map() */ + spin_unlock(&mdsc->snapid_map_lock); + kfree(sm); + } + } +} + +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + unsigned long now; + LIST_HEAD(to_free); + + /* unused map expires after 5 minutes */ + static unsigned timeout = 5 * 60 * HZ; + + spin_lock(&mdsc->snapid_map_lock); + now = jiffies; + + while (!list_empty(&mdsc->snapid_map_lru)) { + sm = list_first_entry(&mdsc->snapid_map_lru, + struct ceph_snapid_map, lru); + if (time_after(sm->last_used + timeout, now)) + break; + + rb_erase(&sm->node, &mdsc->snapid_map_tree); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev); + free_anon_bdev(sm->dev); + kfree(sm); + } +} + +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc) +{ + struct ceph_snapid_map *sm; + struct rb_node *p; + LIST_HEAD(to_free); + + spin_lock(&mdsc->snapid_map_lock); + while ((p = rb_first(&mdsc->snapid_map_tree))) { + sm = rb_entry(p, struct ceph_snapid_map, node); + rb_erase(p, &mdsc->snapid_map_tree); + RB_CLEAR_NODE(p); + list_move(&sm->lru, &to_free); + } + spin_unlock(&mdsc->snapid_map_lock); + + while (!list_empty(&to_free)) { + sm = list_first_entry(&to_free, struct ceph_snapid_map, lru); + list_del(&sm->lru); + free_anon_bdev(sm->dev); + if (WARN_ON_ONCE(atomic_read(&sm->ref))) { + pr_err("snapid map %llx -> %x still in use\n", + sm->snap, sm->dev); + } + } +} diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 79a265ba9200..7a74b5652a0c 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -370,7 +370,10 @@ struct ceph_inode_info { struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ spinlock_t i_unsafe_lock; - struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + union { + struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ + struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */ + }; int i_snap_realm_counter; /* snap realm (if caps) */ struct list_head i_snap_realm_item; struct list_head i_snap_flush_item; @@ -837,6 +840,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc, + u64 snap); +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, + struct ceph_snapid_map *sm); +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); + + /* * a cap_snap is "pending" if it is still awaiting an in-progress * sync write (that may/may not still update size, mtime, etc.).
ceph_getattr() returns a zero dev ID for head inodes and sets the dev ID directly to the snapid for snapshot inodes. This is not good because userspace utilities may consider a device ID of 0 invalid, and a snapid may conflict with another device's ID.

This patch introduces a "snapid to anonymous bdev ID" map. We create a new mapping when we see a snapid for the first time, and trim a mapping after it has been idle for 5 minutes.

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/inode.c      |  37 +++++++----
 fs/ceph/mds_client.c |   8 +++
 fs/ceph/mds_client.h |  13 ++++
 fs/ceph/snap.c       | 154 +++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/super.h      |  13 +++-
 5 files changed, 210 insertions(+), 15 deletions(-)

Changes since v1:
 change the code that handles leaked snap map on umount
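The map is built on the kernel's anonymous block device allocator. A minimal sketch of that building block, with illustrative helper names that are not part of the patch:

#include <linux/fs.h>	/* get_anon_bdev(), free_anon_bdev() */

/* Allocate an unused anonymous dev_t to publish as st_dev for a snapid. */
static int snapid_map_example_alloc(dev_t *dev)
{
	int ret = get_anon_bdev(dev);

	if (ret < 0)
		return ret;	/* allocator exhausted or out of memory */
	return 0;
}

/* Return the ID when the mapping is trimmed so it can be reused. */
static void snapid_map_example_release(dev_t dev)
{
	free_anon_bdev(dev);
}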