[06/47] blk-mq: add a flags parameter to blk_mq_alloc_request

Message ID 1448037342-18384-7-git-send-email-hch@lst.de (mailing list archive)
State New, archived

Commit Message

Christoph Hellwig Nov. 20, 2015, 4:35 p.m. UTC
We already have the reserved flag, and a nowait flag awkwardly encoded as
a gfp_t.  Add a real flags argument to make the scheme more extensible and
allow for a nicer calling convention.
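
A minimal sketch of the change in calling convention, drawn from the caller
conversions in the diff below (not exhaustive):

	/* before: blocking behaviour and reserved tags encoded as gfp_t + bool */
	rq = blk_mq_alloc_request(q, rw, GFP_KERNEL, false);
	rq = blk_mq_alloc_request(q, rw, GFP_ATOMIC, false);
	rq = blk_mq_alloc_request(q, rw, GFP_ATOMIC, true);

	/* after: an explicit, extensible flags word */
	rq = blk_mq_alloc_request(q, rw, 0);
	rq = blk_mq_alloc_request(q, rw, BLK_MQ_REQ_NOWAIT);
	rq = blk_mq_alloc_request(q, rw, BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);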

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-core.c                  |   11 +-
 block/blk-mq-tag.c                |   11 +-
 block/blk-mq.c                    |   20 +-
 block/blk-mq.h                    |   11 +-
 block/blk.h                       |    2 +-
 drivers/block/mtip32xx/mtip32xx.c |    2 +-
 drivers/nvme/host/core.c          | 1172 +++++++++++++++++++++++++++++++++++++
 drivers/nvme/host/pci.c           |   11 +-
 include/linux/blk-mq.h            |    8 +-
 9 files changed, 1210 insertions(+), 38 deletions(-)
 create mode 100644 drivers/nvme/host/core.c

Comments

Jeff Moyer Nov. 24, 2015, 3:19 p.m. UTC | #1
Christoph Hellwig <hch@lst.de> writes:

> We already have the reserved flag, and a nowait flag awkwardly encoded as
> a gfp_t.  Add a real flags argument to make the scheme more extensible and
> allow for a nicer calling convention.
>
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  block/blk-core.c                  |   11 +-
>  block/blk-mq-tag.c                |   11 +-
>  block/blk-mq.c                    |   20 +-
>  block/blk-mq.h                    |   11 +-
>  block/blk.h                       |    2 +-
>  drivers/block/mtip32xx/mtip32xx.c |    2 +-
>  drivers/nvme/host/core.c          | 1172 +++++++++++++++++++++++++++++++++++++

Christoph, I think you included a bit too much in this patch!  ;-)

-Jeff

>  drivers/nvme/host/pci.c           |   11 +-
>  include/linux/blk-mq.h            |    8 +-
>  9 files changed, 1210 insertions(+), 38 deletions(-)
>  create mode 100644 drivers/nvme/host/core.c
>
> diff --git a/block/blk-core.c b/block/blk-core.c
> index af9c315..d2100aa 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -630,7 +630,7 @@ struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
>  }
>  EXPORT_SYMBOL(blk_alloc_queue);
>  
> -int blk_queue_enter(struct request_queue *q, gfp_t gfp)
> +int blk_queue_enter(struct request_queue *q, bool nowait)
>  {
>  	while (true) {
>  		int ret;
> @@ -638,7 +638,7 @@ int blk_queue_enter(struct request_queue *q, gfp_t gfp)
>  		if (percpu_ref_tryget_live(&q->q_usage_counter))
>  			return 0;
>  
> -		if (!gfpflags_allow_blocking(gfp))
> +		if (nowait)
>  			return -EBUSY;
>  
>  		ret = wait_event_interruptible(q->mq_freeze_wq,
> @@ -1284,7 +1284,9 @@ static struct request *blk_old_get_request(struct request_queue *q, int rw,
>  struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
>  {
>  	if (q->mq_ops)
> -		return blk_mq_alloc_request(q, rw, gfp_mask, false);
> +		return blk_mq_alloc_request(q, rw,
> +			(gfp_mask & __GFP_DIRECT_RECLAIM) ?
> +				0 : BLK_MQ_REQ_NOWAIT);
>  	else
>  		return blk_old_get_request(q, rw, gfp_mask);
>  }
> @@ -2052,8 +2054,7 @@ blk_qc_t generic_make_request(struct bio *bio)
>  	do {
>  		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
>  
> -		if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {
> -
> +		if (likely(blk_queue_enter(q, false) == 0)) {
>  			ret = q->make_request_fn(q, bio);
>  
>  			blk_queue_exit(q);
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index a07ca34..abdbb47 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -268,7 +268,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
>  	if (tag != -1)
>  		return tag;
>  
> -	if (!gfpflags_allow_blocking(data->gfp))
> +	if (data->flags & BLK_MQ_REQ_NOWAIT)
>  		return -1;
>  
>  	bs = bt_wait_ptr(bt, hctx);
> @@ -303,7 +303,7 @@ static int bt_get(struct blk_mq_alloc_data *data,
>  		data->ctx = blk_mq_get_ctx(data->q);
>  		data->hctx = data->q->mq_ops->map_queue(data->q,
>  				data->ctx->cpu);
> -		if (data->reserved) {
> +		if (data->flags & BLK_MQ_REQ_RESERVED) {
>  			bt = &data->hctx->tags->breserved_tags;
>  		} else {
>  			last_tag = &data->ctx->last_tag;
> @@ -349,10 +349,9 @@ static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
>  
>  unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
>  {
> -	if (!data->reserved)
> -		return __blk_mq_get_tag(data);
> -
> -	return __blk_mq_get_reserved_tag(data);
> +	if (data->flags & BLK_MQ_REQ_RESERVED)
> +		return __blk_mq_get_reserved_tag(data);
> +	return __blk_mq_get_tag(data);
>  }
>  
>  static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index c932605..6da03f1 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -230,8 +230,8 @@ __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
>  	return NULL;
>  }
>  
> -struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
> -		bool reserved)
> +struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
> +		unsigned int flags)
>  {
>  	struct blk_mq_ctx *ctx;
>  	struct blk_mq_hw_ctx *hctx;
> @@ -239,24 +239,22 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
>  	struct blk_mq_alloc_data alloc_data;
>  	int ret;
>  
> -	ret = blk_queue_enter(q, gfp);
> +	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
>  	if (ret)
>  		return ERR_PTR(ret);
>  
>  	ctx = blk_mq_get_ctx(q);
>  	hctx = q->mq_ops->map_queue(q, ctx->cpu);
> -	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
> -			reserved, ctx, hctx);
> +	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
>  
>  	rq = __blk_mq_alloc_request(&alloc_data, rw);
> -	if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
> +	if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) {
>  		__blk_mq_run_hw_queue(hctx);
>  		blk_mq_put_ctx(ctx);
>  
>  		ctx = blk_mq_get_ctx(q);
>  		hctx = q->mq_ops->map_queue(q, ctx->cpu);
> -		blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
> -				hctx);
> +		blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
>  		rq =  __blk_mq_alloc_request(&alloc_data, rw);
>  		ctx = alloc_data.ctx;
>  	}
> @@ -1181,8 +1179,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
>  		rw |= REQ_SYNC;
>  
>  	trace_block_getrq(q, bio, rw);
> -	blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
> -			hctx);
> +	blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx);
>  	rq = __blk_mq_alloc_request(&alloc_data, rw);
>  	if (unlikely(!rq)) {
>  		__blk_mq_run_hw_queue(hctx);
> @@ -1191,8 +1188,7 @@ static struct request *blk_mq_map_request(struct request_queue *q,
>  
>  		ctx = blk_mq_get_ctx(q);
>  		hctx = q->mq_ops->map_queue(q, ctx->cpu);
> -		blk_mq_set_alloc_data(&alloc_data, q,
> -				__GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
> +		blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
>  		rq = __blk_mq_alloc_request(&alloc_data, rw);
>  		ctx = alloc_data.ctx;
>  		hctx = alloc_data.hctx;
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 713820b..eaede8e 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -96,8 +96,7 @@ static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
>  struct blk_mq_alloc_data {
>  	/* input parameter */
>  	struct request_queue *q;
> -	gfp_t gfp;
> -	bool reserved;
> +	unsigned int flags;
>  
>  	/* input & output parameter */
>  	struct blk_mq_ctx *ctx;
> @@ -105,13 +104,11 @@ struct blk_mq_alloc_data {
>  };
>  
>  static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
> -		struct request_queue *q, gfp_t gfp, bool reserved,
> -		struct blk_mq_ctx *ctx,
> -		struct blk_mq_hw_ctx *hctx)
> +		struct request_queue *q, unsigned int flags,
> +		struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
>  {
>  	data->q = q;
> -	data->gfp = gfp;
> -	data->reserved = reserved;
> +	data->flags = flags;
>  	data->ctx = ctx;
>  	data->hctx = hctx;
>  }
> diff --git a/block/blk.h b/block/blk.h
> index 1d95107..38bf997 100644
> --- a/block/blk.h
> +++ b/block/blk.h
> @@ -72,7 +72,7 @@ void blk_dequeue_request(struct request *rq);
>  void __blk_queue_free_tags(struct request_queue *q);
>  bool __blk_end_bidi_request(struct request *rq, int error,
>  			    unsigned int nr_bytes, unsigned int bidi_bytes);
> -int blk_queue_enter(struct request_queue *q, gfp_t gfp);
> +int blk_queue_enter(struct request_queue *q, bool nowait);
>  void blk_queue_exit(struct request_queue *q);
>  void blk_freeze_queue(struct request_queue *q);
>  
> diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
> index a28a562..cf3b51a 100644
> --- a/drivers/block/mtip32xx/mtip32xx.c
> +++ b/drivers/block/mtip32xx/mtip32xx.c
> @@ -173,7 +173,7 @@ static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
>  {
>  	struct request *rq;
>  
> -	rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true);
> +	rq = blk_mq_alloc_request(dd->queue, 0, BLK_MQ_REQ_RESERVED);
>  	return blk_mq_rq_to_pdu(rq);
>  }
>  
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> new file mode 100644
> index 0000000..53cf507
> --- /dev/null
> +++ b/drivers/nvme/host/core.c
> @@ -0,0 +1,1172 @@
> +/*
> + * NVM Express device driver
> + * Copyright (c) 2011-2014, Intel Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> + * more details.
> + */
> +
> +#include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> +#include <linux/errno.h>
> +#include <linux/hdreg.h>
> +#include <linux/kernel.h>
> +#include <linux/module.h>
> +#include <linux/list_sort.h>
> +#include <linux/slab.h>
> +#include <linux/types.h>
> +#include <linux/pr.h>
> +#include <linux/ptrace.h>
> +#include <linux/nvme_ioctl.h>
> +#include <linux/t10-pi.h>
> +#include <scsi/sg.h>
> +#include <asm/unaligned.h>
> +
> +#include "nvme.h"
> +
> +#define NVME_MINORS		(1U << MINORBITS)
> +
> +static int nvme_major;
> +module_param(nvme_major, int, 0);
> +
> +static int nvme_char_major;
> +module_param(nvme_char_major, int, 0);
> +
> +static LIST_HEAD(nvme_ctrl_list);
> +DEFINE_SPINLOCK(dev_list_lock);
> +
> +static struct class *nvme_class;
> +
> +static void nvme_free_ns(struct kref *kref)
> +{
> +	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
> +
> +	if (ns->type == NVME_NS_LIGHTNVM)
> +		nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
> +
> +	spin_lock(&dev_list_lock);
> +	ns->disk->private_data = NULL;
> +	spin_unlock(&dev_list_lock);
> +
> +	nvme_put_ctrl(ns->ctrl);
> +	put_disk(ns->disk);
> +	kfree(ns);
> +}
> +
> +static void nvme_put_ns(struct nvme_ns *ns)
> +{
> +	kref_put(&ns->kref, nvme_free_ns);
> +}
> +
> +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
> +{
> +	struct nvme_ns *ns;
> +
> +	spin_lock(&dev_list_lock);
> +	ns = disk->private_data;
> +	if (ns && !kref_get_unless_zero(&ns->kref))
> +		ns = NULL;
> +	spin_unlock(&dev_list_lock);
> +
> +	return ns;
> +}
> +
> +static struct request *nvme_alloc_request(struct request_queue *q,
> +		struct nvme_command *cmd)
> +{
> +	bool write = cmd->common.opcode & 1;
> +	struct request *req;
> +
> +	req = blk_mq_alloc_request(q, write, 0);
> +	if (IS_ERR(req))
> +		return req;
> +
> +	req->cmd_type = REQ_TYPE_DRV_PRIV;
> +	req->cmd_flags |= REQ_FAILFAST_DRIVER;
> +	req->__data_len = 0;
> +	req->__sector = (sector_t) -1;
> +	req->bio = req->biotail = NULL;
> +
> +	req->cmd = (unsigned char *)cmd;
> +	req->cmd_len = sizeof(struct nvme_command);
> +	req->special = (void *)0;
> +
> +	return req;
> +}
> +
> +/*
> + * Returns 0 on success.  If the result is negative, it's a Linux error code;
> + * if the result is positive, it's an NVM Express status code
> + */
> +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
> +		void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
> +{
> +	struct request *req;
> +	int ret;
> +
> +	req = nvme_alloc_request(q, cmd);
> +	if (IS_ERR(req))
> +		return PTR_ERR(req);
> +
> +	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
> +
> +	if (buffer && bufflen) {
> +		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
> +		if (ret)
> +			goto out;
> +	}
> +
> +	blk_execute_rq(req->q, NULL, req, 0);
> +	if (result)
> +		*result = (u32)(uintptr_t)req->special;
> +	ret = req->errors;
> + out:
> +	blk_mq_free_request(req);
> +	return ret;
> +}
> +
> +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
> +		void *buffer, unsigned bufflen)
> +{
> +	return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
> +}
> +
> +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
> +		void __user *ubuffer, unsigned bufflen,
> +		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
> +		u32 *result, unsigned timeout)
> +{
> +	bool write = cmd->common.opcode & 1;
> +	struct nvme_ns *ns = q->queuedata;
> +	struct gendisk *disk = ns ? ns->disk : NULL;
> +	struct request *req;
> +	struct bio *bio = NULL;
> +	void *meta = NULL;
> +	int ret;
> +
> +	req = nvme_alloc_request(q, cmd);
> +	if (IS_ERR(req))
> +		return PTR_ERR(req);
> +
> +	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
> +
> +	if (ubuffer && bufflen) {
> +		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
> +				GFP_KERNEL);
> +		if (ret)
> +			goto out;
> +		bio = req->bio;
> +
> +		if (!disk)
> +			goto submit;
> +		bio->bi_bdev = bdget_disk(disk, 0);
> +		if (!bio->bi_bdev) {
> +			ret = -ENODEV;
> +			goto out_unmap;
> +		}
> +
> +		if (meta_buffer) {
> +			struct bio_integrity_payload *bip;
> +
> +			meta = kmalloc(meta_len, GFP_KERNEL);
> +			if (!meta) {
> +				ret = -ENOMEM;
> +				goto out_unmap;
> +			}
> +
> +			if (write) {
> +				if (copy_from_user(meta, meta_buffer,
> +						meta_len)) {
> +					ret = -EFAULT;
> +					goto out_free_meta;
> +				}
> +			}
> +
> +			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
> +			if (!bip) {
> +				ret = -ENOMEM;
> +				goto out_free_meta;
> +			}
> +
> +			bip->bip_iter.bi_size = meta_len;
> +			bip->bip_iter.bi_sector = meta_seed;
> +
> +			ret = bio_integrity_add_page(bio, virt_to_page(meta),
> +					meta_len, offset_in_page(meta));
> +			if (ret != meta_len) {
> +				ret = -ENOMEM;
> +				goto out_free_meta;
> +			}
> +		}
> +	}
> + submit:
> +	blk_execute_rq(req->q, disk, req, 0);
> +	ret = req->errors;
> +	if (result)
> +		*result = (u32)(uintptr_t)req->special;
> +	if (meta && !ret && !write) {
> +		if (copy_to_user(meta_buffer, meta, meta_len))
> +			ret = -EFAULT;
> +	}
> + out_free_meta:
> +	kfree(meta);
> + out_unmap:
> +	if (bio) {
> +		if (disk && bio->bi_bdev)
> +			bdput(bio->bi_bdev);
> +		blk_rq_unmap_user(bio);
> +	}
> + out:
> +	blk_mq_free_request(req);
> +	return ret;
> +}
> +
> +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
> +		void __user *ubuffer, unsigned bufflen, u32 *result,
> +		unsigned timeout)
> +{
> +	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
> +			result, timeout);
> +}
> +
> +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
> +{
> +	struct nvme_command c = { };
> +	int error;
> +
> +	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
> +	c.identify.opcode = nvme_admin_identify;
> +	c.identify.cns = cpu_to_le32(1);
> +
> +	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
> +	if (!*id)
> +		return -ENOMEM;
> +
> +	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
> +			sizeof(struct nvme_id_ctrl));
> +	if (error)
> +		kfree(*id);
> +	return error;
> +}
> +
> +static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
> +{
> +	struct nvme_command c = { };
> +
> +	c.identify.opcode = nvme_admin_identify;
> +	c.identify.cns = cpu_to_le32(2);
> +	c.identify.nsid = cpu_to_le32(nsid);
> +	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
> +}
> +
> +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
> +		struct nvme_id_ns **id)
> +{
> +	struct nvme_command c = { };
> +	int error;
> +
> +	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
> +	c.identify.opcode = nvme_admin_identify,
> +	c.identify.nsid = cpu_to_le32(nsid),
> +
> +	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
> +	if (!*id)
> +		return -ENOMEM;
> +
> +	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
> +			sizeof(struct nvme_id_ns));
> +	if (error)
> +		kfree(*id);
> +	return error;
> +}
> +
> +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
> +					dma_addr_t dma_addr, u32 *result)
> +{
> +	struct nvme_command c;
> +
> +	memset(&c, 0, sizeof(c));
> +	c.features.opcode = nvme_admin_get_features;
> +	c.features.nsid = cpu_to_le32(nsid);
> +	c.features.prp1 = cpu_to_le64(dma_addr);
> +	c.features.fid = cpu_to_le32(fid);
> +
> +	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
> +}
> +
> +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
> +					dma_addr_t dma_addr, u32 *result)
> +{
> +	struct nvme_command c;
> +
> +	memset(&c, 0, sizeof(c));
> +	c.features.opcode = nvme_admin_set_features;
> +	c.features.prp1 = cpu_to_le64(dma_addr);
> +	c.features.fid = cpu_to_le32(fid);
> +	c.features.dword11 = cpu_to_le32(dword11);
> +
> +	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
> +}
> +
> +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
> +{
> +	struct nvme_command c = { };
> +	int error;
> +
> +	c.common.opcode = nvme_admin_get_log_page,
> +	c.common.nsid = cpu_to_le32(0xFFFFFFFF),
> +	c.common.cdw10[0] = cpu_to_le32(
> +			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
> +			 NVME_LOG_SMART),
> +
> +	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
> +	if (!*log)
> +		return -ENOMEM;
> +
> +	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
> +			sizeof(struct nvme_smart_log));
> +	if (error)
> +		kfree(*log);
> +	return error;
> +}
> +
> +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
> +{
> +	struct nvme_user_io io;
> +	struct nvme_command c;
> +	unsigned length, meta_len;
> +	void __user *metadata;
> +
> +	if (copy_from_user(&io, uio, sizeof(io)))
> +		return -EFAULT;
> +
> +	switch (io.opcode) {
> +	case nvme_cmd_write:
> +	case nvme_cmd_read:
> +	case nvme_cmd_compare:
> +		break;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	length = (io.nblocks + 1) << ns->lba_shift;
> +	meta_len = (io.nblocks + 1) * ns->ms;
> +	metadata = (void __user *)(uintptr_t)io.metadata;
> +
> +	if (ns->ext) {
> +		length += meta_len;
> +		meta_len = 0;
> +	} else if (meta_len) {
> +		if ((io.metadata & 3) || !io.metadata)
> +			return -EINVAL;
> +	}
> +
> +	memset(&c, 0, sizeof(c));
> +	c.rw.opcode = io.opcode;
> +	c.rw.flags = io.flags;
> +	c.rw.nsid = cpu_to_le32(ns->ns_id);
> +	c.rw.slba = cpu_to_le64(io.slba);
> +	c.rw.length = cpu_to_le16(io.nblocks);
> +	c.rw.control = cpu_to_le16(io.control);
> +	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
> +	c.rw.reftag = cpu_to_le32(io.reftag);
> +	c.rw.apptag = cpu_to_le16(io.apptag);
> +	c.rw.appmask = cpu_to_le16(io.appmask);
> +
> +	return __nvme_submit_user_cmd(ns->queue, &c,
> +			(void __user *)(uintptr_t)io.addr, length,
> +			metadata, meta_len, io.slba, NULL, 0);
> +}
> +
> +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
> +			struct nvme_passthru_cmd __user *ucmd)
> +{
> +	struct nvme_passthru_cmd cmd;
> +	struct nvme_command c;
> +	unsigned timeout = 0;
> +	int status;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EACCES;
> +	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
> +		return -EFAULT;
> +
> +	memset(&c, 0, sizeof(c));
> +	c.common.opcode = cmd.opcode;
> +	c.common.flags = cmd.flags;
> +	c.common.nsid = cpu_to_le32(cmd.nsid);
> +	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
> +	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
> +	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
> +	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
> +	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
> +	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
> +	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
> +	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
> +
> +	if (cmd.timeout_ms)
> +		timeout = msecs_to_jiffies(cmd.timeout_ms);
> +
> +	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
> +			(void __user *)cmd.addr, cmd.data_len,
> +			&cmd.result, timeout);
> +	if (status >= 0) {
> +		if (put_user(cmd.result, &ucmd->result))
> +			return -EFAULT;
> +	}
> +
> +	return status;
> +}
> +
> +static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
> +		unsigned int cmd, unsigned long arg)
> +{
> +	struct nvme_ns *ns = bdev->bd_disk->private_data;
> +
> +	switch (cmd) {
> +	case NVME_IOCTL_ID:
> +		force_successful_syscall_return();
> +		return ns->ns_id;
> +	case NVME_IOCTL_ADMIN_CMD:
> +		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
> +	case NVME_IOCTL_IO_CMD:
> +		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
> +	case NVME_IOCTL_SUBMIT_IO:
> +		return nvme_submit_io(ns, (void __user *)arg);
> +	case SG_GET_VERSION_NUM:
> +		return nvme_sg_get_version_num((void __user *)arg);
> +	case SG_IO:
> +		return nvme_sg_io(ns, (void __user *)arg);
> +	default:
> +		return -ENOTTY;
> +	}
> +}
> +
> +#ifdef CONFIG_COMPAT
> +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
> +			unsigned int cmd, unsigned long arg)
> +{
> +	switch (cmd) {
> +	case SG_IO:
> +		return -ENOIOCTLCMD;
> +	}
> +	return nvme_ioctl(bdev, mode, cmd, arg);
> +}
> +#else
> +#define nvme_compat_ioctl	NULL
> +#endif
> +
> +static int nvme_open(struct block_device *bdev, fmode_t mode)
> +{
> +	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
> +}
> +
> +static void nvme_release(struct gendisk *disk, fmode_t mode)
> +{
> +	nvme_put_ns(disk->private_data);
> +}
> +
> +static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
> +{
> +	/* some standard values */
> +	geo->heads = 1 << 6;
> +	geo->sectors = 1 << 5;
> +	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
> +	return 0;
> +}
> +
> +#ifdef CONFIG_BLK_DEV_INTEGRITY
> +static void nvme_init_integrity(struct nvme_ns *ns)
> +{
> +	struct blk_integrity integrity;
> +
> +	switch (ns->pi_type) {
> +	case NVME_NS_DPS_PI_TYPE3:
> +		integrity.profile = &t10_pi_type3_crc;
> +		break;
> +	case NVME_NS_DPS_PI_TYPE1:
> +	case NVME_NS_DPS_PI_TYPE2:
> +		integrity.profile = &t10_pi_type1_crc;
> +		break;
> +	default:
> +		integrity.profile = NULL;
> +		break;
> +	}
> +	integrity.tuple_size = ns->ms;
> +	blk_integrity_register(ns->disk, &integrity);
> +	blk_queue_max_integrity_segments(ns->queue, 1);
> +}
> +#else
> +static void nvme_init_integrity(struct nvme_ns *ns)
> +{
> +}
> +#endif /* CONFIG_BLK_DEV_INTEGRITY */
> +
> +static void nvme_config_discard(struct nvme_ns *ns)
> +{
> +	u32 logical_block_size = queue_logical_block_size(ns->queue);
> +	ns->queue->limits.discard_zeroes_data = 0;
> +	ns->queue->limits.discard_alignment = logical_block_size;
> +	ns->queue->limits.discard_granularity = logical_block_size;
> +	blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
> +	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
> +}
> +
> +static int nvme_revalidate_disk(struct gendisk *disk)
> +{
> +	struct nvme_ns *ns = disk->private_data;
> +	struct nvme_id_ns *id;
> +	u8 lbaf, pi_type;
> +	u16 old_ms;
> +	unsigned short bs;
> +
> +	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
> +		dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
> +				__func__, ns->ctrl->instance, ns->ns_id);
> +		return -ENODEV;
> +	}
> +	if (id->ncap == 0) {
> +		kfree(id);
> +		return -ENODEV;
> +	}
> +
> +	if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
> +		if (nvme_nvm_register(ns->queue, disk->disk_name)) {
> +			dev_warn(ns->ctrl->dev,
> +				"%s: LightNVM init failure\n", __func__);
> +			kfree(id);
> +			return -ENODEV;
> +		}
> +		ns->type = NVME_NS_LIGHTNVM;
> +	}
> +
> +	old_ms = ns->ms;
> +	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
> +	ns->lba_shift = id->lbaf[lbaf].ds;
> +	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
> +	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
> +
> +	/*
> +	 * If identify namespace failed, use default 512 byte block size so
> +	 * block layer can use before failing read/write for 0 capacity.
> +	 */
> +	if (ns->lba_shift == 0)
> +		ns->lba_shift = 9;
> +	bs = 1 << ns->lba_shift;
> +	/* XXX: PI implementation requires metadata equal t10 pi tuple size */
> +	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
> +					id->dps & NVME_NS_DPS_PI_MASK : 0;
> +
> +	blk_mq_freeze_queue(disk->queue);
> +	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
> +				ns->ms != old_ms ||
> +				bs != queue_logical_block_size(disk->queue) ||
> +				(ns->ms && ns->ext)))
> +		blk_integrity_unregister(disk);
> +
> +	ns->pi_type = pi_type;
> +	blk_queue_logical_block_size(ns->queue, bs);
> +
> +	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
> +		nvme_init_integrity(ns);
> +	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
> +		set_capacity(disk, 0);
> +	else
> +		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
> +
> +	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
> +		nvme_config_discard(ns);
> +	blk_mq_unfreeze_queue(disk->queue);
> +
> +	kfree(id);
> +	return 0;
> +}
> +
> +static char nvme_pr_type(enum pr_type type)
> +{
> +	switch (type) {
> +	case PR_WRITE_EXCLUSIVE:
> +		return 1;
> +	case PR_EXCLUSIVE_ACCESS:
> +		return 2;
> +	case PR_WRITE_EXCLUSIVE_REG_ONLY:
> +		return 3;
> +	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
> +		return 4;
> +	case PR_WRITE_EXCLUSIVE_ALL_REGS:
> +		return 5;
> +	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
> +		return 6;
> +	default:
> +		return 0;
> +	}
> +};
> +
> +static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
> +				u64 key, u64 sa_key, u8 op)
> +{
> +	struct nvme_ns *ns = bdev->bd_disk->private_data;
> +	struct nvme_command c;
> +	u8 data[16] = { 0, };
> +
> +	put_unaligned_le64(key, &data[0]);
> +	put_unaligned_le64(sa_key, &data[8]);
> +
> +	memset(&c, 0, sizeof(c));
> +	c.common.opcode = op;
> +	c.common.nsid = cpu_to_le32(ns->ns_id);
> +	c.common.cdw10[0] = cpu_to_le32(cdw10);
> +
> +	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
> +}
> +
> +static int nvme_pr_register(struct block_device *bdev, u64 old,
> +		u64 new, unsigned flags)
> +{
> +	u32 cdw10;
> +
> +	if (flags & ~PR_FL_IGNORE_KEY)
> +		return -EOPNOTSUPP;
> +
> +	cdw10 = old ? 2 : 0;
> +	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
> +	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
> +	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
> +}
> +
> +static int nvme_pr_reserve(struct block_device *bdev, u64 key,
> +		enum pr_type type, unsigned flags)
> +{
> +	u32 cdw10;
> +
> +	if (flags & ~PR_FL_IGNORE_KEY)
> +		return -EOPNOTSUPP;
> +
> +	cdw10 = nvme_pr_type(type) << 8;
> +	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
> +	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
> +}
> +
> +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
> +		enum pr_type type, bool abort)
> +{
> +	u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1;
> +	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
> +}
> +
> +static int nvme_pr_clear(struct block_device *bdev, u64 key)
> +{
> +	u32 cdw10 = 1 | key ? 1 << 3 : 0;
> +	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
> +}
> +
> +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
> +{
> +	u32 cdw10 = nvme_pr_type(type) << 8 | key ? 1 << 3 : 0;
> +	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
> +}
> +
> +static const struct pr_ops nvme_pr_ops = {
> +	.pr_register	= nvme_pr_register,
> +	.pr_reserve	= nvme_pr_reserve,
> +	.pr_release	= nvme_pr_release,
> +	.pr_preempt	= nvme_pr_preempt,
> +	.pr_clear	= nvme_pr_clear,
> +};
> +
> +static const struct block_device_operations nvme_fops = {
> +	.owner		= THIS_MODULE,
> +	.ioctl		= nvme_ioctl,
> +	.compat_ioctl	= nvme_compat_ioctl,
> +	.open		= nvme_open,
> +	.release	= nvme_release,
> +	.getgeo		= nvme_getgeo,
> +	.revalidate_disk= nvme_revalidate_disk,
> +	.pr_ops		= &nvme_pr_ops,
> +};
> +
> +/*
> + * Initialize the cached copies of the Identify data and various controller
> + * register in our nvme_ctrl structure.  This should be called as soon as
> + * the admin queue is fully up and running.
> + */
> +int nvme_init_identify(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_id_ctrl *id;
> +	u64 cap;
> +	int ret, page_shift;
> +
> +	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
> +	if (ret) {
> +		dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret);
> +		return ret;
> +	}
> +
> +	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
> +	if (ret) {
> +		dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
> +		return ret;
> +	}
> +	page_shift = NVME_CAP_MPSMIN(cap) + 12;
> +	ctrl->page_size = 1 << page_shift;
> +
> +	if (ctrl->vs >= NVME_VS(1, 1))
> +		ctrl->subsystem = NVME_CAP_NSSRC(cap);
> +
> +	ret = nvme_identify_ctrl(ctrl, &id);
> +	if (ret) {
> +		dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
> +		return -EIO;
> +	}
> +
> +	ctrl->oncs = le16_to_cpup(&id->oncs);
> +	atomic_set(&ctrl->abort_limit, id->acl + 1);
> +	ctrl->vwc = id->vwc;
> +	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
> +	memcpy(ctrl->model, id->mn, sizeof(id->mn));
> +	memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
> +	if (id->mdts)
> +		ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
> +	else
> +		ctrl->max_hw_sectors = UINT_MAX;
> +
> +	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
> +		unsigned int max_hw_sectors;
> +
> +		ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
> +		max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
> +		if (ctrl->max_hw_sectors) {
> +			ctrl->max_hw_sectors = min(max_hw_sectors,
> +							ctrl->max_hw_sectors);
> +		} else {
> +			ctrl->max_hw_sectors = max_hw_sectors;
> +		}
> +	}
> +
> +	kfree(id);
> +	return 0;
> +}
> +
> +static int nvme_dev_open(struct inode *inode, struct file *file)
> +{
> +	struct nvme_ctrl *ctrl;
> +	int instance = iminor(inode);
> +	int ret = -ENODEV;
> +
> +	spin_lock(&dev_list_lock);
> +	list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
> +		if (ctrl->instance != instance)
> +			continue;
> +
> +		if (!ctrl->admin_q) {
> +			ret = -EWOULDBLOCK;
> +			break;
> +		}
> +		if (!kref_get_unless_zero(&ctrl->kref))
> +			break;
> +		file->private_data = ctrl;
> +		ret = 0;
> +		break;
> +	}
> +	spin_unlock(&dev_list_lock);
> +
> +	return ret;
> +}
> +
> +static int nvme_dev_release(struct inode *inode, struct file *file)
> +{
> +	nvme_put_ctrl(file->private_data);
> +	return 0;
> +}
> +
> +static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
> +		unsigned long arg)
> +{
> +	struct nvme_ctrl *ctrl = file->private_data;
> +	void __user *argp = (void __user *)arg;
> +	struct nvme_ns *ns;
> +
> +	switch (cmd) {
> +	case NVME_IOCTL_ADMIN_CMD:
> +		return nvme_user_cmd(ctrl, NULL, argp);
> +	case NVME_IOCTL_IO_CMD:
> +		if (list_empty(&ctrl->namespaces))
> +			return -ENOTTY;
> +		ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
> +		return nvme_user_cmd(ctrl, ns, argp);
> +	case NVME_IOCTL_RESET:
> +		dev_warn(ctrl->dev, "resetting controller\n");
> +		return ctrl->ops->reset_ctrl(ctrl);
> +	case NVME_IOCTL_SUBSYS_RESET:
> +		return nvme_reset_subsystem(ctrl);
> +	default:
> +		return -ENOTTY;
> +	}
> +}
> +
> +static const struct file_operations nvme_dev_fops = {
> +	.owner		= THIS_MODULE,
> +	.open		= nvme_dev_open,
> +	.release	= nvme_dev_release,
> +	.unlocked_ioctl	= nvme_dev_ioctl,
> +	.compat_ioctl	= nvme_dev_ioctl,
> +};
> +
> +static ssize_t nvme_sysfs_reset(struct device *dev,
> +				struct device_attribute *attr, const char *buf,
> +				size_t count)
> +{
> +	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
> +	int ret;
> +
> +	ret = ctrl->ops->reset_ctrl(ctrl);
> +	if (ret < 0)
> +		return ret;
> +	return count;
> +}
> +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
> +
> +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
> +{
> +	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
> +	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
> +
> +	return nsa->ns_id - nsb->ns_id;
> +}
> +
> +static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> +{
> +	struct nvme_ns *ns;
> +
> +	list_for_each_entry(ns, &ctrl->namespaces, list) {
> +		if (ns->ns_id == nsid)
> +			return ns;
> +		if (ns->ns_id > nsid)
> +			break;
> +	}
> +	return NULL;
> +}
> +
> +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> +{
> +	struct nvme_ns *ns;
> +	struct gendisk *disk;
> +	int node = dev_to_node(ctrl->dev);
> +
> +	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
> +	if (!ns)
> +		return;
> +
> +	ns->queue = blk_mq_init_queue(ctrl->tagset);
> +	if (IS_ERR(ns->queue))
> +		goto out_free_ns;
> +	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
> +	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
> +	ns->queue->queuedata = ns;
> +	ns->ctrl = ctrl;
> +
> +	disk = alloc_disk_node(0, node);
> +	if (!disk)
> +		goto out_free_queue;
> +
> +	kref_init(&ns->kref);
> +	ns->ns_id = nsid;
> +	ns->disk = disk;
> +	ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
> +
> +	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
> +	if (ctrl->max_hw_sectors) {
> +		blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
> +		blk_queue_max_segments(ns->queue,
> +			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
> +	}
> +	if (ctrl->stripe_size)
> +		blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
> +	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
> +		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
> +	blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
> +
> +	disk->major = nvme_major;
> +	disk->first_minor = 0;
> +	disk->fops = &nvme_fops;
> +	disk->private_data = ns;
> +	disk->queue = ns->queue;
> +	disk->driverfs_dev = ctrl->device;
> +	disk->flags = GENHD_FL_EXT_DEVT;
> +	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
> +
> +	if (nvme_revalidate_disk(ns->disk))
> +		goto out_free_disk;
> +
> +	list_add_tail(&ns->list, &ctrl->namespaces);
> +	kref_get(&ctrl->kref);
> +	if (ns->type != NVME_NS_LIGHTNVM)
> +		add_disk(ns->disk);
> +
> +	return;
> + out_free_disk:
> +	kfree(disk);
> + out_free_queue:
> +	blk_cleanup_queue(ns->queue);
> + out_free_ns:
> +	kfree(ns);
> +}
> +
> +static void nvme_ns_remove(struct nvme_ns *ns)
> +{
> +	bool kill = nvme_io_incapable(ns->ctrl) &&
> +			!blk_queue_dying(ns->queue);
> +
> +	if (kill)
> +		blk_set_queue_dying(ns->queue);
> +	if (ns->disk->flags & GENHD_FL_UP) {
> +		if (blk_get_integrity(ns->disk))
> +			blk_integrity_unregister(ns->disk);
> +		del_gendisk(ns->disk);
> +	}
> +	if (kill || !blk_queue_dying(ns->queue)) {
> +		blk_mq_abort_requeue_list(ns->queue);
> +		blk_cleanup_queue(ns->queue);
> +	}
> +	list_del_init(&ns->list);
> +	nvme_put_ns(ns);
> +}
> +
> +static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> +{
> +	struct nvme_ns *ns;
> +
> +	ns = nvme_find_ns(ctrl, nsid);
> +	if (ns) {
> +		if (revalidate_disk(ns->disk))
> +			nvme_ns_remove(ns);
> +	} else
> +		nvme_alloc_ns(ctrl, nsid);
> +}
> +
> +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
> +{
> +	struct nvme_ns *ns;
> +	__le32 *ns_list;
> +	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
> +	int ret = 0;
> +
> +	ns_list = kzalloc(0x1000, GFP_KERNEL);
> +	if (!ns_list)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < num_lists; i++) {
> +		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
> +		if (ret)
> +			goto out;
> +
> +		for (j = 0; j < min(nn, 1024U); j++) {
> +			nsid = le32_to_cpu(ns_list[j]);
> +			if (!nsid)
> +				goto out;
> +
> +			nvme_validate_ns(ctrl, nsid);
> +
> +			while (++prev < nsid) {
> +				ns = nvme_find_ns(ctrl, prev);
> +				if (ns)
> +					nvme_ns_remove(ns);
> +			}
> +		}
> +		nn -= j;
> +	}
> + out:
> +	kfree(ns_list);
> +	return ret;
> +}
> +
> +static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
> +{
> +	struct nvme_ns *ns, *next;
> +	unsigned i;
> +
> +	for (i = 1; i <= nn; i++)
> +		nvme_validate_ns(ctrl, i);
> +
> +	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
> +		if (ns->ns_id > nn)
> +			nvme_ns_remove(ns);
> +	}
> +}
> +
> +void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_id_ctrl *id;
> +	unsigned nn;
> +
> +	if (nvme_identify_ctrl(ctrl, &id))
> +		return;
> +
> +	nn = le32_to_cpu(id->nn);
> +	if (ctrl->vs >= NVME_VS(1, 1)) {
> +		if (!nvme_scan_ns_list(ctrl, nn))
> +			goto done;
> +	}
> +	__nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
> + done:
> +	list_sort(NULL, &ctrl->namespaces, ns_cmp);
> +	kfree(id);
> +}
> +
> +void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
> +{
> +	struct nvme_ns *ns, *next;
> +
> +	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
> +		nvme_ns_remove(ns);
> +}
> +
> +static DEFINE_IDA(nvme_instance_ida);
> +
> +static int nvme_set_instance(struct nvme_ctrl *ctrl)
> +{
> +	int instance, error;
> +
> +	do {
> +		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
> +			return -ENODEV;
> +
> +		spin_lock(&dev_list_lock);
> +		error = ida_get_new(&nvme_instance_ida, &instance);
> +		spin_unlock(&dev_list_lock);
> +	} while (error == -EAGAIN);
> +
> +	if (error)
> +		return -ENODEV;
> +
> +	ctrl->instance = instance;
> +	return 0;
> +}
> +
> +static void nvme_release_instance(struct nvme_ctrl *ctrl)
> +{
> +	spin_lock(&dev_list_lock);
> +	ida_remove(&nvme_instance_ida, ctrl->instance);
> +	spin_unlock(&dev_list_lock);
> +}
> +
> +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
> + {
> +	device_remove_file(ctrl->device, &dev_attr_reset_controller);
> +	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
> +
> +	spin_lock(&dev_list_lock);
> +	list_del(&ctrl->node);
> +	spin_unlock(&dev_list_lock);
> +}
> +
> +static void nvme_free_ctrl(struct kref *kref)
> +{
> +	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
> +
> +	put_device(ctrl->device);
> +	nvme_release_instance(ctrl);
> +
> +	ctrl->ops->free_ctrl(ctrl);
> +}
> +
> +void nvme_put_ctrl(struct nvme_ctrl *ctrl)
> +{
> +	kref_put(&ctrl->kref, nvme_free_ctrl);
> +}
> +
> +/*
> + * Initialize a NVMe controller structures.  This needs to be called during
> + * earliest initialization so that we have the initialized structured around
> + * during probing.
> + */
> +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
> +		const struct nvme_ctrl_ops *ops, u16 vendor,
> +		unsigned long quirks)
> +{
> +	int ret;
> +
> +	INIT_LIST_HEAD(&ctrl->namespaces);
> +	kref_init(&ctrl->kref);
> +	ctrl->dev = dev;
> +	ctrl->ops = ops;
> +	ctrl->vendor = vendor;
> +	ctrl->quirks = quirks;
> +
> +	ret = nvme_set_instance(ctrl);
> +	if (ret)
> +		goto out;
> +
> +	ctrl->device = device_create(nvme_class, ctrl->dev,
> +				MKDEV(nvme_char_major, ctrl->instance),
> +				dev, "nvme%d", ctrl->instance);
> +	if (IS_ERR(ctrl->device)) {
> +		ret = PTR_ERR(ctrl->device);
> +		goto out_release_instance;
> +	}
> +	get_device(ctrl->device);
> +	dev_set_drvdata(ctrl->device, ctrl);
> +
> +	ret = device_create_file(ctrl->device, &dev_attr_reset_controller);
> +	if (ret)
> +		goto out_put_device;
> +
> +	spin_lock(&dev_list_lock);
> +	list_add_tail(&ctrl->node, &nvme_ctrl_list);
> +	spin_unlock(&dev_list_lock);
> +
> +	return 0;
> +
> +out_put_device:
> +	put_device(ctrl->device);
> +	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
> +out_release_instance:
> +	nvme_release_instance(ctrl);
> +out:
> +	return ret;
> +}
> +
> +int __init nvme_core_init(void)
> +{
> +	int result;
> +
> +	result = register_blkdev(nvme_major, "nvme");
> +	if (result < 0)
> +		return result;
> +	else if (result > 0)
> +		nvme_major = result;
> +
> +	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
> +							&nvme_dev_fops);
> +	if (result < 0)
> +		goto unregister_blkdev;
> +	else if (result > 0)
> +		nvme_char_major = result;
> +
> +	nvme_class = class_create(THIS_MODULE, "nvme");
> +	if (IS_ERR(nvme_class)) {
> +		result = PTR_ERR(nvme_class);
> +		goto unregister_chrdev;
> +	}
> +
> +	return 0;
> +
> + unregister_chrdev:
> +	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
> + unregister_blkdev:
> +	unregister_blkdev(nvme_major, "nvme");
> +	return result;
> +}
> +
> +void nvme_core_exit(void)
> +{
> +	unregister_blkdev(nvme_major, "nvme");
> +	class_destroy(nvme_class);
> +	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
> +}
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 9444884..5c5f455 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1040,7 +1040,7 @@ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
>  	struct request *req;
>  	int ret;
>  
> -	req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
> +	req = blk_mq_alloc_request(q, write, 0);
>  	if (IS_ERR(req))
>  		return PTR_ERR(req);
>  
> @@ -1093,7 +1093,8 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev)
>  	struct nvme_cmd_info *cmd_info;
>  	struct request *req;
>  
> -	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
> +	req = blk_mq_alloc_request(dev->admin_q, WRITE,
> +			BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
>  	if (IS_ERR(req))
>  		return PTR_ERR(req);
>  
> @@ -1118,7 +1119,7 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
>  	struct request *req;
>  	struct nvme_cmd_info *cmd_rq;
>  
> -	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
> +	req = blk_mq_alloc_request(dev->admin_q, WRITE, 0);
>  	if (IS_ERR(req))
>  		return PTR_ERR(req);
>  
> @@ -1319,8 +1320,8 @@ static void nvme_abort_req(struct request *req)
>  	if (!dev->abort_limit)
>  		return;
>  
> -	abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
> -									false);
> +	abort_req = blk_mq_alloc_request(dev->admin_q, WRITE,
> +			BLK_MQ_REQ_NOWAIT);
>  	if (IS_ERR(abort_req))
>  		return;
>  
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index daf17d7..7fc9296 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -188,8 +188,14 @@ void blk_mq_insert_request(struct request *, bool, bool, bool);
>  void blk_mq_free_request(struct request *rq);
>  void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
>  bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
> +
> +enum {
> +	BLK_MQ_REQ_NOWAIT	= (1 << 0), /* return when out of requests */
> +	BLK_MQ_REQ_RESERVED	= (1 << 1), /* allocate from reserved pool */
> +};
> +
>  struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
> -		gfp_t gfp, bool reserved);
> +		unsigned int flags);
>  struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
>  struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);
Christoph Hellwig Nov. 24, 2015, 3:32 p.m. UTC | #2
On Tue, Nov 24, 2015 at 10:19:54AM -0500, Jeff Moyer wrote:
> >  drivers/block/mtip32xx/mtip32xx.c |    2 +-
> >  drivers/nvme/host/core.c          | 1172 +++++++++++++++++++++++++++++++++++++
> 
> Christoph, I think you included a bit too much in this patch!  ;-)

Oops, looks like one of these weird git-rebase failure cases that
are totally non-obvious.  I'll push out another rebase with these
issues fixed after a few more review comments come in.
Jens Axboe Nov. 24, 2015, 9:21 p.m. UTC | #3
On 11/20/2015 09:35 AM, Christoph Hellwig wrote:
> We already have the reserved flag, and a nowait flag awkwardly encoded as
> a gfp_t.  Add a real flags argument to make the scheme more extensible and
> allow for a nicer calling convention.

I've been reviewing these in the nvme-req.9 branch, I'll pick them out 
when for-4.5/xx is forked off. That should happen when Linus pulls the 
recent for-linus pull request.

This one looks fine, but:

>   create mode 100644 drivers/nvme/host/core.c

that should obviously not be there.
Christoph Hellwig Nov. 24, 2015, 10:22 p.m. UTC | #4
On Tue, Nov 24, 2015 at 02:21:10PM -0700, Jens Axboe wrote:
> On 11/20/2015 09:35 AM, Christoph Hellwig wrote:
>> We already have the reserved flag, and a nowait flag awkwardly encoded as
>> a gfp_t.  Add a real flags argument to make the scheme more extensible and
>> allow for a nicer calling convention.
>
> I've been reviewing these in the nvme-req.9 branch, I'll pick them out when 
> for-4.5/xx is forked off. That should happen when Linus pulls the recent 
> for-linus pull request.

Ok.  It would be good to only fork off after the NVMe page size fix,
as that is bound to create merge conflicts otherwise.

While you're at it can you send "nvme: add missing unmaps in nvme_queue_rq"
and "block: fix blk_abort_request for blk-mq drivers" from this series to
Linus for 4.4-rc as well?

>>   create mode 100644 drivers/nvme/host/core.c
>
> that should obviously not be there.

Yeah, Jeff already pointed that out.  I'll fix it for the next resend.
Jens Axboe Nov. 24, 2015, 10:25 p.m. UTC | #5
On 11/24/2015 03:22 PM, Christoph Hellwig wrote:
> On Tue, Nov 24, 2015 at 02:21:10PM -0700, Jens Axboe wrote:
>> On 11/20/2015 09:35 AM, Christoph Hellwig wrote:
>>> We already have the reserved flag, and a nowait flag awkwardly encoded as
>>> a gfp_t.  Add a real flags argument to make the scheme more extensible and
>>> allow for a nicer calling convention.
>>
>> I've been reviewing these in the nvme-req.9 branch, I'll pick them out when
>> for-4.5/xx is forked off. That should happen when Linus pulls the recent
>> for-linus pull request.
>
> Ok.  It would be good to only fork off after the NVMe page size fix,
> as that is bound to create merge conflicts otherwise.
>
> While you're at it can you send "nvme: add missing unmaps in nvme_queue_rq"
> and "block: fix blk_abort_request for blk-mq drivers" from this series to
> Linus for 4.4-rc as well?

Look fine for this series. Added and for-4.5/core kicked off.
diff mbox

Patch

diff --git a/block/blk-core.c b/block/blk-core.c
index af9c315..d2100aa 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -630,7 +630,7 @@  struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_alloc_queue);
 
-int blk_queue_enter(struct request_queue *q, gfp_t gfp)
+int blk_queue_enter(struct request_queue *q, bool nowait)
 {
 	while (true) {
 		int ret;
@@ -638,7 +638,7 @@  int blk_queue_enter(struct request_queue *q, gfp_t gfp)
 		if (percpu_ref_tryget_live(&q->q_usage_counter))
 			return 0;
 
-		if (!gfpflags_allow_blocking(gfp))
+		if (nowait)
 			return -EBUSY;
 
 		ret = wait_event_interruptible(q->mq_freeze_wq,
@@ -1284,7 +1284,9 @@  static struct request *blk_old_get_request(struct request_queue *q, int rw,
 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 {
 	if (q->mq_ops)
-		return blk_mq_alloc_request(q, rw, gfp_mask, false);
+		return blk_mq_alloc_request(q, rw,
+			(gfp_mask & __GFP_DIRECT_RECLAIM) ?
+				0 : BLK_MQ_REQ_NOWAIT);
 	else
 		return blk_old_get_request(q, rw, gfp_mask);
 }
@@ -2052,8 +2054,7 @@  blk_qc_t generic_make_request(struct bio *bio)
 	do {
 		struct request_queue *q = bdev_get_queue(bio->bi_bdev);
 
-		if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {
-
+		if (likely(blk_queue_enter(q, false) == 0)) {
 			ret = q->make_request_fn(q, bio);
 
 			blk_queue_exit(q);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index a07ca34..abdbb47 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -268,7 +268,7 @@  static int bt_get(struct blk_mq_alloc_data *data,
 	if (tag != -1)
 		return tag;
 
-	if (!gfpflags_allow_blocking(data->gfp))
+	if (data->flags & BLK_MQ_REQ_NOWAIT)
 		return -1;
 
 	bs = bt_wait_ptr(bt, hctx);
@@ -303,7 +303,7 @@  static int bt_get(struct blk_mq_alloc_data *data,
 		data->ctx = blk_mq_get_ctx(data->q);
 		data->hctx = data->q->mq_ops->map_queue(data->q,
 				data->ctx->cpu);
-		if (data->reserved) {
+		if (data->flags & BLK_MQ_REQ_RESERVED) {
 			bt = &data->hctx->tags->breserved_tags;
 		} else {
 			last_tag = &data->ctx->last_tag;
@@ -349,10 +349,9 @@  static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
 
 unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 {
-	if (!data->reserved)
-		return __blk_mq_get_tag(data);
-
-	return __blk_mq_get_reserved_tag(data);
+	if (data->flags & BLK_MQ_REQ_RESERVED)
+		return __blk_mq_get_reserved_tag(data);
+	return __blk_mq_get_tag(data);
 }
 
 static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c932605..6da03f1 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -230,8 +230,8 @@  __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
 	return NULL;
 }
 
-struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
-		bool reserved)
+struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
+		unsigned int flags)
 {
 	struct blk_mq_ctx *ctx;
 	struct blk_mq_hw_ctx *hctx;
@@ -239,24 +239,22 @@  struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
 	struct blk_mq_alloc_data alloc_data;
 	int ret;
 
-	ret = blk_queue_enter(q, gfp);
+	ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT);
 	if (ret)
 		return ERR_PTR(ret);
 
 	ctx = blk_mq_get_ctx(q);
 	hctx = q->mq_ops->map_queue(q, ctx->cpu);
-	blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_DIRECT_RECLAIM,
-			reserved, ctx, hctx);
+	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
 
 	rq = __blk_mq_alloc_request(&alloc_data, rw);
-	if (!rq && (gfp & __GFP_DIRECT_RECLAIM)) {
+	if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) {
 		__blk_mq_run_hw_queue(hctx);
 		blk_mq_put_ctx(ctx);
 
 		ctx = blk_mq_get_ctx(q);
 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
-		blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
-				hctx);
+		blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
 		rq =  __blk_mq_alloc_request(&alloc_data, rw);
 		ctx = alloc_data.ctx;
 	}
@@ -1181,8 +1179,7 @@  static struct request *blk_mq_map_request(struct request_queue *q,
 		rw |= REQ_SYNC;
 
 	trace_block_getrq(q, bio, rw);
-	blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
-			hctx);
+	blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx);
 	rq = __blk_mq_alloc_request(&alloc_data, rw);
 	if (unlikely(!rq)) {
 		__blk_mq_run_hw_queue(hctx);
@@ -1191,8 +1188,7 @@  static struct request *blk_mq_map_request(struct request_queue *q,
 
 		ctx = blk_mq_get_ctx(q);
 		hctx = q->mq_ops->map_queue(q, ctx->cpu);
-		blk_mq_set_alloc_data(&alloc_data, q,
-				__GFP_RECLAIM|__GFP_HIGH, false, ctx, hctx);
+		blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx);
 		rq = __blk_mq_alloc_request(&alloc_data, rw);
 		ctx = alloc_data.ctx;
 		hctx = alloc_data.hctx;
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 713820b..eaede8e 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -96,8 +96,7 @@  static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
 struct blk_mq_alloc_data {
 	/* input parameter */
 	struct request_queue *q;
-	gfp_t gfp;
-	bool reserved;
+	unsigned int flags;
 
 	/* input & output parameter */
 	struct blk_mq_ctx *ctx;
@@ -105,13 +104,11 @@  struct blk_mq_alloc_data {
 };
 
 static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
-		struct request_queue *q, gfp_t gfp, bool reserved,
-		struct blk_mq_ctx *ctx,
-		struct blk_mq_hw_ctx *hctx)
+		struct request_queue *q, unsigned int flags,
+		struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
 {
 	data->q = q;
-	data->gfp = gfp;
-	data->reserved = reserved;
+	data->flags = flags;
 	data->ctx = ctx;
 	data->hctx = hctx;
 }
diff --git a/block/blk.h b/block/blk.h
index 1d95107..38bf997 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -72,7 +72,7 @@  void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 bool __blk_end_bidi_request(struct request *rq, int error,
 			    unsigned int nr_bytes, unsigned int bidi_bytes);
-int blk_queue_enter(struct request_queue *q, gfp_t gfp);
+int blk_queue_enter(struct request_queue *q, bool nowait);
 void blk_queue_exit(struct request_queue *q);
 void blk_freeze_queue(struct request_queue *q);
 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index a28a562..cf3b51a 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -173,7 +173,7 @@  static struct mtip_cmd *mtip_get_int_command(struct driver_data *dd)
 {
 	struct request *rq;
 
-	rq = blk_mq_alloc_request(dd->queue, 0, __GFP_RECLAIM, true);
+	rq = blk_mq_alloc_request(dd->queue, 0, BLK_MQ_REQ_RESERVED);
 	return blk_mq_rq_to_pdu(rq);
 }
 
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
new file mode 100644
index 0000000..53cf507
--- /dev/null
+++ b/drivers/nvme/host/core.c
@@ -0,0 +1,1172 @@ 
+/*
+ * NVM Express device driver
+ * Copyright (c) 2011-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/errno.h>
+#include <linux/hdreg.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/list_sort.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/pr.h>
+#include <linux/ptrace.h>
+#include <linux/nvme_ioctl.h>
+#include <linux/t10-pi.h>
+#include <scsi/sg.h>
+#include <asm/unaligned.h>
+
+#include "nvme.h"
+
+#define NVME_MINORS		(1U << MINORBITS)
+
+static int nvme_major;
+module_param(nvme_major, int, 0);
+
+static int nvme_char_major;
+module_param(nvme_char_major, int, 0);
+
+static LIST_HEAD(nvme_ctrl_list);
+DEFINE_SPINLOCK(dev_list_lock);
+
+static struct class *nvme_class;
+
+static void nvme_free_ns(struct kref *kref)
+{
+	struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref);
+
+	if (ns->type == NVME_NS_LIGHTNVM)
+		nvme_nvm_unregister(ns->queue, ns->disk->disk_name);
+
+	spin_lock(&dev_list_lock);
+	ns->disk->private_data = NULL;
+	spin_unlock(&dev_list_lock);
+
+	nvme_put_ctrl(ns->ctrl);
+	put_disk(ns->disk);
+	kfree(ns);
+}
+
+static void nvme_put_ns(struct nvme_ns *ns)
+{
+	kref_put(&ns->kref, nvme_free_ns);
+}
+
+static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk)
+{
+	struct nvme_ns *ns;
+
+	spin_lock(&dev_list_lock);
+	ns = disk->private_data;
+	if (ns && !kref_get_unless_zero(&ns->kref))
+		ns = NULL;
+	spin_unlock(&dev_list_lock);
+
+	return ns;
+}
+
+static struct request *nvme_alloc_request(struct request_queue *q,
+		struct nvme_command *cmd)
+{
+	bool write = cmd->common.opcode & 1;
+	struct request *req;
+
+	req = blk_mq_alloc_request(q, write, 0);
+	if (IS_ERR(req))
+		return req;
+
+	req->cmd_type = REQ_TYPE_DRV_PRIV;
+	req->cmd_flags |= REQ_FAILFAST_DRIVER;
+	req->__data_len = 0;
+	req->__sector = (sector_t) -1;
+	req->bio = req->biotail = NULL;
+
+	req->cmd = (unsigned char *)cmd;
+	req->cmd_len = sizeof(struct nvme_command);
+	req->special = (void *)0;
+
+	return req;
+}
+
+/*
+ * Returns 0 on success.  If the result is negative, it's a Linux error code;
+ * if the result is positive, it's an NVM Express status code
+ */
+int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, unsigned bufflen, u32 *result, unsigned timeout)
+{
+	struct request *req;
+	int ret;
+
+	req = nvme_alloc_request(q, cmd);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+
+	if (buffer && bufflen) {
+		ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL);
+		if (ret)
+			goto out;
+	}
+
+	blk_execute_rq(req->q, NULL, req, 0);
+	if (result)
+		*result = (u32)(uintptr_t)req->special;
+	ret = req->errors;
+ out:
+	blk_mq_free_request(req);
+	return ret;
+}
+
+int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void *buffer, unsigned bufflen)
+{
+	return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0);
+}
+
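+/*
+ * Submit a command with a data buffer and optional metadata buffer supplied
+ * from user space.  The data buffer is mapped onto the request, metadata is
+ * attached as a bio integrity payload, and for successful reads the metadata
+ * is copied back to user space.
+ */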
+int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void __user *ubuffer, unsigned bufflen,
+		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
+		u32 *result, unsigned timeout)
+{
+	bool write = cmd->common.opcode & 1;
+	struct nvme_ns *ns = q->queuedata;
+	struct gendisk *disk = ns ? ns->disk : NULL;
+	struct request *req;
+	struct bio *bio = NULL;
+	void *meta = NULL;
+	int ret;
+
+	req = nvme_alloc_request(q, cmd);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+
+	if (ubuffer && bufflen) {
+		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
+				GFP_KERNEL);
+		if (ret)
+			goto out;
+		bio = req->bio;
+
+		if (!disk)
+			goto submit;
+		bio->bi_bdev = bdget_disk(disk, 0);
+		if (!bio->bi_bdev) {
+			ret = -ENODEV;
+			goto out_unmap;
+		}
+
+		if (meta_buffer) {
+			struct bio_integrity_payload *bip;
+
+			meta = kmalloc(meta_len, GFP_KERNEL);
+			if (!meta) {
+				ret = -ENOMEM;
+				goto out_unmap;
+			}
+
+			if (write) {
+				if (copy_from_user(meta, meta_buffer,
+						meta_len)) {
+					ret = -EFAULT;
+					goto out_free_meta;
+				}
+			}
+
+			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
+			if (!bip) {
+				ret = -ENOMEM;
+				goto out_free_meta;
+			}
+
+			bip->bip_iter.bi_size = meta_len;
+			bip->bip_iter.bi_sector = meta_seed;
+
+			ret = bio_integrity_add_page(bio, virt_to_page(meta),
+					meta_len, offset_in_page(meta));
+			if (ret != meta_len) {
+				ret = -ENOMEM;
+				goto out_free_meta;
+			}
+		}
+	}
+ submit:
+	blk_execute_rq(req->q, disk, req, 0);
+	ret = req->errors;
+	if (result)
+		*result = (u32)(uintptr_t)req->special;
+	if (meta && !ret && !write) {
+		if (copy_to_user(meta_buffer, meta, meta_len))
+			ret = -EFAULT;
+	}
+ out_free_meta:
+	kfree(meta);
+ out_unmap:
+	if (bio) {
+		if (disk && bio->bi_bdev)
+			bdput(bio->bi_bdev);
+		blk_rq_unmap_user(bio);
+	}
+ out:
+	blk_mq_free_request(req);
+	return ret;
+}
+
+int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
+		void __user *ubuffer, unsigned bufflen, u32 *result,
+		unsigned timeout)
+{
+	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
+			result, timeout);
+}
+
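+/*
+ * Issue an Identify Controller command (CNS 1).  On success the caller owns
+ * the returned buffer and is responsible for freeing it.
+ */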
+int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
+{
+	struct nvme_command c = { };
+	int error;
+
+	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.cns = cpu_to_le32(1);
+
+	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ctrl));
+	if (error)
+		kfree(*id);
+	return error;
+}
+
+static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list)
+{
+	struct nvme_command c = { };
+
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.cns = cpu_to_le32(2);
+	c.identify.nsid = cpu_to_le32(nsid);
+	return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000);
+}
+
+int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid,
+		struct nvme_id_ns **id)
+{
+	struct nvme_command c = { };
+	int error;
+
+	/* gcc-4.4.4 (at least) has issues with initializers and anon unions */
+	c.identify.opcode = nvme_admin_identify;
+	c.identify.nsid = cpu_to_le32(nsid);
+
+	*id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL);
+	if (!*id)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
+			sizeof(struct nvme_id_ns));
+	if (error)
+		kfree(*id);
+	return error;
+}
+
+int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid,
+					dma_addr_t dma_addr, u32 *result)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.features.opcode = nvme_admin_get_features;
+	c.features.nsid = cpu_to_le32(nsid);
+	c.features.prp1 = cpu_to_le64(dma_addr);
+	c.features.fid = cpu_to_le32(fid);
+
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
+}
+
+int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
+					dma_addr_t dma_addr, u32 *result)
+{
+	struct nvme_command c;
+
+	memset(&c, 0, sizeof(c));
+	c.features.opcode = nvme_admin_set_features;
+	c.features.prp1 = cpu_to_le64(dma_addr);
+	c.features.fid = cpu_to_le32(fid);
+	c.features.dword11 = cpu_to_le32(dword11);
+
+	return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0);
+}
+
+int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log)
+{
+	struct nvme_command c = { };
+	int error;
+
+	c.common.opcode = nvme_admin_get_log_page;
+	c.common.nsid = cpu_to_le32(0xFFFFFFFF);
+	c.common.cdw10[0] = cpu_to_le32(
+			(((sizeof(struct nvme_smart_log) / 4) - 1) << 16) |
+			 NVME_LOG_SMART);
+
+	*log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL);
+	if (!*log)
+		return -ENOMEM;
+
+	error = nvme_submit_sync_cmd(dev->admin_q, &c, *log,
+			sizeof(struct nvme_smart_log));
+	if (error)
+		kfree(*log);
+	return error;
+}
+
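+/*
+ * Handle the NVME_IOCTL_SUBMIT_IO ioctl: validate the opcode, translate the
+ * user supplied nvme_user_io into an nvme_command and submit it through the
+ * user command path, including separate metadata for non-extended LBA
+ * formats.
+ */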
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+{
+	struct nvme_user_io io;
+	struct nvme_command c;
+	unsigned length, meta_len;
+	void __user *metadata;
+
+	if (copy_from_user(&io, uio, sizeof(io)))
+		return -EFAULT;
+
+	switch (io.opcode) {
+	case nvme_cmd_write:
+	case nvme_cmd_read:
+	case nvme_cmd_compare:
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	length = (io.nblocks + 1) << ns->lba_shift;
+	meta_len = (io.nblocks + 1) * ns->ms;
+	metadata = (void __user *)(uintptr_t)io.metadata;
+
+	if (ns->ext) {
+		length += meta_len;
+		meta_len = 0;
+	} else if (meta_len) {
+		if ((io.metadata & 3) || !io.metadata)
+			return -EINVAL;
+	}
+
+	memset(&c, 0, sizeof(c));
+	c.rw.opcode = io.opcode;
+	c.rw.flags = io.flags;
+	c.rw.nsid = cpu_to_le32(ns->ns_id);
+	c.rw.slba = cpu_to_le64(io.slba);
+	c.rw.length = cpu_to_le16(io.nblocks);
+	c.rw.control = cpu_to_le16(io.control);
+	c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
+	c.rw.reftag = cpu_to_le32(io.reftag);
+	c.rw.apptag = cpu_to_le16(io.apptag);
+	c.rw.appmask = cpu_to_le16(io.appmask);
+
+	return __nvme_submit_user_cmd(ns->queue, &c,
+			(void __user *)(uintptr_t)io.addr, length,
+			metadata, meta_len, io.slba, NULL, 0);
+}
+
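+/*
+ * Generic passthrough for the NVME_IOCTL_ADMIN_CMD and NVME_IOCTL_IO_CMD
+ * ioctls.  Requires CAP_SYS_ADMIN as arbitrary commands can be sent to the
+ * controller.
+ */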
+static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
+			struct nvme_passthru_cmd __user *ucmd)
+{
+	struct nvme_passthru_cmd cmd;
+	struct nvme_command c;
+	unsigned timeout = 0;
+	int status;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
+		return -EFAULT;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = cmd.opcode;
+	c.common.flags = cmd.flags;
+	c.common.nsid = cpu_to_le32(cmd.nsid);
+	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
+	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
+	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
+	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
+	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
+	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);
+
+	if (cmd.timeout_ms)
+		timeout = msecs_to_jiffies(cmd.timeout_ms);
+
+	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
+			&cmd.result, timeout);
+	if (status >= 0) {
+		if (put_user(cmd.result, &ucmd->result))
+			return -EFAULT;
+	}
+
+	return status;
+}
+
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+		unsigned int cmd, unsigned long arg)
+{
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+
+	switch (cmd) {
+	case NVME_IOCTL_ID:
+		force_successful_syscall_return();
+		return ns->ns_id;
+	case NVME_IOCTL_ADMIN_CMD:
+		return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg);
+	case NVME_IOCTL_IO_CMD:
+		return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg);
+	case NVME_IOCTL_SUBMIT_IO:
+		return nvme_submit_io(ns, (void __user *)arg);
+	case SG_GET_VERSION_NUM:
+		return nvme_sg_get_version_num((void __user *)arg);
+	case SG_IO:
+		return nvme_sg_io(ns, (void __user *)arg);
+	default:
+		return -ENOTTY;
+	}
+}
+
+#ifdef CONFIG_COMPAT
+static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
+			unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+	case SG_IO:
+		return -ENOIOCTLCMD;
+	}
+	return nvme_ioctl(bdev, mode, cmd, arg);
+}
+#else
+#define nvme_compat_ioctl	NULL
+#endif
+
+static int nvme_open(struct block_device *bdev, fmode_t mode)
+{
+	return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO;
+}
+
+static void nvme_release(struct gendisk *disk, fmode_t mode)
+{
+	nvme_put_ns(disk->private_data);
+}
+
+static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	/* some standard values */
+	geo->heads = 1 << 6;
+	geo->sectors = 1 << 5;
+	geo->cylinders = get_capacity(bdev->bd_disk) >> 11;
+	return 0;
+}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+	struct blk_integrity integrity;
+
+	switch (ns->pi_type) {
+	case NVME_NS_DPS_PI_TYPE3:
+		integrity.profile = &t10_pi_type3_crc;
+		break;
+	case NVME_NS_DPS_PI_TYPE1:
+	case NVME_NS_DPS_PI_TYPE2:
+		integrity.profile = &t10_pi_type1_crc;
+		break;
+	default:
+		integrity.profile = NULL;
+		break;
+	}
+	integrity.tuple_size = ns->ms;
+	blk_integrity_register(ns->disk, &integrity);
+	blk_queue_max_integrity_segments(ns->queue, 1);
+}
+#else
+static void nvme_init_integrity(struct nvme_ns *ns)
+{
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+
+static void nvme_config_discard(struct nvme_ns *ns)
+{
+	u32 logical_block_size = queue_logical_block_size(ns->queue);
+
+	ns->queue->limits.discard_zeroes_data = 0;
+	ns->queue->limits.discard_alignment = logical_block_size;
+	ns->queue->limits.discard_granularity = logical_block_size;
+	blk_queue_max_discard_sectors(ns->queue, 0xffffffff);
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue);
+}
+
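+/*
+ * Re-read the Identify Namespace data and update the block size, metadata
+ * and protection information settings and the capacity.  The queue is frozen
+ * while the limits are updated.
+ */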
+static int nvme_revalidate_disk(struct gendisk *disk)
+{
+	struct nvme_ns *ns = disk->private_data;
+	struct nvme_id_ns *id;
+	u8 lbaf, pi_type;
+	u16 old_ms;
+	unsigned short bs;
+
+	if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
+		dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
+				__func__, ns->ctrl->instance, ns->ns_id);
+		return -ENODEV;
+	}
+	if (id->ncap == 0) {
+		kfree(id);
+		return -ENODEV;
+	}
+
+	if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) {
+		if (nvme_nvm_register(ns->queue, disk->disk_name)) {
+			dev_warn(ns->ctrl->dev,
+				"%s: LightNVM init failure\n", __func__);
+			kfree(id);
+			return -ENODEV;
+		}
+		ns->type = NVME_NS_LIGHTNVM;
+	}
+
+	old_ms = ns->ms;
+	lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK;
+	ns->lba_shift = id->lbaf[lbaf].ds;
+	ns->ms = le16_to_cpu(id->lbaf[lbaf].ms);
+	ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT);
+
+	/*
+	 * If the LBA format reports a zero data size, fall back to the
+	 * default 512 byte block size so the block layer can still use the
+	 * disk before failing reads/writes for 0 capacity.
+	 */
+	if (ns->lba_shift == 0)
+		ns->lba_shift = 9;
+	bs = 1 << ns->lba_shift;
+	/* XXX: PI implementation requires metadata equal to the T10 PI tuple size */
+	pi_type = ns->ms == sizeof(struct t10_pi_tuple) ?
+					id->dps & NVME_NS_DPS_PI_MASK : 0;
+
+	blk_mq_freeze_queue(disk->queue);
+	if (blk_get_integrity(disk) && (ns->pi_type != pi_type ||
+				ns->ms != old_ms ||
+				bs != queue_logical_block_size(disk->queue) ||
+				(ns->ms && ns->ext)))
+		blk_integrity_unregister(disk);
+
+	ns->pi_type = pi_type;
+	blk_queue_logical_block_size(ns->queue, bs);
+
+	if (ns->ms && !blk_get_integrity(disk) && !ns->ext)
+		nvme_init_integrity(ns);
+	if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk))
+		set_capacity(disk, 0);
+	else
+		set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9));
+
+	if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM)
+		nvme_config_discard(ns);
+	blk_mq_unfreeze_queue(disk->queue);
+
+	kfree(id);
+	return 0;
+}
+
+static char nvme_pr_type(enum pr_type type)
+{
+	switch (type) {
+	case PR_WRITE_EXCLUSIVE:
+		return 1;
+	case PR_EXCLUSIVE_ACCESS:
+		return 2;
+	case PR_WRITE_EXCLUSIVE_REG_ONLY:
+		return 3;
+	case PR_EXCLUSIVE_ACCESS_REG_ONLY:
+		return 4;
+	case PR_WRITE_EXCLUSIVE_ALL_REGS:
+		return 5;
+	case PR_EXCLUSIVE_ACCESS_ALL_REGS:
+		return 6;
+	default:
+		return 0;
+	}
+}
+
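+/*
+ * Send a reservation command.  The current and new/service action keys are
+ * passed in a 16 byte data buffer, with the first key at offset 0 and the
+ * second at offset 8.
+ */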
+static int nvme_pr_command(struct block_device *bdev, u32 cdw10,
+				u64 key, u64 sa_key, u8 op)
+{
+	struct nvme_ns *ns = bdev->bd_disk->private_data;
+	struct nvme_command c;
+	u8 data[16] = { 0, };
+
+	put_unaligned_le64(key, &data[0]);
+	put_unaligned_le64(sa_key, &data[8]);
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = op;
+	c.common.nsid = cpu_to_le32(ns->ns_id);
+	c.common.cdw10[0] = cpu_to_le32(cdw10);
+
+	return nvme_submit_sync_cmd(ns->queue, &c, data, 16);
+}
+
+static int nvme_pr_register(struct block_device *bdev, u64 old,
+		u64 new, unsigned flags)
+{
+	u32 cdw10;
+
+	if (flags & ~PR_FL_IGNORE_KEY)
+		return -EOPNOTSUPP;
+
+	cdw10 = old ? 2 : 0;
+	cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0;
+	cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */
+	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register);
+}
+
+static int nvme_pr_reserve(struct block_device *bdev, u64 key,
+		enum pr_type type, unsigned flags)
+{
+	u32 cdw10;
+
+	if (flags & ~PR_FL_IGNORE_KEY)
+		return -EOPNOTSUPP;
+
+	cdw10 = nvme_pr_type(type) << 8;
+	cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire);
+}
+
+static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new,
+		enum pr_type type, bool abort)
+{
+	u32 cdw10 = nvme_pr_type(type) << 8 | (abort ? 2 : 1);
+	return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire);
+}
+
+static int nvme_pr_clear(struct block_device *bdev, u64 key)
+{
+	u32 cdw10 = 1 | (key ? 1 << 3 : 0);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register);
+}
+
+static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
+{
+	u32 cdw10 = nvme_pr_type(type) << 8 | (key ? 1 << 3 : 0);
+	return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release);
+}
+
+static const struct pr_ops nvme_pr_ops = {
+	.pr_register	= nvme_pr_register,
+	.pr_reserve	= nvme_pr_reserve,
+	.pr_release	= nvme_pr_release,
+	.pr_preempt	= nvme_pr_preempt,
+	.pr_clear	= nvme_pr_clear,
+};
+
+static const struct block_device_operations nvme_fops = {
+	.owner		= THIS_MODULE,
+	.ioctl		= nvme_ioctl,
+	.compat_ioctl	= nvme_compat_ioctl,
+	.open		= nvme_open,
+	.release	= nvme_release,
+	.getgeo		= nvme_getgeo,
+	.revalidate_disk= nvme_revalidate_disk,
+	.pr_ops		= &nvme_pr_ops,
+};
+
+/*
+ * Initialize the cached copies of the Identify data and various controller
+ * registers in our nvme_ctrl structure.  This should be called as soon as
+ * the admin queue is fully up and running.
+ */
+int nvme_init_identify(struct nvme_ctrl *ctrl)
+{
+	struct nvme_id_ctrl *id;
+	u64 cap;
+	int ret, page_shift;
+
+	ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
+	if (ret) {
+		dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret);
+		return ret;
+	}
+
+	ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
+	if (ret) {
+		dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret);
+		return ret;
+	}
+	page_shift = NVME_CAP_MPSMIN(cap) + 12;
+	ctrl->page_size = 1 << page_shift;
+
+	if (ctrl->vs >= NVME_VS(1, 1))
+		ctrl->subsystem = NVME_CAP_NSSRC(cap);
+
+	ret = nvme_identify_ctrl(ctrl, &id);
+	if (ret) {
+		dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret);
+		return -EIO;
+	}
+
+	ctrl->oncs = le16_to_cpup(&id->oncs);
+	atomic_set(&ctrl->abort_limit, id->acl + 1);
+	ctrl->vwc = id->vwc;
+	memcpy(ctrl->serial, id->sn, sizeof(id->sn));
+	memcpy(ctrl->model, id->mn, sizeof(id->mn));
+	memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr));
+	if (id->mdts)
+		ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9);
+	else
+		ctrl->max_hw_sectors = UINT_MAX;
+
+	if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) {
+		unsigned int max_hw_sectors;
+
+		ctrl->stripe_size = 1 << (id->vs[3] + page_shift);
+		max_hw_sectors = ctrl->stripe_size >> (page_shift - 9);
+		if (ctrl->max_hw_sectors) {
+			ctrl->max_hw_sectors = min(max_hw_sectors,
+							ctrl->max_hw_sectors);
+		} else {
+			ctrl->max_hw_sectors = max_hw_sectors;
+		}
+	}
+
+	kfree(id);
+	return 0;
+}
+
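+/*
+ * Open of the per-controller character device: look the controller up by
+ * minor number and take a reference on it for the lifetime of the file.
+ */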
+static int nvme_dev_open(struct inode *inode, struct file *file)
+{
+	struct nvme_ctrl *ctrl;
+	int instance = iminor(inode);
+	int ret = -ENODEV;
+
+	spin_lock(&dev_list_lock);
+	list_for_each_entry(ctrl, &nvme_ctrl_list, node) {
+		if (ctrl->instance != instance)
+			continue;
+
+		if (!ctrl->admin_q) {
+			ret = -EWOULDBLOCK;
+			break;
+		}
+		if (!kref_get_unless_zero(&ctrl->kref))
+			break;
+		file->private_data = ctrl;
+		ret = 0;
+		break;
+	}
+	spin_unlock(&dev_list_lock);
+
+	return ret;
+}
+
+static int nvme_dev_release(struct inode *inode, struct file *file)
+{
+	nvme_put_ctrl(file->private_data);
+	return 0;
+}
+
+static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	struct nvme_ctrl *ctrl = file->private_data;
+	void __user *argp = (void __user *)arg;
+	struct nvme_ns *ns;
+
+	switch (cmd) {
+	case NVME_IOCTL_ADMIN_CMD:
+		return nvme_user_cmd(ctrl, NULL, argp);
+	case NVME_IOCTL_IO_CMD:
+		if (list_empty(&ctrl->namespaces))
+			return -ENOTTY;
+		ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
+		return nvme_user_cmd(ctrl, ns, argp);
+	case NVME_IOCTL_RESET:
+		dev_warn(ctrl->dev, "resetting controller\n");
+		return ctrl->ops->reset_ctrl(ctrl);
+	case NVME_IOCTL_SUBSYS_RESET:
+		return nvme_reset_subsystem(ctrl);
+	default:
+		return -ENOTTY;
+	}
+}
+
+static const struct file_operations nvme_dev_fops = {
+	.owner		= THIS_MODULE,
+	.open		= nvme_dev_open,
+	.release	= nvme_dev_release,
+	.unlocked_ioctl	= nvme_dev_ioctl,
+	.compat_ioctl	= nvme_dev_ioctl,
+};
+
+static ssize_t nvme_sysfs_reset(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	int ret;
+
+	ret = ctrl->ops->reset_ctrl(ctrl);
+	if (ret < 0)
+		return ret;
+	return count;
+}
+static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset);
+
+static int ns_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct nvme_ns *nsa = container_of(a, struct nvme_ns, list);
+	struct nvme_ns *nsb = container_of(b, struct nvme_ns, list);
+
+	return nsa->ns_id - nsb->ns_id;
+}
+
+static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+{
+	struct nvme_ns *ns;
+
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		if (ns->ns_id == nsid)
+			return ns;
+		if (ns->ns_id > nsid)
+			break;
+	}
+	return NULL;
+}
+
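+/*
+ * Allocate and register a new namespace: set up the request queue and queue
+ * limits from the controller, allocate the gendisk named nvme%dn%d and add
+ * it unless this is a LightNVM namespace.
+ */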
+static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+{
+	struct nvme_ns *ns;
+	struct gendisk *disk;
+	int node = dev_to_node(ctrl->dev);
+
+	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
+	if (!ns)
+		return;
+
+	ns->queue = blk_mq_init_queue(ctrl->tagset);
+	if (IS_ERR(ns->queue))
+		goto out_free_ns;
+	queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue);
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
+	ns->queue->queuedata = ns;
+	ns->ctrl = ctrl;
+
+	disk = alloc_disk_node(0, node);
+	if (!disk)
+		goto out_free_queue;
+
+	kref_init(&ns->kref);
+	ns->ns_id = nsid;
+	ns->disk = disk;
+	ns->lba_shift = 9; /* default to 512 byte blocks until the disk is validated */
+
+	blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
+	if (ctrl->max_hw_sectors) {
+		blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
+		blk_queue_max_segments(ns->queue,
+			(ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
+	}
+	if (ctrl->stripe_size)
+		blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
+	if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+		blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
+	blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
+
+	disk->major = nvme_major;
+	disk->first_minor = 0;
+	disk->fops = &nvme_fops;
+	disk->private_data = ns;
+	disk->queue = ns->queue;
+	disk->driverfs_dev = ctrl->device;
+	disk->flags = GENHD_FL_EXT_DEVT;
+	sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
+
+	if (nvme_revalidate_disk(ns->disk))
+		goto out_free_disk;
+
+	list_add_tail(&ns->list, &ctrl->namespaces);
+	kref_get(&ctrl->kref);
+	if (ns->type != NVME_NS_LIGHTNVM)
+		add_disk(ns->disk);
+
+	return;
+ out_free_disk:
+	kfree(disk);
+ out_free_queue:
+	blk_cleanup_queue(ns->queue);
+ out_free_ns:
+	kfree(ns);
+}
+
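+/*
+ * Tear down a namespace.  If the controller can no longer do I/O the queue
+ * is marked dying first so that outstanding and requeued requests are
+ * failed instead of waiting forever.
+ */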
+static void nvme_ns_remove(struct nvme_ns *ns)
+{
+	bool kill = nvme_io_incapable(ns->ctrl) &&
+			!blk_queue_dying(ns->queue);
+
+	if (kill)
+		blk_set_queue_dying(ns->queue);
+	if (ns->disk->flags & GENHD_FL_UP) {
+		if (blk_get_integrity(ns->disk))
+			blk_integrity_unregister(ns->disk);
+		del_gendisk(ns->disk);
+	}
+	if (kill || !blk_queue_dying(ns->queue)) {
+		blk_mq_abort_requeue_list(ns->queue);
+		blk_cleanup_queue(ns->queue);
+	}
+	list_del_init(&ns->list);
+	nvme_put_ns(ns);
+}
+
+static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid)
+{
+	struct nvme_ns *ns;
+
+	ns = nvme_find_ns(ctrl, nsid);
+	if (ns) {
+		if (revalidate_disk(ns->disk))
+			nvme_ns_remove(ns);
+	} else
+		nvme_alloc_ns(ctrl, nsid);
+}
+
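+/*
+ * Scan namespaces using the Identify active namespace ID list (CNS 2,
+ * available on NVMe 1.1+ controllers).  Each list holds up to 1024 entries;
+ * namespaces missing from the list are removed.
+ */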
+static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn)
+{
+	struct nvme_ns *ns;
+	__le32 *ns_list;
+	unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024);
+	int ret = 0;
+
+	ns_list = kzalloc(0x1000, GFP_KERNEL);
+	if (!ns_list)
+		return -ENOMEM;
+
+	for (i = 0; i < num_lists; i++) {
+		ret = nvme_identify_ns_list(ctrl, prev, ns_list);
+		if (ret)
+			goto out;
+
+		for (j = 0; j < min(nn, 1024U); j++) {
+			nsid = le32_to_cpu(ns_list[j]);
+			if (!nsid)
+				goto out;
+
+			nvme_validate_ns(ctrl, nsid);
+
+			while (++prev < nsid) {
+				ns = nvme_find_ns(ctrl, prev);
+				if (ns)
+					nvme_ns_remove(ns);
+			}
+		}
+		nn -= j;
+	}
+ out:
+	kfree(ns_list);
+	return ret;
+}
+
+static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn)
+{
+	struct nvme_ns *ns, *next;
+	unsigned i;
+
+	for (i = 1; i <= nn; i++)
+		nvme_validate_ns(ctrl, i);
+
+	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
+		if (ns->ns_id > nn)
+			nvme_ns_remove(ns);
+	}
+}
+
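+/*
+ * Rebuild the namespace list: prefer the Identify namespace list on NVMe
+ * 1.1+ controllers and fall back to probing namespace IDs 1..nn one by one.
+ */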
+void nvme_scan_namespaces(struct nvme_ctrl *ctrl)
+{
+	struct nvme_id_ctrl *id;
+	unsigned nn;
+
+	if (nvme_identify_ctrl(ctrl, &id))
+		return;
+
+	nn = le32_to_cpu(id->nn);
+	if (ctrl->vs >= NVME_VS(1, 1)) {
+		if (!nvme_scan_ns_list(ctrl, nn))
+			goto done;
+	}
+	__nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn));
+ done:
+	list_sort(NULL, &ctrl->namespaces, ns_cmp);
+	kfree(id);
+}
+
+void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns, *next;
+
+	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
+		nvme_ns_remove(ns);
+}
+
+static DEFINE_IDA(nvme_instance_ida);
+
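+/*
+ * Allocate a unique controller instance number from an IDA, protected by
+ * dev_list_lock.
+ */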
+static int nvme_set_instance(struct nvme_ctrl *ctrl)
+{
+	int instance, error;
+
+	do {
+		if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL))
+			return -ENODEV;
+
+		spin_lock(&dev_list_lock);
+		error = ida_get_new(&nvme_instance_ida, &instance);
+		spin_unlock(&dev_list_lock);
+	} while (error == -EAGAIN);
+
+	if (error)
+		return -ENODEV;
+
+	ctrl->instance = instance;
+	return 0;
+}
+
+static void nvme_release_instance(struct nvme_ctrl *ctrl)
+{
+	spin_lock(&dev_list_lock);
+	ida_remove(&nvme_instance_ida, ctrl->instance);
+	spin_unlock(&dev_list_lock);
+}
+
+void nvme_uninit_ctrl(struct nvme_ctrl *ctrl)
+{
+	device_remove_file(ctrl->device, &dev_attr_reset_controller);
+	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
+
+	spin_lock(&dev_list_lock);
+	list_del(&ctrl->node);
+	spin_unlock(&dev_list_lock);
+}
+
+static void nvme_free_ctrl(struct kref *kref)
+{
+	struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref);
+
+	put_device(ctrl->device);
+	nvme_release_instance(ctrl);
+
+	ctrl->ops->free_ctrl(ctrl);
+}
+
+void nvme_put_ctrl(struct nvme_ctrl *ctrl)
+{
+	kref_put(&ctrl->kref, nvme_free_ctrl);
+}
+
+/*
+ * Initialize an NVMe controller structure.  This needs to be called during
+ * the earliest initialization so that we have the initialized structure
+ * around during probing.
+ */
+int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
+		const struct nvme_ctrl_ops *ops, u16 vendor,
+		unsigned long quirks)
+{
+	int ret;
+
+	INIT_LIST_HEAD(&ctrl->namespaces);
+	kref_init(&ctrl->kref);
+	ctrl->dev = dev;
+	ctrl->ops = ops;
+	ctrl->vendor = vendor;
+	ctrl->quirks = quirks;
+
+	ret = nvme_set_instance(ctrl);
+	if (ret)
+		goto out;
+
+	ctrl->device = device_create(nvme_class, ctrl->dev,
+				MKDEV(nvme_char_major, ctrl->instance),
+				dev, "nvme%d", ctrl->instance);
+	if (IS_ERR(ctrl->device)) {
+		ret = PTR_ERR(ctrl->device);
+		goto out_release_instance;
+	}
+	get_device(ctrl->device);
+	dev_set_drvdata(ctrl->device, ctrl);
+
+	ret = device_create_file(ctrl->device, &dev_attr_reset_controller);
+	if (ret)
+		goto out_put_device;
+
+	spin_lock(&dev_list_lock);
+	list_add_tail(&ctrl->node, &nvme_ctrl_list);
+	spin_unlock(&dev_list_lock);
+
+	return 0;
+
+out_put_device:
+	put_device(ctrl->device);
+	device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance));
+out_release_instance:
+	nvme_release_instance(ctrl);
+out:
+	return ret;
+}
+
+int __init nvme_core_init(void)
+{
+	int result;
+
+	result = register_blkdev(nvme_major, "nvme");
+	if (result < 0)
+		return result;
+	else if (result > 0)
+		nvme_major = result;
+
+	result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme",
+							&nvme_dev_fops);
+	if (result < 0)
+		goto unregister_blkdev;
+	else if (result > 0)
+		nvme_char_major = result;
+
+	nvme_class = class_create(THIS_MODULE, "nvme");
+	if (IS_ERR(nvme_class)) {
+		result = PTR_ERR(nvme_class);
+		goto unregister_chrdev;
+	}
+
+	return 0;
+
+ unregister_chrdev:
+	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+ unregister_blkdev:
+	unregister_blkdev(nvme_major, "nvme");
+	return result;
+}
+
+void nvme_core_exit(void)
+{
+	unregister_blkdev(nvme_major, "nvme");
+	class_destroy(nvme_class);
+	__unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme");
+}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 9444884..5c5f455 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1040,7 +1040,7 @@  int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd,
 	struct request *req;
 	int ret;
 
-	req = blk_mq_alloc_request(q, write, GFP_KERNEL, false);
+	req = blk_mq_alloc_request(q, write, 0);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1093,7 +1093,8 @@  static int nvme_submit_async_admin_req(struct nvme_dev *dev)
 	struct nvme_cmd_info *cmd_info;
 	struct request *req;
 
-	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, true);
+	req = blk_mq_alloc_request(dev->admin_q, WRITE,
+			BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1118,7 +1119,7 @@  static int nvme_submit_admin_async_cmd(struct nvme_dev *dev,
 	struct request *req;
 	struct nvme_cmd_info *cmd_rq;
 
-	req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false);
+	req = blk_mq_alloc_request(dev->admin_q, WRITE, 0);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1319,8 +1320,8 @@  static void nvme_abort_req(struct request *req)
 	if (!dev->abort_limit)
 		return;
 
-	abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC,
-									false);
+	abort_req = blk_mq_alloc_request(dev->admin_q, WRITE,
+			BLK_MQ_REQ_NOWAIT);
 	if (IS_ERR(abort_req))
 		return;
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index daf17d7..7fc9296 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -188,8 +188,14 @@  void blk_mq_insert_request(struct request *, bool, bool, bool);
 void blk_mq_free_request(struct request *rq);
 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
 bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
+
+enum {
+	BLK_MQ_REQ_NOWAIT	= (1 << 0), /* return when out of requests */
+	BLK_MQ_REQ_RESERVED	= (1 << 1), /* allocate from reserved pool */
+};
+
 struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
-		gfp_t gfp, bool reserved);
+		unsigned int flags);
 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag);
 struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags);