diff mbox

[RFC,1/6] bsg: fix kernel panic resulting from missing allocation of a reply-buffer

Message ID 9e67ce3fc2f3cd42e9e05b2753b00d6676f46ee1.1502120928.git.bblock@linux.vnet.ibm.com (mailing list archive)
State Changes Requested, archived
Headers show

Commit Message

Benjamin Block Aug. 9, 2017, 2:11 p.m. UTC
In contrast to the normal SCSI-lib, the BSG block-queue doesn't make use of
any extra init_rq_fn() to make additional allocations during
request-creation, and the request sense-pointer is not used to transport
SCSI sense data, but is used as backing for the bsg_job->reply pointer;
that in turn is used in the LLDs to store protocol IUs or similar stuff.
This 're-purposing' of the sense-pointer is done in the BSG blk-lib
(bsg_create_job()), during the queue-processing.

Failing to allocate/assign it results in illegal dereferences because LLDs
use this pointer unquestioned, as can be seen in the various
BSG-implementations:

drivers/scsi/libfc/fc_lport.c:  fc_lport_bsg_request()
drivers/scsi/qla2xxx/qla_bsg.c: qla24xx_bsg_request()
drivers/scsi/qla4xxx/ql4_bsg.c: qla4xxx_process_vendor_specific()
drivers/s390/scsi/zfcp_fc.c:    zfcp_fc_ct_els_job_handler()
...

An example panic on s390x, using the zFCP driver, looks like this (I had
debugging on, otherwise NULL-pointer dereferences wouldn't even panic on
s390x):

Unable to handle kernel pointer dereference in virtual kernel address space
Failing address: 6b6b6b6b6b6b6000 TEID: 6b6b6b6b6b6b6403
Fault in home space mode while using kernel ASCE.
AS:0000000001590007 R3:0000000000000024
Oops: 0038 ilc:2 [#1] PREEMPT SMP DEBUG_PAGEALLOC
Modules linked in: <Long List>
CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.12.0-bsg-regression+ #3
Hardware name: IBM 2964 N96 702 (z/VM 6.4.0)
task: 0000000065cb0100 task.stack: 0000000065cb4000
Krnl PSW : 0704e00180000000 000003ff801e4156 (zfcp_fc_ct_els_job_handler+0x16/0x58 [zfcp])
           R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:2 PM:0 RI:0 EA:3
Krnl GPRS: 0000000000000001 000000005fa9d0d0 000000005fa9d078 0000000000e16866
           000003ff00000290 6b6b6b6b6b6b6b6b 0000000059f78f00 000000000000000f
           00000000593a0958 00000000593a0958 0000000060d88800 000000005ddd4c38
           0000000058b50100 07000000659cba08 000003ff801e8556 00000000659cb9a8
Krnl Code: 000003ff801e4146: e31020500004        lg      %r1,80(%r2)
           000003ff801e414c: 58402040           l       %r4,64(%r2)
          #000003ff801e4150: e35020200004       lg      %r5,32(%r2)
          >000003ff801e4156: 50405004           st      %r4,4(%r5)
           000003ff801e415a: e54c50080000       mvhi    8(%r5),0
           000003ff801e4160: e33010280012       lt      %r3,40(%r1)
           000003ff801e4166: a718fffb           lhi     %r1,-5
           000003ff801e416a: 1803               lr      %r0,%r3
Call Trace:
([<000003ff801e8556>] zfcp_fsf_req_complete+0x726/0x768 [zfcp])
 [<000003ff801ea82a>] zfcp_fsf_reqid_check+0x102/0x180 [zfcp]
 [<000003ff801eb980>] zfcp_qdio_int_resp+0x230/0x278 [zfcp]
 [<00000000009b91b6>] qdio_kick_handler+0x2ae/0x2c8
 [<00000000009b9e3e>] __tiqdio_inbound_processing+0x406/0xc10
 [<00000000001684c2>] tasklet_action+0x15a/0x1d8
 [<0000000000bd28ec>] __do_softirq+0x3ec/0x848
 [<00000000001675a4>] irq_exit+0x74/0xf8
 [<000000000010dd6a>] do_IRQ+0xba/0xf0
 [<0000000000bd19e8>] io_int_handler+0x104/0x2d4
 [<00000000001033b6>] enabled_wait+0xb6/0x188
([<000000000010339e>] enabled_wait+0x9e/0x188)
 [<000000000010396a>] arch_cpu_idle+0x32/0x50
 [<0000000000bd0112>] default_idle_call+0x52/0x68
 [<00000000001cd0fa>] do_idle+0x102/0x188
 [<00000000001cd41e>] cpu_startup_entry+0x3e/0x48
 [<0000000000118c64>] smp_start_secondary+0x11c/0x130
 [<0000000000bd2016>] restart_int_handler+0x62/0x78
 [<0000000000000000>]           (null)
INFO: lockdep is turned off.
Last Breaking-Event-Address:
 [<000003ff801e41d6>] zfcp_fc_ct_job_handler+0x3e/0x48 [zfcp]

Kernel panic - not syncing: Fatal exception in interrupt

To prevent this, allocate a buffer when the BSG blk-request is setup, and
before it is queued for LLD processing.

Reported-by: Steffen Maier <maier@linux.vnet.ibm.com>
Signed-off-by: Benjamin Block <bblock@linux.vnet.ibm.com>
Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request")
Cc: <stable@vger.kernel.org> #4.11+
---
 block/bsg.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

Comments

Christoph Hellwig Aug. 10, 2017, 9:32 a.m. UTC | #1
We can't use an on-stack buffer for the sense data, as drivers will
dma to it.  So we should reuse the SCSI init_rq_fn() for the BSG
queues and/or implement the same scheme.
Benjamin Block Aug. 10, 2017, 10:10 p.m. UTC | #2
On Thu, Aug 10, 2017 at 11:32:17AM +0200, Christoph Hellwig wrote:
> We can't use an on-stack buffer for the sense data, as drivers will
> dma to it.  So we should reuse the SCSI init_rq_fn() for the BSG
> queues and/or implement the same scheme.
> 

BSG is odd in this regard. Here is the data model as far as I understood
it from reading the source.

The user of the interface provides his input via a sg_io_v4 object.

 struct sg_io_v4
 +--------------+
 |              |  
 | request-------->+----------------------------+
 |   + _len     |  |                            |
 |   (A)        |  | BSG Request                |
 |              |  | e.g. struct fc_bsg_request | Depends on BSG implementation
 |              |  |                            | FC vs. iSCSI vs. ...
 |              |  +----------------------------+
 |              |
 | response------->+----------------------------+ Used as _Output_
 |   + max_len  |  |                            | User doesn't initialize
 |   (B)        |  | BSG Reply                  | User provides (optional)
 |              |  | e.g. struct fc_bsg_reply   |   memory; May be NULL.
 |              |  |                            |
 |              |  +----------------------------+
 |              |
 | dout_xferp----->+-----------------------+ Stuff send on the wire by
 |   + _len     |  |                       |   the LLD
 |   (C)        |  | Transport Protocol IU | Aligned on PAGE_SIZE
 |              |  | e.g. FC-GS-7 CT_IU    |
 |              |  |                       |
 |              |  +-----------------------+
 |              |
 | din_xferp------>+-----------------------+ Buffer for response data by
 |   + _len     |  |                       |   the LLD
 |   (D)        |  | Transport Protocol IU | Aligned on PAGE_SIZE
 |              |  | e.g. FC-GS-7 CT_IU    |
 |              |  |                       |
 |              |  +-----------------------+
 +--------------+

This is just a part of it, but the most important for this issue. The
BSG driver then encapsulates this into two linked block-requests he
passes down to the LLDs. The first block-request (E) is for the Request
Data, and the second request (F) for the Response Data. (F) is optional,
depending on the whether the user passes both dout_xferp and din_xferp.

 struct request (E)
 +--------------+
 |              |  struct scsi_request
 | scsi_request--->+-----------------+
 |              |  |                 |
 |              |  | cmd---------------------> Copy of (A)
 |              |  |  + _len         |         Space in struct or kzalloc
 |              |  |  (G)            |
 |              |  |                 |
 |              |  | sense-------------------> Space for BSG Reply
 |              |  |  + _len         |         Same Data-Structure as (B)
 |              |  |  (H)            |         NOT actually pointer (B)
 |              |  |                 |         'reply_buffer' in my patch 
 |              |  +-----------------+
 |              |
 | bio------------> Mapped via blk_rq_map_user() to (C) dout_xferp
 |              |
 | next_rq---------+
 |              |  |
 +--------------+  |
                   |
 struct request (F)|(if used)
 +--------------+<-+
 |              |
 | scsi_request---> Unused here
 |              |
 | bio------------> Mapped via blk_rq_map_user() to (D) din_xferp
 |              |
 +--------------+

This is enqueued in the (legacy) blk-queue. In bsg-lib.c this is
processed and encapsulated into an other data-structure struct bsg_job

 struct bsg_job
 +-----------------+
 |                 |
 | request-----------> (G) scsi_request->cmd -> Copy of (A)
 |   + _len        |       e.g. struct fc_bsg_request
 |                 |
 | reply-------------> (H) scsi_request->sense -> 'reply_buffer'
 |   + _len        |       e.g. struct fc_bsg_reply
 |                 |
 | request_payload---> struct scatterlist ... map (E)->bio
 |   + _len        |
 |   (I)           |
 |                 |
 | reply_payload-----> struct scatterlist ... map (F)->bio
 |   + _len        |
 |   (J)           |
 |                 |
 +-----------------+

This then is finally what the LLD gets to work with, with the callback
from the request-queue. Depending on contents of (G) the LLD gets to
decide whatever the user-space wants him to do. This depends highly on
the transport.

In case of zFCP we map (I) and (J) directly into the ring that passes
the data to our hardware and the one that the hardware uses to pass back
the responses.

(This is actually pretty cool if you think about it. Apart from the copy
we make of (A) into (G), this whole send was completely 'zero-copy'.
Depending on the hardware it can directly DMA onto the wire... I guess
most modern cards can do such a thing.)

When the answer is coming back, the payload data is expected to be put
into (J). If your HW can DMA into this, no more effort.

Again, depending on (H), the LLD fills in some information for
accounting and such. In case of FC, there is also some
protocol-information, but this is by no means a standard IU.

This is then passed up the stack pretty quick and if the user passed
something with (B), data from (H) is copied into (B). But this is
optional. The main payload is transferred to the user via (J) which is a
remap of (D).

So this is my understanding here. (I don't wanna say that this is
necessarily completely correct ;-), pleas correct me, if I am wrong. The
lack of proper documentation is also not helping at all.)

This worked till it broke. Right now every driver that tries to access
(H) will panic the system, or cause very undefined behavior. I suspect
no driver right now tries to do any DMA into (H); before the regression,
this has been also an on-stack variable (I suspect since BSG was
introduced, haven't checked though).

The asymmetries between the first struct request (E) and the following
(F) also makes it hard to use the same scheme as in other drivers, where
init_rq_fn() gets to initialize each request in the same'ish way. Or?
Just looking at it right now, this would require some bigger rework that
is not appropriate for a stable bug-fix.

I think it would be best if we fix the possible panics every user of
this is gonna experience and after that we can still think about
improving it further beyond what the rest of the patch set already does,
if that is necessary.


                                                    Beste Grüße / Best regards,
                                                      - Benjamin Block
Benjamin Block Aug. 10, 2017, 10:45 p.m. UTC | #3
On Fri, Aug 11, 2017 at 12:10:38AM +0200, Benjamin Block wrote:
> On Thu, Aug 10, 2017 at 11:32:17AM +0200, Christoph Hellwig wrote:
> > We can't use an on-stack buffer for the sense data, as drivers will
> > dma to it.  So we should reuse the SCSI init_rq_fn() for the BSG
> > queues and/or implement the same scheme.
> > 
> 
...
> 
>  struct sg_io_v4
>  +--------------+
>  |              |  
>  | request-------->+----------------------------+
>  |   + _len     |  |                            |
>  |   (A)        |  | BSG Request                |
>  |              |  | e.g. struct fc_bsg_request | Depends on BSG implementation
>  |              |  |                            | FC vs. iSCSI vs. ...
>  |              |  +----------------------------+
>  |              |
>  | response------->+----------------------------+ Used as _Output_
>  |   + max_len  |  |                            | User doesn't initialize
>  |   (B)        |  | BSG Reply                  | User provides (optional)
>  |              |  | e.g. struct fc_bsg_reply   |   memory; May be NULL.
>  |              |  |                            |
>  |              |  +----------------------------+
>  |              |
>  | dout_xferp----->+-----------------------+ Stuff send on the wire by
>  |   + _len     |  |                       |   the LLD
>  |   (C)        |  | Transport Protocol IU | Aligned on PAGE_SIZE
>  |              |  | e.g. FC-GS-7 CT_IU    |
>  |              |  |                       |
>  |              |  +-----------------------+
>  |              |
>  | din_xferp------>+-----------------------+ Buffer for response data by
>  |   + _len     |  |                       |   the LLD
>  |   (D)        |  | Transport Protocol IU | Aligned on PAGE_SIZE
>  |              |  | e.g. FC-GS-7 CT_IU    |
>  |              |  |                       |
>  |              |  +-----------------------+
>  +--------------+
> 
...
> 
>  struct request (E)
>  +--------------+
>  |              |  struct scsi_request
>  | scsi_request--->+-----------------+
>  |              |  |                 |
>  |              |  | cmd---------------------> Copy of (A)
>  |              |  |  + _len         |         Space in struct or kzalloc
>  |              |  |  (G)            |
>  |              |  |                 |
>  |              |  | sense-------------------> Space for BSG Reply
>  |              |  |  + _len         |         Same Data-Structure as (B)
>  |              |  |  (H)            |         NOT actually pointer (B)
>  |              |  |                 |         'reply_buffer' in my patch 
>  |              |  +-----------------+
>  |              |
>  | bio------------> Mapped via blk_rq_map_user() to (C) dout_xferp
>  |              |
>  | next_rq---------+
>  |              |  |
>  +--------------+  |
>                    |
>  struct request (F)|(if used)
>  +--------------+<-+
>  |              |
>  | scsi_request---> Unused here
>  |              |
>  | bio------------> Mapped via blk_rq_map_user() to (D) din_xferp
>  |              |
>  +--------------+
> 
...
> 
>  struct bsg_job
>  +-----------------+
>  |                 |
>  | request-----------> (G) scsi_request->cmd -> Copy of (A)
>  |   + _len        |       e.g. struct fc_bsg_request
>  |                 |
>  | reply-------------> (H) scsi_request->sense -> 'reply_buffer'
>  |   + _len        |       e.g. struct fc_bsg_reply
>  |                 |
>  | request_payload---> struct scatterlist ... map (E)->bio
>  |   + _len        |
>  |   (I)           |
>  |                 |
>  | reply_payload-----> struct scatterlist ... map (F)->bio
>  |   + _len        |
>  |   (J)           |
>  |                 |
>  +-----------------+
> 
....
> 
> This worked till it broke. Right now every driver that tries to access
> (H) will panic the system, or cause very undefined behavior. I suspect
> no driver right now tries to do any DMA into (H); before the regression,
> this has been also an on-stack variable (I suspect since BSG was
> introduced, haven't checked though).
> 
> The asymmetries between the first struct request (E) and the following
> (F) also makes it hard to use the same scheme as in other drivers, where
> init_rq_fn() gets to initialize each request in the same'ish way. Or?
> Just looking at it right now, this would require some bigger rework that
> is not appropriate for a stable bug-fix.
> 

Just some more brain-dump here.

One more problem for direct DMA into (H) in the current BSG setup is
probably, that the transport classes have each their own private format
for the BSG reply (struct fc_bsg_reply and struct iscsi_bsg_reply right
now I think). The current stack doesn't take any precaution to properly
align this in accords to what the LLDs specifies for the blk-layer... so
lets assume struct fc_bsg_reply. This has fields for actual protocol IUs
(in contrast to iSCSI, where it only has some vendor-reply buffer [an
array with 0 length...]), but they start after some BSG meta-data that
are non-standard. We would have to rewrite that to allow mapping the
start of the protocol IUs in accords to the expectations the LLDs have..
page-aligned and such things. Otherwise we would break whatever handles
the meta-data the LLD pass up the stack - added that this is actually
user visible data, passed back via struct sg_io_v4.

This could be something of a new feature I guess, it would be an
improvement in terms that it could reduce copies even more, but w/o
further research I guess it is a bit more work.



                                                    Beste Grüße / Best regards,
                                                      - Benjamin Block
Christoph Hellwig Aug. 11, 2017, 8:38 a.m. UTC | #4
My point was that we now gurantee that that the sense data is not
a stack pointer an a driver can DMA to it.  Now for BSG the sense
data is "just" abused as reply, but the point still stands - we
don't want to pass a possible stack pointer to drivers in a data
buffer because we want to allow DMA to it.

That being said with your patch 4 that becomes a moot point as we'll
now always dynamically allocate it.  So maybe just reorder that to go
first and we should be fine.
diff mbox

Patch

diff --git a/block/bsg.c b/block/bsg.c
index 37663b664666..285b1b8126c3 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -74,6 +74,8 @@  static int bsg_major;
 
 static struct kmem_cache *bsg_cmd_cachep;
 
+#define BSG_COMMAND_REPLY_BUFFERSIZE	SCSI_SENSE_BUFFERSIZE
+
 /*
  * our internal command type
  */
@@ -85,6 +87,7 @@  struct bsg_command {
 	struct bio *bidi_bio;
 	int err;
 	struct sg_io_v4 hdr;
+	u8 reply_buffer[BSG_COMMAND_REPLY_BUFFERSIZE];
 };
 
 static void bsg_free_command(struct bsg_command *bc)
@@ -137,7 +140,7 @@  static inline struct hlist_head *bsg_dev_idx_hash(int index)
 
 static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 				struct sg_io_v4 *hdr, struct bsg_device *bd,
-				fmode_t has_write_perm)
+				u8 *reply_buffer, fmode_t has_write_perm)
 {
 	struct scsi_request *req = scsi_req(rq);
 
@@ -162,6 +165,10 @@  static int blk_fill_sgv4_hdr_rq(struct request_queue *q, struct request *rq,
 	 */
 	req->cmd_len = hdr->request_len;
 
+	/* this is later asigned to bsg_job as reply */
+	req->sense = reply_buffer;
+	req->sense_len = BSG_COMMAND_REPLY_BUFFERSIZE;
+
 	rq->timeout = msecs_to_jiffies(hdr->timeout);
 	if (!rq->timeout)
 		rq->timeout = q->sg_timeout;
@@ -206,7 +213,8 @@  bsg_validate_sgv4_hdr(struct sg_io_v4 *hdr, int *op)
  * map sg_io_v4 to a request.
  */
 static struct request *
-bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
+bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm,
+	    u8 *reply_buffer)
 {
 	struct request_queue *q = bd->queue;
 	struct request *rq, *next_rq = NULL;
@@ -237,7 +245,8 @@  bsg_map_hdr(struct bsg_device *bd, struct sg_io_v4 *hdr, fmode_t has_write_perm)
 	if (IS_ERR(rq))
 		return rq;
 
-	ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, has_write_perm);
+	ret = blk_fill_sgv4_hdr_rq(q, rq, hdr, bd, reply_buffer,
+				   has_write_perm);
 	if (ret)
 		goto out;
 
@@ -619,7 +628,8 @@  static int __bsg_write(struct bsg_device *bd, const char __user *buf,
 		/*
 		 * get a request, fill in the blanks, and add to request queue
 		 */
-		rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm);
+		rq = bsg_map_hdr(bd, &bc->hdr, has_write_perm,
+				 bc->reply_buffer);
 		if (IS_ERR(rq)) {
 			ret = PTR_ERR(rq);
 			rq = NULL;
@@ -908,6 +918,7 @@  static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	}
 	case SG_IO: {
 		struct request *rq;
+		u8 reply_buffer[BSG_COMMAND_REPLY_BUFFERSIZE] = { 0, };
 		struct bio *bio, *bidi_bio = NULL;
 		struct sg_io_v4 hdr;
 		int at_head;
@@ -915,7 +926,8 @@  static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (copy_from_user(&hdr, uarg, sizeof(hdr)))
 			return -EFAULT;
 
-		rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE);
+		rq = bsg_map_hdr(bd, &hdr, file->f_mode & FMODE_WRITE,
+				 reply_buffer);
 		if (IS_ERR(rq))
 			return PTR_ERR(rq);