
[v3,11/13] IB/srp: Make HCA completion vector configurable

Message ID 51D41FFC.6070105@acm.org (mailing list archive)
State Rejected

Commit Message

Bart Van Assche July 3, 2013, 12:58 p.m. UTC
Several InfiniBand HCAs allow the completion vector to be configured
per queue pair. This makes it possible to spread the workload created
by IB completion interrupts over multiple MSI-X vectors and hence over
multiple CPU cores. In other words, configuring the completion vector
properly makes it possible not only to reduce latency on an initiator
connected to multiple SRP targets but also to improve throughput.

Signed-off-by: Bart Van Assche <bvanassche@acm.org>
Acked-by: David Dillow <dillowda@ornl.gov>
Cc: Roland Dreier <roland@kernel.org>
Cc: Vu Pham <vu@mellanox.com>
Cc: Sebastian Riemer <sebastian.riemer@profitbricks.com>
---
 Documentation/ABI/stable/sysfs-driver-ib_srp |    7 +++++++
 drivers/infiniband/ulp/srp/ib_srp.c          |   26 ++++++++++++++++++++++++--
 drivers/infiniband/ulp/srp/ib_srp.h          |    1 +
 3 files changed, 32 insertions(+), 2 deletions(-)
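
The new option is consumed through the existing add_target login interface
documented in the patch below. A minimal usage sketch in C, assuming a
hypothetical srp-mlx4_0-1 host port and placeholder target identifiers (the
id_ext, ioc_guid, dgid, pkey and service_id values are made up, not real):

/*
 * Log in to an SRP target and ask ib_srp to use MSI-X completion
 * vector 2.  All identifiers below are placeholders.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *path =
		"/sys/class/infiniband_srp/srp-mlx4_0-1/add_target";
	const char *target =
		"id_ext=200100e08b000000,ioc_guid=0002c90300a00000,"
		"dgid=fe800000000000000002c90300a00001,pkey=ffff,"
		"service_id=0002c90300a00000,comp_vector=2";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return EXIT_FAILURE;
	}
	/* The write is submitted when the stream is flushed on fclose(). */
	if (fputs(target, f) == EOF || fclose(f) != 0) {
		perror("add_target");
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}

The same login string can of course also be written to add_target directly
from a shell.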

Comments

Sagi Grimberg July 14, 2013, 9:43 a.m. UTC | #1
On 7/3/2013 3:58 PM, Bart Van Assche wrote:
> Several InfiniBand HCA's allow to configure the completion vector
> per queue pair. This allows to spread the workload created by IB
> completion interrupts over multiple MSI-X vectors and hence over
> multiple CPU cores. In other words, configuring the completion
> vector properly not only allows to reduce latency on an initiator
> connected to multiple SRP targets but also allows to improve
> throughput.

Hey Bart,
I just wrote a small patch that lets srp_daemon spread connections across
the HCA's completion vectors. But rethinking this, is it really a good idea
to give the user control over completion vectors for CQs he doesn't really
own? This way the user must retrieve the maximum number of completion
vectors from the ib_device, take that into account when adding a connection,
and in addition set proper IRQ affinity.

Perhaps the driver can manage this on its own without involving the user.
Take the mlx4_en driver for example: it spreads its CQs across the HCA's
completion vectors without involving the user. A user that opens a socket
has no influence on the underlying cq<->comp-vector assignment.

The only use case I can think of is one where the user wants to use only a
subset of the completion vectors, e.g. to reserve some completion vectors
for native IB applications, but I don't know how common that is.

Other than that, I think it is always better to spread the CQs across the
HCA completion vectors, so perhaps the driver should just assign connection
CQs across comp-vecs without taking arguments from the user, simply
iterating over comp_vectors.

What do you think?
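
A minimal kernel-side sketch of the round-robin idea floated here, for
illustration only: srp_next_comp_vector() is a hypothetical helper, not part
of the posted patch, and a per-srp_device counter would be more precise than
the module-global one used to keep the sketch short.

/*
 * Sketch: spread new SRP connections over every completion vector the
 * HCA exposes, in the spirit of mlx4_en, instead of taking a value
 * from the user.
 */
static int srp_next_comp_vector(struct srp_device *srp_dev)
{
	static atomic_t next = ATOMIC_INIT(-1);

	/* atomic_inc_return() yields 0, 1, 2, ... on successive calls. */
	return (unsigned int)atomic_inc_return(&next) %
	       srp_dev->dev->num_comp_vectors;
}

srp_create_target_ib() could then pass this value to ib_create_cq() instead
of target->comp_vector.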

> Signed-off-by: Bart Van Assche <bvanassche@acm.org>
> Acked-by: David Dillow <dillowda@ornl.gov>
> Cc: Roland Dreier <roland@kernel.org>
> Cc: Vu Pham <vu@mellanox.com>
> Cc: Sebastian Riemer <sebastian.riemer@profitbricks.com>
> ---
>   Documentation/ABI/stable/sysfs-driver-ib_srp |    7 +++++++
>   drivers/infiniband/ulp/srp/ib_srp.c          |   26 ++++++++++++++++++++++++--
>   drivers/infiniband/ulp/srp/ib_srp.h          |    1 +
>   3 files changed, 32 insertions(+), 2 deletions(-)
>
> diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp
> index 481aae9..5c53d28 100644
> --- a/Documentation/ABI/stable/sysfs-driver-ib_srp
> +++ b/Documentation/ABI/stable/sysfs-driver-ib_srp
> @@ -54,6 +54,13 @@ Description:	Interface for making ib_srp connect to a new target.
>   		  ib_srp. Specifying a value that exceeds cmd_sg_entries is
>   		  only safe with partial memory descriptor list support enabled
>   		  (allow_ext_sg=1).
> +		* comp_vector, a number in the range 0..n-1 specifying the
> +		  MSI-X completion vector. Some HCA's allocate multiple (n)
> +		  MSI-X vectors per HCA port. If the IRQ affinity masks of
> +		  these interrupts have been configured such that each MSI-X
> +		  interrupt is handled by a different CPU then the comp_vector
> +		  parameter can be used to spread the SRP completion workload
> +		  over multiple CPU's.
>   
>   What:		/sys/class/infiniband_srp/srp-<hca>-<port_number>/ibdev
>   Date:		January 2, 2006
> diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
> index 2557b7a..6c164f6 100644
> --- a/drivers/infiniband/ulp/srp/ib_srp.c
> +++ b/drivers/infiniband/ulp/srp/ib_srp.c
> @@ -294,14 +294,16 @@ static int srp_create_target_ib(struct srp_target_port *target)
>   		return -ENOMEM;
>   
>   	recv_cq = ib_create_cq(target->srp_host->srp_dev->dev,
> -			       srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0);
> +			       srp_recv_completion, NULL, target, SRP_RQ_SIZE,
> +			       target->comp_vector);
>   	if (IS_ERR(recv_cq)) {
>   		ret = PTR_ERR(recv_cq);
>   		goto err;
>   	}
>   
>   	send_cq = ib_create_cq(target->srp_host->srp_dev->dev,
> -			       srp_send_completion, NULL, target, SRP_SQ_SIZE, 0);
> +			       srp_send_completion, NULL, target, SRP_SQ_SIZE,
> +			       target->comp_vector);
>   	if (IS_ERR(send_cq)) {
>   		ret = PTR_ERR(send_cq);
>   		goto err_recv_cq;
> @@ -1976,6 +1978,14 @@ static ssize_t show_local_ib_device(struct device *dev,
>   	return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name);
>   }
>   
> +static ssize_t show_comp_vector(struct device *dev,
> +				struct device_attribute *attr, char *buf)
> +{
> +	struct srp_target_port *target = host_to_target(class_to_shost(dev));
> +
> +	return sprintf(buf, "%d\n", target->comp_vector);
> +}
> +
>   static ssize_t show_cmd_sg_entries(struct device *dev,
>   				   struct device_attribute *attr, char *buf)
>   {
> @@ -2002,6 +2012,7 @@ static DEVICE_ATTR(req_lim,         S_IRUGO, show_req_lim,         NULL);
>   static DEVICE_ATTR(zero_req_lim,    S_IRUGO, show_zero_req_lim,	   NULL);
>   static DEVICE_ATTR(local_ib_port,   S_IRUGO, show_local_ib_port,   NULL);
>   static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL);
> +static DEVICE_ATTR(comp_vector,     S_IRUGO, show_comp_vector,     NULL);
>   static DEVICE_ATTR(cmd_sg_entries,  S_IRUGO, show_cmd_sg_entries,  NULL);
>   static DEVICE_ATTR(allow_ext_sg,    S_IRUGO, show_allow_ext_sg,    NULL);
>   
> @@ -2016,6 +2027,7 @@ static struct device_attribute *srp_host_attrs[] = {
>   	&dev_attr_zero_req_lim,
>   	&dev_attr_local_ib_port,
>   	&dev_attr_local_ib_device,
> +	&dev_attr_comp_vector,
>   	&dev_attr_cmd_sg_entries,
>   	&dev_attr_allow_ext_sg,
>   	NULL
> @@ -2140,6 +2152,7 @@ enum {
>   	SRP_OPT_CMD_SG_ENTRIES	= 1 << 9,
>   	SRP_OPT_ALLOW_EXT_SG	= 1 << 10,
>   	SRP_OPT_SG_TABLESIZE	= 1 << 11,
> +	SRP_OPT_COMP_VECTOR	= 1 << 12,
>   	SRP_OPT_ALL		= (SRP_OPT_ID_EXT	|
>   				   SRP_OPT_IOC_GUID	|
>   				   SRP_OPT_DGID		|
> @@ -2160,6 +2173,7 @@ static const match_table_t srp_opt_tokens = {
>   	{ SRP_OPT_CMD_SG_ENTRIES,	"cmd_sg_entries=%u"	},
>   	{ SRP_OPT_ALLOW_EXT_SG,		"allow_ext_sg=%u"	},
>   	{ SRP_OPT_SG_TABLESIZE,		"sg_tablesize=%u"	},
> +	{ SRP_OPT_COMP_VECTOR,		"comp_vector=%u"	},
>   	{ SRP_OPT_ERR,			NULL 			}
>   };
>   
> @@ -2315,6 +2329,14 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target)
>   			target->sg_tablesize = token;
>   			break;
>   
> +		case SRP_OPT_COMP_VECTOR:
> +			if (match_int(args, &token) || token < 0) {
> +				pr_warn("bad comp_vector parameter '%s'\n", p);
> +				goto out;
> +			}
> +			target->comp_vector = token;
> +			break;
> +
>   		default:
>   			pr_warn("unknown parameter or missing value '%s' in target creation request\n",
>   				p);
> diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
> index e45d9d0..cbc0b14 100644
> --- a/drivers/infiniband/ulp/srp/ib_srp.h
> +++ b/drivers/infiniband/ulp/srp/ib_srp.h
> @@ -156,6 +156,7 @@ struct srp_target_port {
>   	char			target_name[32];
>   	unsigned int		scsi_id;
>   	unsigned int		sg_tablesize;
> +	int			comp_vector;
>   
>   	struct ib_sa_path_rec	path;
>   	__be16			orig_dgid[8];

Bart Van Assche July 15, 2013, 11:06 a.m. UTC | #2
On 14/07/2013 3:43, Sagi Grimberg wrote:
> On 7/3/2013 3:58 PM, Bart Van Assche wrote:
>> Several InfiniBand HCA's allow to configure the completion vector
>> per queue pair. This allows to spread the workload created by IB
>> completion interrupts over multiple MSI-X vectors and hence over
>> multiple CPU cores. In other words, configuring the completion
>> vector properly not only allows to reduce latency on an initiator
>> connected to multiple SRP targets but also allows to improve
>> throughput.
>
> Hey Bart,
> Just wrote a small patch to allow srp_daemon spread connection across
> HCA's completion vectors.
> But re-thinking on this, is it really a good idea to give the user
> control over completion
> vectors for CQs he doesn't really owns. This way the user must retrieve
> the maximum completion
> vectors from the ib_device and consider this when adding a connection
> and In addition will need to set proper IRQ affinity.
>
> Perhaps the driver can manage this on it's own without involving the
> user, take the mlx4_en driver for
> example, it spreads it's CQs across HCAs completion vectors without
> involving the user. the user that
> opens a socket has no influence of the underlying cq<->comp-vector
> assignment.
>
> The only use-case I can think of is where the user will want to use only
> a subset of the completion-vectors
> if the user will want to reserve some completion-vectors for native IB
> applications but I don't know
> how common it is.
>
> Other from that, I think it is always better to spread the CQs across
> HCA completion-vectors, so perhaps the driver
> just assign connection CQs across comp-vecs without getting args from
> the user, but simply iterate over comp_vectors.
>
> What do you think?

Hello Sagi,

Sorry, but I do not think it is a good idea to let srp_daemon assign the
completion vector. While this might work well on single-socket systems, it
will lead to suboptimal results on NUMA systems. For certain workloads on
NUMA systems, and when a NUMA initiator system is connected to multiple
target systems, the optimal configuration is to make sure that all
processing associated with a single SCSI host occurs on the same NUMA node.
This means configuring the completion vector value such that IB interrupts
are generated on the same NUMA node where the associated SCSI host and
applications are running.

More generally, performance tuning on NUMA systems requires system-wide
knowledge of all applications that are running and also of which interrupt
is processed by which NUMA node. So choosing a proper value for the
completion vector is only possible once the system topology and the IRQ
affinity masks are known. I don't think we should build knowledge of all
this into srp_daemon.

Bart.

Sagi Grimberg July 15, 2013, 1:29 p.m. UTC | #3
On 7/15/2013 2:06 PM, Bart Van Assche wrote:
> On 14/07/2013 3:43, Sagi Grimberg wrote:
>> On 7/3/2013 3:58 PM, Bart Van Assche wrote:
>>> Several InfiniBand HCA's allow to configure the completion vector
>>> per queue pair. This allows to spread the workload created by IB
>>> completion interrupts over multiple MSI-X vectors and hence over
>>> multiple CPU cores. In other words, configuring the completion
>>> vector properly not only allows to reduce latency on an initiator
>>> connected to multiple SRP targets but also allows to improve
>>> throughput.
>>
>> Hey Bart,
>> Just wrote a small patch to allow srp_daemon spread connection across
>> HCA's completion vectors.
>> But re-thinking on this, is it really a good idea to give the user
>> control over completion
>> vectors for CQs he doesn't really owns. This way the user must retrieve
>> the maximum completion
>> vectors from the ib_device and consider this when adding a connection
>> and In addition will need to set proper IRQ affinity.
>>
>> Perhaps the driver can manage this on it's own without involving the
>> user, take the mlx4_en driver for
>> example, it spreads it's CQs across HCAs completion vectors without
>> involving the user. the user that
>> opens a socket has no influence of the underlying cq<->comp-vector
>> assignment.
>>
>> The only use-case I can think of is where the user will want to use only
>> a subset of the completion-vectors
>> if the user will want to reserve some completion-vectors for native IB
>> applications but I don't know
>> how common it is.
>>
>> Other from that, I think it is always better to spread the CQs across
>> HCA completion-vectors, so perhaps the driver
>> just assign connection CQs across comp-vecs without getting args from
>> the user, but simply iterate over comp_vectors.
>>
>> What do you think?
>
> Hello Sagi,
>
> Sorry but I do not think it is a good idea to let srp_daemon assign 
> the completion vector. While this might work well on single-socket 
> systems this will result in suboptimal results on NUMA systems. For 
> certain workloads on NUMA systems, and when a NUMA initiator system is 
> connected to multiple target systems, the optimal configuration is to 
> make sure that all processing that is associated with a single SCSI 
> host occurs on the same NUMA node. This means configuring the 
> completion vector value such that IB interrupts are generated on the 
> same NUMA node where the associated SCSI host and applications are 
> running.
>
> More in general, performance tuning on NUMA systems requires 
> system-wide knowledge of all applications that are running and also of 
> which interrupt is processed by which NUMA node. So choosing a proper 
> value for the completion vector is only possible once the system 
> topology and the IRQ affinity masks are known. I don't think we should 
> build knowledge of all this in srp_daemon.
>
> Bart.
>

Hey Bart,

Thanks for your quick attention to my question.
srp_daemon is a package designated for the customer to automatically detect
targets in the IB fabric. From our experience here at Mellanox,
customers/users like automatic "plug & play" tools. They are reluctant to
build their own scriptology to enhance performance and settle for
srp_daemon, which is preferred over using ibsrpdm and manually adding new
targets. Regardless, the completion vector assignment is meaningless without
setting proper IRQ affinity, so in the worst case, where the user didn't set
his IRQ affinity, this assignment will perform like the default completion
vector assignment, since without any masking all IRQs are directed to
core 0.

From my experiments on NUMA systems, optimal performance is gained when all
IRQs are directed to half of the cores on the NUMA node close to the HCA,
and all traffic generators share the other half of the cores on the same
NUMA node. So based on that knowledge, I thought that srp_daemon/the srp
driver would assign its CQs across the HCA's completion vectors, and the
user would be encouraged to set the IRQ affinity as described above to gain
optimal performance. Adding connections over the far NUMA node doesn't seem
to benefit performance much...

As I mentioned, a use case I see that may raise a problem here is one where
the user would like to maintain multiple SRP connections and reserve some
completion vectors for other IB applications on the system; in that case the
user would be able to disable the srp_daemon/srp driver completion vector
assignment.

So, this was just an idea, and an easy implementation that would potentially
give the user a semi-automatic, performance-optimized configuration...

-Sagi
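
A small sketch of the IRQ pinning described above, with made-up MSI-X IRQ
numbers (60-67) and a made-up cpumask covering cores 0-3; the real IRQ
numbers have to be looked up in /proc/interrupts and the mask chosen to
match the HCA-local NUMA node.

/* Direct a group of HCA MSI-X interrupts to a fixed set of cores. */
#include <stdio.h>

int main(void)
{
	const int first_irq = 60, nr_irqs = 8;	/* placeholder IRQ numbers */
	const char *cpumask = "f";		/* cores 0-3, hex cpumask */
	char path[64];
	int i;

	for (i = 0; i < nr_irqs; i++) {
		FILE *f;

		snprintf(path, sizeof(path), "/proc/irq/%d/smp_affinity",
			 first_irq + i);
		f = fopen(path, "w");
		if (!f) {
			perror(path);
			return 1;
		}
		fprintf(f, "%s\n", cpumask);
		fclose(f);
	}
	return 0;
}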
Bart Van Assche July 15, 2013, 6:23 p.m. UTC | #4
On 15/07/2013 7:29, Sagi Grimberg wrote:
> srp_daemon is a package designated for the customer to automatically
> detect targets in the IB fabric. From our experience here in Mellanox,
> customers/users like automatic "plug&play" tools.
> They are reluctant to build their own scriptology to enhance performance
> and settle with srp_daemon which is preferred over use of ibsrpdm and
> manual adding new targets.
> Regardless, the completion vectors assignment is meaningless without
> setting proper IRQ affinity, so in the worst case where the user didn't
> set his IRQ affinity,
> this assignment will perform like the default completion vector
> assignment as all IRQs are directed without any masking i.e. core 0.
>
> From my experiments in NUMA systems, optimal performance is gained
> where all IRQs are directed to half of the cores on the NUMA node close
> to the HCA, and all traffic generators share the other half of the cores
> on the same NUMA node. So based on that knowledge, I thought that
> srp_daemon/srp driver will assign it's CQs across the HCAs completion
> vectors, and the user is encouraged to set the IRQ affinity as described
> above to gain optimal performance.
> Adding connections over the far NUMA node don't seem to benefit
> performance too much...
>
> As I mentioned, a use-case I see that may raise a problem here, is if
> the user would like to maintain multiple SRP connections and reserve
> some completion vectors for other IB applications on the system.
> in this case the user will be able to disable srp_daemon/srp driver
> completion vectors assignment.
>
> So, this was just an idea, and easy implementation that would
> potentially give the user semi-automatic performance optimized
> configuration...

Hello Sagi,

I agree with you that it would help a lot if completion vector assignment
could be automated such that end users do not have to care about assigning
completion vector numbers. The challenge is to find an approach that is
general enough to work for all possible use cases. One possible approach is
to let a tool that has knowledge about the application fill in completion
vector numbers in srp_daemon.conf and let srp_daemon use the values
generated by this tool. That approach would avoid srp_daemon having to have
any knowledge about the application but would still allow srp_daemon to
assign the completion vector numbers.

Bart.
Sagi Grimberg July 16, 2013, 10:11 a.m. UTC | #5
On 7/15/2013 9:23 PM, Bart Van Assche wrote:
> On 15/07/2013 7:29, Sagi Grimberg wrote:
>> srp_daemon is a package designated for the customer to automatically
>> detect targets in the IB fabric. From our experience here in Mellanox,
>> customers/users like automatic "plug&play" tools.
>> They are reluctant to build their own scriptology to enhance performance
>> and settle with srp_daemon which is preferred over use of ibsrpdm and
>> manual adding new targets.
>> Regardless, the completion vectors assignment is meaningless without
>> setting proper IRQ affinity, so in the worst case where the user didn't
>> set his IRQ affinity,
>> this assignment will perform like the default completion vector
>> assignment as all IRQs are directed without any masking i.e. core 0.
>>
>> From my experiments in NUMA systems, optimal performance is gained
>> where all IRQs are directed to half of the cores on the NUMA node close
>> to the HCA, and all traffic generators share the other half of the cores
>> on the same NUMA node. So based on that knowledge, I thought that
>> srp_daemon/srp driver will assign it's CQs across the HCAs completion
>> vectors, and the user is encouraged to set the IRQ affinity as described
>> above to gain optimal performance.
>> Adding connections over the far NUMA node don't seem to benefit
>> performance too much...
>>
>> As I mentioned, a use-case I see that may raise a problem here, is if
>> the user would like to maintain multiple SRP connections and reserve
>> some completion vectors for other IB applications on the system.
>> in this case the user will be able to disable srp_daemon/srp driver
>> completion vectors assignment.
>>
>> So, this was just an idea, and easy implementation that would
>> potentially give the user semi-automatic performance optimized
>> configuration...
>
> Hello Sagi,
>
> I agree with you that it would help a lot if completion vector 
> assignment could be automated such that end users do not have to care 
> about assigning completion vector numbers. The challenge is to find an 
> approach that is general enough such that it works for all possible 
> use cases. One possible approach is to let a tool that has knowledge 
> about the application fill in completion vector numbers in 
> srp_daemon.conf and let srp_daemon use the values generated by this 
> tool. That approach would avoid that srp_daemon has to have any 
> knowledge about the application but would still allow srp_daemon to 
> assign the completion vector numbers.
>
> Bart.

Hey Bart,
This sounds like a nice idea, but there is an inherent problem: applications
come and go while the connections are (somewhat) static, so how can you
control the pinning of an arbitrary application running (over SRP devices,
of course) at a certain point in time?

So will you agree at least to give target->comp_vector a default of
IB_CQ_VECTOR_LEAST_ATTACHED?
From my point of view, for a user that doesn't have the slightest clue about
completion vectors and performance optimization, this is somewhat better
than doing nothing...

-Sagi
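
IB_CQ_VECTOR_LEAST_ATTACHED comes from out-of-tree OFED code (see the
follow-up below), so purely as an illustration of its semantics, a
driver-side default could look like the sketch that follows; cq_count[] is a
hypothetical per-vector counter, not an existing srp_device field.

/*
 * Sketch of a "least attached" default: pick the completion vector
 * that currently has the fewest CQs bound to it, assuming the driver
 * tracked that count per vector.
 */
static int srp_least_attached_vector(struct srp_device *srp_dev)
{
	int v, best = 0;

	for (v = 1; v < srp_dev->dev->num_comp_vectors; v++)
		if (srp_dev->cq_count[v] < srp_dev->cq_count[best])
			best = v;
	return best;
}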
Bart Van Assche July 16, 2013, 10:58 a.m. UTC | #6
On 16/07/2013 4:11, Sagi Grimberg wrote:
> This sounds like a nice Idea, but there an inherent problem about
> applications coming and going while the connections are static (somewhat),
> how can you control pinning an arbitrary application running (over SRP
> devices of-course) at certain point of time.
>
> So will you agree at least to give target->comp_vector a default of
> IB_CQ_VECTOR_LEAST_ATTACHED?
>  From my point of view, a user that don't have a slightest clue about
> completion vectors and performance optimization, this is somewhat better
> than doing nothing...

Hello Sagi,

That sounds like an interesting proposal to me. But did the patch that adds
the IB_CQ_VECTOR_LEAST_ATTACHED feature ever get accepted into the upstream
Linux kernel? I have tried to find that symbol in Linux kernel v3.11-rc1 but
could not find it. Maybe I have overlooked something?

Bart.

Sagi Grimberg July 16, 2013, 12:41 p.m. UTC | #7
On 7/16/2013 1:58 PM, Bart Van Assche wrote:
> On 16/07/2013 4:11, Sagi Grimberg wrote:
>> This sounds like a nice Idea, but there an inherent problem about
>> applications coming and going while the connections are static 
>> (somewhat),
>> how can you control pinning an arbitrary application running (over SRP
>> devices of-course) at certain point of time.
>>
>> So will you agree at least to give target->comp_vector a default of
>> IB_CQ_VECTOR_LEAST_ATTACHED?
>>  From my point of view, a user that don't have a slightest clue about
>> completion vectors and performance optimization, this is somewhat better
>> than doing nothing...
>
> Hello Sagi,
>
> That sounds like an interesting proposal to me. But did the patch that 
> adds the IB_CQ_VECTOR_LEAST_ATTACHED feature ever get accepted in the 
> upstream Linux kernel ? I have tried to find that symbol in Linux 
> kernel v3.11-rc1 but couldn't find it. Maybe I have overlooked 
> something ?
>
> Bart.
>

Oh you're right!

I'll ask Vu; from git blame on old OFED I see that he wrote the code...
Perhaps this should be added as well.

-Sagi
Bart Van Assche July 16, 2013, 3:11 p.m. UTC | #8
On 14/07/2013 3:43, Sagi Grimberg wrote:
> Just wrote a small patch to allow srp_daemon spread connection across
> HCA's completion vectors.

Hello Sagi,

How about the following approach:
- Add support for reading the completion vector from srp_daemon.conf,
   similar to how several other parameters are already read from that
   file.
- If the completion vector parameter has not been set in
   srp_daemon.conf, let srp_daemon assign a completion vector such that
   IB interrupts for different SRP hosts use different completion
   vectors.

Bart.
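
A userspace sketch of the second bullet, purely illustrative and not the
actual srp_daemon code: query the number of completion vectors through
libibverbs and append a different comp_vector= value to each login string.
build_target_string() and base_params stand in for wherever srp_daemon
composes its target parameters.

#include <stdio.h>
#include <infiniband/verbs.h>

/* Hypothetical helper: cycle over the vectors the HCA exposes. */
static int next_comp_vector(struct ibv_context *ctx)
{
	static int next;

	return next++ % ctx->num_comp_vectors;
}

/*
 * Compose the add_target login string for a newly discovered target,
 * giving each SRP host a different completion vector.
 */
static void build_target_string(struct ibv_context *ctx,
				const char *base_params,
				char *buf, size_t len)
{
	snprintf(buf, len, "%s,comp_vector=%d",
		 base_params, next_comp_vector(ctx));
}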
Sagi Grimberg July 17, 2013, 9:27 a.m. UTC | #9
On 7/16/2013 6:11 PM, Bart Van Assche wrote:
> On 14/07/2013 3:43, Sagi Grimberg wrote:
>> Just wrote a small patch to allow srp_daemon spread connection across
>> HCA's completion vectors.
>
> Hello Sagi,
>
> How about the following approach:
> - Add support for reading the completion vector from srp_daemon.conf,
>   similar to how several other parameters are already read from that
>   file.

Here we need to take into consideration that we are changing the
functionality of srp_daemon.conf. Instead of simply allowing/disallowing
targets with specific attributes, we are now also defining configuration
attributes for the allowed targets. It might be uncomfortable for the user
to explicitly write N target strings in srp_daemon.conf just for completion
vector assignment.

Perhaps srp_daemon.conf could contain a (comma-separated) list of reserved
completion vectors for srp_daemon to spread CQs among. If this line does not
exist, srp_daemon will spread the assignment over all of the HCA's
completion vectors.

> - If the completion vector parameter has not been set in
>   srp_daemon.conf, let srp_daemon assign a completion vector such that
>   IB interrupts for different SRP hosts use different completion
>   vectors.
>
> Bart.
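
A small sketch of parsing such a reserved-vector line from srp_daemon.conf;
the comp_vectors= option name, the separators and the buffer sizes are
illustrative assumptions only.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse a hypothetical "comp_vectors=0,2,4,6" line into vec[]. */
static int parse_comp_vectors(const char *line, int *vec, int max)
{
	char buf[256], *p, *save = NULL;
	int n = 0;

	if (strncmp(line, "comp_vectors=", strlen("comp_vectors=")))
		return -1;	/* not the line we are looking for */
	snprintf(buf, sizeof(buf), "%s", line + strlen("comp_vectors="));
	for (p = strtok_r(buf, ", \n", &save); p && n < max;
	     p = strtok_r(NULL, ", \n", &save))
		vec[n++] = atoi(p);
	return n;		/* number of reserved vectors found */
}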


Patch

diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp
index 481aae9..5c53d28 100644
--- a/Documentation/ABI/stable/sysfs-driver-ib_srp
+++ b/Documentation/ABI/stable/sysfs-driver-ib_srp
@@ -54,6 +54,13 @@  Description:	Interface for making ib_srp connect to a new target.
 		  ib_srp. Specifying a value that exceeds cmd_sg_entries is
 		  only safe with partial memory descriptor list support enabled
 		  (allow_ext_sg=1).
+		* comp_vector, a number in the range 0..n-1 specifying the
+		  MSI-X completion vector. Some HCA's allocate multiple (n)
+		  MSI-X vectors per HCA port. If the IRQ affinity masks of
+		  these interrupts have been configured such that each MSI-X
+		  interrupt is handled by a different CPU then the comp_vector
+		  parameter can be used to spread the SRP completion workload
+		  over multiple CPU's.
 
 What:		/sys/class/infiniband_srp/srp-<hca>-<port_number>/ibdev
 Date:		January 2, 2006
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 2557b7a..6c164f6 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -294,14 +294,16 @@  static int srp_create_target_ib(struct srp_target_port *target)
 		return -ENOMEM;
 
 	recv_cq = ib_create_cq(target->srp_host->srp_dev->dev,
-			       srp_recv_completion, NULL, target, SRP_RQ_SIZE, 0);
+			       srp_recv_completion, NULL, target, SRP_RQ_SIZE,
+			       target->comp_vector);
 	if (IS_ERR(recv_cq)) {
 		ret = PTR_ERR(recv_cq);
 		goto err;
 	}
 
 	send_cq = ib_create_cq(target->srp_host->srp_dev->dev,
-			       srp_send_completion, NULL, target, SRP_SQ_SIZE, 0);
+			       srp_send_completion, NULL, target, SRP_SQ_SIZE,
+			       target->comp_vector);
 	if (IS_ERR(send_cq)) {
 		ret = PTR_ERR(send_cq);
 		goto err_recv_cq;
@@ -1976,6 +1978,14 @@  static ssize_t show_local_ib_device(struct device *dev,
 	return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name);
 }
 
+static ssize_t show_comp_vector(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+	return sprintf(buf, "%d\n", target->comp_vector);
+}
+
 static ssize_t show_cmd_sg_entries(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
@@ -2002,6 +2012,7 @@  static DEVICE_ATTR(req_lim,         S_IRUGO, show_req_lim,         NULL);
 static DEVICE_ATTR(zero_req_lim,    S_IRUGO, show_zero_req_lim,	   NULL);
 static DEVICE_ATTR(local_ib_port,   S_IRUGO, show_local_ib_port,   NULL);
 static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL);
+static DEVICE_ATTR(comp_vector,     S_IRUGO, show_comp_vector,     NULL);
 static DEVICE_ATTR(cmd_sg_entries,  S_IRUGO, show_cmd_sg_entries,  NULL);
 static DEVICE_ATTR(allow_ext_sg,    S_IRUGO, show_allow_ext_sg,    NULL);
 
@@ -2016,6 +2027,7 @@  static struct device_attribute *srp_host_attrs[] = {
 	&dev_attr_zero_req_lim,
 	&dev_attr_local_ib_port,
 	&dev_attr_local_ib_device,
+	&dev_attr_comp_vector,
 	&dev_attr_cmd_sg_entries,
 	&dev_attr_allow_ext_sg,
 	NULL
@@ -2140,6 +2152,7 @@  enum {
 	SRP_OPT_CMD_SG_ENTRIES	= 1 << 9,
 	SRP_OPT_ALLOW_EXT_SG	= 1 << 10,
 	SRP_OPT_SG_TABLESIZE	= 1 << 11,
+	SRP_OPT_COMP_VECTOR	= 1 << 12,
 	SRP_OPT_ALL		= (SRP_OPT_ID_EXT	|
 				   SRP_OPT_IOC_GUID	|
 				   SRP_OPT_DGID		|
@@ -2160,6 +2173,7 @@  static const match_table_t srp_opt_tokens = {
 	{ SRP_OPT_CMD_SG_ENTRIES,	"cmd_sg_entries=%u"	},
 	{ SRP_OPT_ALLOW_EXT_SG,		"allow_ext_sg=%u"	},
 	{ SRP_OPT_SG_TABLESIZE,		"sg_tablesize=%u"	},
+	{ SRP_OPT_COMP_VECTOR,		"comp_vector=%u"	},
 	{ SRP_OPT_ERR,			NULL 			}
 };
 
@@ -2315,6 +2329,14 @@  static int srp_parse_options(const char *buf, struct srp_target_port *target)
 			target->sg_tablesize = token;
 			break;
 
+		case SRP_OPT_COMP_VECTOR:
+			if (match_int(args, &token) || token < 0) {
+				pr_warn("bad comp_vector parameter '%s'\n", p);
+				goto out;
+			}
+			target->comp_vector = token;
+			break;
+
 		default:
 			pr_warn("unknown parameter or missing value '%s' in target creation request\n",
 				p);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index e45d9d0..cbc0b14 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -156,6 +156,7 @@  struct srp_target_port {
 	char			target_name[32];
 	unsigned int		scsi_id;
 	unsigned int		sg_tablesize;
+	int			comp_vector;
 
 	struct ib_sa_path_rec	path;
 	__be16			orig_dgid[8];