diff mbox

[05/15] pci: resource assignment based on p2p alignment

Message ID 20120717052333.GE2369@ram-ThinkPad-T61 (mailing list archive)
State New, archived
Delegated to: Bjorn Helgaas
Headers show

Commit Message

Ram Pai July 17, 2012, 5:23 a.m. UTC
On Tue, Jul 17, 2012 at 01:05:47PM +0800, Ram Pai wrote:
> On Tue, Jul 17, 2012 at 10:23:17AM +0800, Gavin Shan wrote:
> > The patch changes function pbus_size_io() and pbus_size_mem() to
> > do resource (I/O, memory and prefetchable memory) reassignment
> > based on the minimal alignments for the p2p bridge, which was
> > retrieved by function window_alignment().
> > 
> > Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> > ---
> >  drivers/pci/setup-bus.c |   13 +++++++++----
> >  1 files changed, 9 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
> > index c0fb9da..a29483a 100644
> > --- a/drivers/pci/setup-bus.c
> > +++ b/drivers/pci/setup-bus.c
> > @@ -731,6 +731,7 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
> >  	struct resource *b_res = find_free_bus_resource(bus, IORESOURCE_IO);
> >  	unsigned long size = 0, size0 = 0, size1 = 0;
> >  	resource_size_t children_add_size = 0;
> > +	resource_size_t io_align;
> > 
> >  	if (!b_res)
> >   		return;
> > @@ -756,13 +757,15 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
> >  				children_add_size += get_res_add_size(realloc_head, r);
> >  		}
> >  	}
> > +
> > +	io_align = window_alignment(bus, IORESOURCE_IO);
> 
> this should also be
> 	io_align = max(4096, window_alignment(bus, IORESOURCE_IO));
> 
> right?
> 
> 
> >  	size0 = calculate_iosize(size, min_size, size1,
> > -			resource_size(b_res), 4096);
> > +			resource_size(b_res), io_align);
> >  	if (children_add_size > add_size)
> >  		add_size = children_add_size;
> >  	size1 = (!realloc_head || (realloc_head && !add_size)) ? size0 :
> >  		calculate_iosize(size, min_size, add_size + size1,
> > -			resource_size(b_res), 4096);
> > +			resource_size(b_res), io_align);
> >  	if (!size0 && !size1) {
> >  		if (b_res->start || b_res->end)
> >  			dev_info(&bus->self->dev, "disabling bridge window "
> > @@ -772,11 +775,11 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
> >  		return;
> >  	}
> >  	/* Alignment of the IO window is always 4K */
> > -	b_res->start = 4096;
> > +	b_res->start = io_align;
> >  	b_res->end = b_res->start + size0 - 1;
> >  	b_res->flags |= IORESOURCE_STARTALIGN;
> >  	if (size1 > size0 && realloc_head) {
> > -		add_to_list(realloc_head, bus->self, b_res, size1-size0, 4096);
> > +		add_to_list(realloc_head, bus->self, b_res, size1-size0, io_align);
> >  		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window "
> >  				 "%pR to [bus %02x-%02x] add_size %lx\n", b_res,
> >  				 bus->secondary, bus->subordinate, size1-size0);
> > @@ -875,6 +878,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
> >  			min_align = align1 >> 1;
> >  		align += aligns[order];
> >  	}
> > +
> > +	min_align = max(min_align, window_alignment(bus, type));
> 
>   'type' can sometimes be (IORESOURCE_MEM | IORESOURCE_PREFETCH), which
>   can lead to unpredictable results depending on how window_alignment()
>   is implemented... Hence to be on the safer side I suggest
> 
> 	min_align = max(min_align, window_alignment(bus, b_res->flags & mask));

While you are at it, can we move the min_align calculation into
a separate function?

Somewhat around the following patch


More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Ram Pai July 17, 2012, 5:57 a.m. UTC | #1
On Tue, Jul 17, 2012 at 01:36:49PM +0800, Gavin Shan wrote:
> On Tue, Jul 17, 2012 at 01:23:33PM +0800, Ram Pai wrote:
> >On Tue, Jul 17, 2012 at 01:05:47PM +0800, Ram Pai wrote:
> >> On Tue, Jul 17, 2012 at 10:23:17AM +0800, Gavin Shan wrote:
> >> > The patch changes function pbus_size_io() and pbus_size_mem() to
> >> > do resource (I/O, memory and prefetchable memory) reassignment
> >> > based on the minimal alignments for the p2p bridge, which was
> >> > retrieved by function window_alignment().
> >> > 
> >> > Signed-off-by: Gavin Shan <shangw@linux.vnet.ibm.com>
> 
> [snip]
> 
> >> > @@ -772,11 +775,11 @@ static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
> >> >  		return;
> >> >  	}
> >> >  	/* Alignment of the IO window is always 4K */
> >> > -	b_res->start = 4096;
> >> > +	b_res->start = io_align;
> >> >  	b_res->end = b_res->start + size0 - 1;
> >> >  	b_res->flags |= IORESOURCE_STARTALIGN;
> >> >  	if (size1 > size0 && realloc_head) {
> >> > -		add_to_list(realloc_head, bus->self, b_res, size1-size0, 4096);
> >> > +		add_to_list(realloc_head, bus->self, b_res, size1-size0, io_align);
> >> >  		dev_printk(KERN_DEBUG, &bus->self->dev, "bridge window "
> >> >  				 "%pR to [bus %02x-%02x] add_size %lx\n", b_res,
> >> >  				 bus->secondary, bus->subordinate, size1-size0);
> >> > @@ -875,6 +878,8 @@ static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
> >> >  			min_align = align1 >> 1;
> >> >  		align += aligns[order];
> >> >  	}
> >> > +
> >> > +	min_align = max(min_align, window_alignment(bus, type));
> >> 
> >>   'type' can sometimes be (IORESOURCE_MEM | IORESOURCE_PREFETCH), which
> >>   can lead to unpredictable results depending on how window_alignment()
> >>   is implemented... Hence to be on the safer side I suggest
> >> 
> >> 	min_align = max(min_align, window_alignment(bus, b_res->flags & mask));
> 
> Sorry, Ram. I didn't see your concern in last reply. So I have to
> cover your conver in this reply.
> 
> I think it'd better to pass "type" directly because platform (e.g. powernv)
> expects both IORESOURCE_MEM as well as IORESOURCE_PREFETCH. 
> In future, powernv platform will return M32 segment size for IORESOURCE_MEM, but
> might return M64 segment size for (IORESOURCE_MEM | IORESOURCE_PREFETCH).

Hmm.. this code is not about determining what kind of segment the
platform is returning. This code is about using the right alignment
constraints for the type of segment from which resource will be
allocated. right?

b_res is the resource that is being sized. b_res already knows
what kind of resource it is, i.e IORESOURCE_MEM or IORESOURCE_PREFETCH.
Hence we should be exactly using the same alignment constraints as
that dictated by the type of b_res. no?

RP

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt July 17, 2012, 9:16 a.m. UTC | #2
On Tue, 2012-07-17 at 13:57 +0800, Ram Pai wrote:
> Hmm.. this code is not about determining what kind of segment the
> platform is returning. This code is about using the right alignment
> constraints for the type of segment from which resource will be
> allocated. right?
> 
> b_res is the resource that is being sized. b_res already knows
> what kind of resource it is, i.e IORESOURCE_MEM or
> IORESOURCE_PREFETCH.
> Hence we should be exactly using the same alignment constraints as
> that dictated by the type of b_res. no? 

This is unclear.... ideally we want to know which of the host bridge
"apertures" is about to be used...

IE. A prefetchable resource can very well be allocated to a
non-prefetchable window, though the other way isn't supposed to happen.

Additionally, our PHB doesn't actually differenciate prefetchable and
non-prefetchable windows (whether you can prefetch or not is an
attribute of the CPU mapping, but basically non-cachable mappings are
never prefetchable for us).

So we can be lax in how we assign things between our single 32-bit
window divided in 128 segments and our 16x64-bit windows divided in 8
segments (and future HW will do thins differently even).

For example we would like in some cases to use M64's (64-bit windows) to
map SR-IOV BARs regardless of the "prefetchability" though that can only
work if we are not behind a PCIe switch, as those are technically
allowed to prefetch :-)

Worst is that the alignment constraint is based on the segment size, and
while we more/less fix the size of the 32-bit window, we plan to
dynamically allocate/resize the 64-bit ones which will mean variable
segment sizes as well.

So the more information you can get at that point, the better. The type
is useful because it allows us to know if you are trying to put a
prefetchable memory BAR inside a non-prefetchable region, in which case
we know it has to be in M32.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ram Pai July 17, 2012, 10:03 a.m. UTC | #3
On Tue, Jul 17, 2012 at 07:16:59PM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2012-07-17 at 13:57 +0800, Ram Pai wrote:
> > Hmm.. this code is not about determining what kind of segment the
> > platform is returning. This code is about using the right alignment
> > constraints for the type of segment from which resource will be
> > allocated. right?
> > 
> > b_res is the resource that is being sized. b_res already knows
> > what kind of resource it is, i.e IORESOURCE_MEM or
> > IORESOURCE_PREFETCH.
> > Hence we should be exactly using the same alignment constraints as
> > that dictated by the type of b_res. no? 
> 
> This is unclear.... ideally we want to know which of the host bridge
> "apertures" is about to be used...
> 
> IE. A prefetchable resource can very well be allocated to a
> non-prefetchable window, though the other way isn't supposed to happen.
> 
> Additionally, our PHB doesn't actually differenciate prefetchable and
> non-prefetchable windows (whether you can prefetch or not is an
> attribute of the CPU mapping, but basically non-cachable mappings are
> never prefetchable for us).
> 
> So we can be lax in how we assign things between our single 32-bit
> window divided in 128 segments and our 16x64-bit windows divided in 8
> segments (and future HW will do thins differently even).
> 
> For example we would like in some cases to use M64's (64-bit windows) to
> map SR-IOV BARs regardless of the "prefetchability" though that can only
> work if we are not behind a PCIe switch, as those are technically
> allowed to prefetch :-)
> 
> Worst is that the alignment constraint is based on the segment size, and
> while we more/less fix the size of the 32-bit window, we plan to
> dynamically allocate/resize the 64-bit ones which will mean variable
> segment sizes as well.
> 
> So the more information you can get at that point, the better. The type
> is useful because it allows us to know if you are trying to put a
> prefetchable memory BAR inside a non-prefetchable region, in which case
> we know it has to be in M32.


Ben,
	Lets say we passed that 'type' flag to size the minimum
	alignment constraints for that b_res.  And window_alignment(bus,
	type) of your platform  used that 'type' information to
	determine whether to use the alignment constraints of 32-bit
	window or 64-bit window.

	However, later when the b_res is actually allocated a resource,
	the pci_assign_resource() has no idea whether to allocate 32-bit
	window resource or 64-bit window resource, because the 'type'
	information is not captured anywhere in b_res.

	You would basically have a disconnect between what is sized and 
	what is allocated. Unless offcourse you pass that 'type' to
	the b_res->flags, which is currently not the case.

RP

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Benjamin Herrenschmidt July 17, 2012, 10:38 a.m. UTC | #4
On Tue, 2012-07-17 at 18:03 +0800, Ram Pai wrote:
>         Lets say we passed that 'type' flag to size the minimum
>         alignment constraints for that b_res.  And window_alignment(bus,
>         type) of your platform  used that 'type' information to
>         determine whether to use the alignment constraints of 32-bit
>         window or 64-bit window.
> 
>         However, later when the b_res is actually allocated a resource,
>         the pci_assign_resource() has no idea whether to allocate 32-bit
>         window resource or 64-bit window resource, because the 'type'
>         information is not captured anywhere in b_res.
> 
>         You would basically have a disconnect between what is sized and 
>         what is allocated. Unless offcourse you pass that 'type' to
>         the b_res->flags, which is currently not the case. 

Right, we ideally would need the core to query the alignment once per
"apertures" it tries as a candidate to allocate a given resource... but
that's for later.

For now we can probably live with giving out the max of the minimum
alignment we support for M64 and our M32 segment size.

Cheers,
Ben.


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas July 17, 2012, 5:14 p.m. UTC | #5
On Tue, Jul 17, 2012 at 4:38 AM, Benjamin Herrenschmidt
<benh@kernel.crashing.org> wrote:
> On Tue, 2012-07-17 at 18:03 +0800, Ram Pai wrote:
>>         Lets say we passed that 'type' flag to size the minimum
>>         alignment constraints for that b_res.  And window_alignment(bus,
>>         type) of your platform  used that 'type' information to
>>         determine whether to use the alignment constraints of 32-bit
>>         window or 64-bit window.
>>
>>         However, later when the b_res is actually allocated a resource,
>>         the pci_assign_resource() has no idea whether to allocate 32-bit
>>         window resource or 64-bit window resource, because the 'type'
>>         information is not captured anywhere in b_res.
>>
>>         You would basically have a disconnect between what is sized and
>>         what is allocated. Unless offcourse you pass that 'type' to
>>         the b_res->flags, which is currently not the case.
>
> Right, we ideally would need the core to query the alignment once per
> "apertures" it tries as a candidate to allocate a given resource... but
> that's for later.
>
> For now we can probably live with giving out the max of the minimum
> alignment we support for M64 and our M32 segment size.

We already know the aperture we're proposing to allocate from (the
result of find_free_bus_resource()), don't we?  What if we passed it
to pcibios_window_alignment() along with the struct pci_bus *, e.g.:

  resource_size_t pcibios_window_alignment(struct pci_bus *bus, struct
resource *window)
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ram Pai July 18, 2012, 4:25 a.m. UTC | #6
On Tue, Jul 17, 2012 at 11:14:51AM -0600, Bjorn Helgaas wrote:
> On Tue, Jul 17, 2012 at 4:38 AM, Benjamin Herrenschmidt
> <benh@kernel.crashing.org> wrote:
> > On Tue, 2012-07-17 at 18:03 +0800, Ram Pai wrote:
> >>         Lets say we passed that 'type' flag to size the minimum
> >>         alignment constraints for that b_res.  And window_alignment(bus,
> >>         type) of your platform  used that 'type' information to
> >>         determine whether to use the alignment constraints of 32-bit
> >>         window or 64-bit window.
> >>
> >>         However, later when the b_res is actually allocated a resource,
> >>         the pci_assign_resource() has no idea whether to allocate 32-bit
> >>         window resource or 64-bit window resource, because the 'type'
> >>         information is not captured anywhere in b_res.
> >>
> >>         You would basically have a disconnect between what is sized and
> >>         what is allocated. Unless offcourse you pass that 'type' to
> >>         the b_res->flags, which is currently not the case.
> >
> > Right, we ideally would need the core to query the alignment once per
> > "apertures" it tries as a candidate to allocate a given resource... but
> > that's for later.
> >
> > For now we can probably live with giving out the max of the minimum
> > alignment we support for M64 and our M32 segment size.
> 
> We already know the aperture we're proposing to allocate from (the
> result of find_free_bus_resource()), don't we?  What if we passed it
> to pcibios_window_alignment() along with the struct pci_bus *, e.g.:
> 
>   resource_size_t pcibios_window_alignment(struct pci_bus *bus, struct
> resource *window)

Hmm..'struct resource *window' may not yet know which aperature it is
allocated from. Will it? We are still in the sizing process, the allocation will
be done much later.

RP

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ram Pai July 18, 2012, 4:28 a.m. UTC | #7
On Tue, Jul 17, 2012 at 08:38:18PM +1000, Benjamin Herrenschmidt wrote:
> On Tue, 2012-07-17 at 18:03 +0800, Ram Pai wrote:
> >         Lets say we passed that 'type' flag to size the minimum
> >         alignment constraints for that b_res.  And window_alignment(bus,
> >         type) of your platform  used that 'type' information to
> >         determine whether to use the alignment constraints of 32-bit
> >         window or 64-bit window.
> > 
> >         However, later when the b_res is actually allocated a resource,
> >         the pci_assign_resource() has no idea whether to allocate 32-bit
> >         window resource or 64-bit window resource, because the 'type'
> >         information is not captured anywhere in b_res.
> > 
> >         You would basically have a disconnect between what is sized and 
> >         what is allocated. Unless offcourse you pass that 'type' to
> >         the b_res->flags, which is currently not the case. 
> 
> Right, we ideally would need the core to query the alignment once per
> "apertures" it tries as a candidate to allocate a given resource... but
> that's for later.
> 
> For now we can probably live with giving out the max of the minimum
> alignment we support for M64 and our M32 segment size.

Its an approximation, which may not be terribly bad. But not comforting enough.

RP

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Ram Pai July 18, 2012, 5:02 a.m. UTC | #8
On Wed, Jul 18, 2012 at 09:07:46AM +0800, Gavin Shan wrote:
> 4. Either "b_res->flags & mask" or "type" to be used for window_alignment(),
>    I don't think it's big deal because "b_res->flags & mask == type" for
>    current implementation. However, I'm not sure I still need change it
>    to "type"?
> 
>    min_align = max(min_align, window_alignment(bus, b_res->flags & mask));

Ah. you are right.

(b_res->flags & mask) or type, either one is ok. I had a wrong
assumption about b_res->flags. I thought it has either IORESOURCE_MEM
set or IORESOURCE_PREFETCH set, but not both.

Whatever concern I had, i dont have it any more.
RP

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Bjorn Helgaas July 18, 2012, 4:59 p.m. UTC | #9
On Tue, Jul 17, 2012 at 10:25 PM, Ram Pai <linuxram@us.ibm.com> wrote:
> On Tue, Jul 17, 2012 at 11:14:51AM -0600, Bjorn Helgaas wrote:
>> On Tue, Jul 17, 2012 at 4:38 AM, Benjamin Herrenschmidt
>> <benh@kernel.crashing.org> wrote:
>> > On Tue, 2012-07-17 at 18:03 +0800, Ram Pai wrote:
>> >>         Lets say we passed that 'type' flag to size the minimum
>> >>         alignment constraints for that b_res.  And window_alignment(bus,
>> >>         type) of your platform  used that 'type' information to
>> >>         determine whether to use the alignment constraints of 32-bit
>> >>         window or 64-bit window.
>> >>
>> >>         However, later when the b_res is actually allocated a resource,
>> >>         the pci_assign_resource() has no idea whether to allocate 32-bit
>> >>         window resource or 64-bit window resource, because the 'type'
>> >>         information is not captured anywhere in b_res.
>> >>
>> >>         You would basically have a disconnect between what is sized and
>> >>         what is allocated. Unless offcourse you pass that 'type' to
>> >>         the b_res->flags, which is currently not the case.
>> >
>> > Right, we ideally would need the core to query the alignment once per
>> > "apertures" it tries as a candidate to allocate a given resource... but
>> > that's for later.
>> >
>> > For now we can probably live with giving out the max of the minimum
>> > alignment we support for M64 and our M32 segment size.
>>
>> We already know the aperture we're proposing to allocate from (the
>> result of find_free_bus_resource()), don't we?  What if we passed it
>> to pcibios_window_alignment() along with the struct pci_bus *, e.g.:
>>
>>   resource_size_t pcibios_window_alignment(struct pci_bus *bus, struct
>> resource *window)
>
> Hmm..'struct resource *window' may not yet know which aperature it is
> allocated from. Will it? We are still in the sizing process, the allocation will
> be done much later.

Of course, you're absolutely right; I had this backwards.  In
pbus_size_io/mem(), we do "b_res = find_free_bus_resource()", so b_res
is a bus resource that matches the desired type (IO/MEM).  This
resource represents an aperture of the upstream bridge leading to the
bus.  I was thinking that b_res->start would contain address
information that the arch could use to decide alignment.

But at this point, in pbus_size_io/mem(), we set "b_res->start =
min_align", so obviously b_res contains no information about the
window base that will eventually be assigned.  I think b_res is
basically the *container* into which we'll eventually put the P2P
aperture start/end, but here, we're using that container to hold the
information about the size and alignment we need for that aperture.

The fact that we deal with alignment in pbus_size_mem() and again in
__pci_assign_resource() (via pcibios_align_resource) is confusing to
me -- I don't have a clear idea of what sorts of alignment are done in
each place.  Could this powerpc alignment be done in
pcibios_align_resource()?  We do have the actual proposed address
there, as well as the pci_dev.

Bjorn
--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index 8fa2d4b..426c8ad6 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -762,6 +762,25 @@  static void pbus_size_io(struct pci_bus *bus, resource_size_t min_size,
 	}
 }
 
+static inline calculate_min_align(resource_size_t aligns *aligns, int max_order)
+{
+	resource_size_t align = 0;
+	resource_size_t min_align = 0;
+	int order;
+	for (order = 0; order <= max_order; order++) {
+		resource_size_t align1 = 1;
+
+		align1 <<= (order + 20);
+
+		if (!align)
+			min_align = align1;
+		else if (ALIGN(align + min_align, min_align) < align1)
+			min_align = align1 >> 1;
+		align += aligns[order];
+	}
+	return min_align;
+}
+
 /**
  * pbus_size_mem() - size the memory window of a given bus
  *
@@ -841,19 +860,9 @@  static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 				children_add_size += get_res_add_size(realloc_head, r);
 		}
 	}
-	align = 0;
-	min_align = 0;
-	for (order = 0; order <= max_order; order++) {
-		resource_size_t align1 = 1;
 
-		align1 <<= (order + 20);
+	min_align = calculate_min_align(aligns, max_order);
 
-		if (!align)
-			min_align = align1;
-		else if (ALIGN(align + min_align, min_align) < align1)
-			min_align = align1 >> 1;
-		align += aligns[order];
-	}
 	size0 = calculate_memsize(size, min_size, 0, resource_size(b_res), min_align);
 	if (children_add_size > add_size)
 		add_size = children_add_size;
@@ -880,6 +889,7 @@  static int pbus_size_mem(struct pci_bus *bus, unsigned long mask,
 	return 1;
 }

 RP

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org