diff mbox

spi: SPI_MASTER_MUST_* with scatter-gather only option and avoiding realloc

Message ID 1431597524-7907-1-git-send-email-kernel@martin.sperl.org (mailing list archive)
State New, archived
Headers show

Commit Message

Martin Sperl May 14, 2015, 9:58 a.m. UTC
From: Martin Sperl <kernel@martin.sperl.org>

Rewrite of spi_map_msg and spi_map_buf so that for SPI_MASTER_MUST_*:
* short transfers are handled by a page-sized buffer instead of
  reallocating and freeing memory (if smaller than PAGE_SIZE)
* with an extra set of flags allows to ONLY create a scatter/gather list
  that reuses the same page for all the transfers
  The scatter list produced is a match of the corresponding template
  scatter list (so tx-sg is the template for the rx-sg and vice versa)

It also fixes the insufficient cleanup in case __spi_map_msg returns
an error.

Signed-off-by: Martin Sperl <kernel@martin.sperl.org>
---
 drivers/spi/spi-bcm2835.c |    2 +-
 drivers/spi/spi.c         |  174 ++++++++++++++++++++++++++++++++++++++++-----
 include/linux/spi/spi.h   |   10 ++-
 3 files changed, 165 insertions(+), 21 deletions(-)

Comments

Martin Sperl May 14, 2015, 11:19 a.m. UTC | #1
> On 14.05.2015, at 11:58, kernel@martin.sperl.org wrote:
> 
> From: Martin Sperl <kernel@martin.sperl.org>
> 
> Rewrite of spi_map_msg and spi_map_buf so that for SPI_MASTER_MUST_*:
> * short transfers are handled by a page-sized buffer instead of
>  reallocating and freeing memory (if smaller than PAGE_SIZE)
> * with an extra set of flags allows to ONLY create a scatter/gather list
>  that reuses the same page for all the transfers
>  The scatter list produced is a match of the corresponding template
>  scatter list (so tx-sg is the template for the rx-sg and vice versa)
> 
> It also fixes the insufficient cleanup in case __spi_map_msg returns
> an error.
> 
> Signed-off-by: Martin Sperl <kernel@martin.sperl.org>
> ---

Forgot to mention:

Tested on a raspberry pi with the following devices:
* 2x mcp2515
* 1x mmc_spi (with dma)
* 1x enc28j60 (with dma)
* 1x fb_st7735r (with dma)

The fbtft-device played all of the BigBuckBunny movie without any issues.
The spi-ethernet card was able to transfer 64MB without hickups
the fsck of the attached sd card worked flawlessly.

--
To unsubscribe from this list: send the line "unsubscribe linux-spi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Brown May 19, 2015, 12:46 p.m. UTC | #2
On Thu, May 14, 2015 at 09:58:44AM +0000, kernel@martin.sperl.org wrote:
> From: Martin Sperl <kernel@martin.sperl.org>
> 
> Rewrite of spi_map_msg and spi_map_buf so that for SPI_MASTER_MUST_*:

This isn't just rewriting the internals, this is adding a completely new
API with separate code, fixing a bug, optimising the existing API and
adding a user of the new API.  I'd have expected all of that to be
called out in the changelog and for it to be split out into several
patches - especially the bug fix since that should go to Linus' tree and
stable.

Like I say I'm not entirely convinced we need the extra flags over just
using can_dma().

> It also fixes the insufficient cleanup in case __spi_map_msg returns
> an error.

This should be a separate patch.

> diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
> index ac1760e..ac74456 100644
> --- a/drivers/spi/spi-bcm2835.c
> +++ b/drivers/spi/spi-bcm2835.c
> @@ -463,7 +463,7 @@ void bcm2835_dma_init(struct spi_master *master, struct device *dev)
>  	master->can_dma = bcm2835_spi_can_dma;
>  	master->max_dma_len = 65535; /* limitation by BCM2835_SPI_DLEN */
>  	/* need to do TX AND RX DMA, so we need dummy buffers */
> -	master->flags = SPI_MASTER_MUST_RX | SPI_MASTER_MUST_TX;
> +	master->flags = SPI_MASTER_MUST_RX_SG | SPI_MASTER_MUST_TX_SG;
>  
>  	return;
>  

This is updating a specific driver to use the new API you're adding,
this should be in a separate patch.

>  #ifdef CONFIG_HAS_DMA
> +static int spi_map_null(struct spi_master *master, struct device *dev,
> +			struct sg_table *sgt,
> +			struct sg_table *sgt_template,
> +			void *buf,
> +			enum dma_data_direction dir)
> +{

It's not obvious to me what this is supposed to do and it looks awfully
like it's duplicating the existing DMA mapping code (but if that's the
case the existing code should be using this not duplicating it so I
guess not).  I think it's supposed to be for creating a scatterlist that
repeatedly reused the same dummy page but there's this template thing,
it's getting a buffer passed in and...

> +	if (is_vmalloc_addr(buf)) {
> +		vm_page = vmalloc_to_page(buf);
> +		if (!vm_page) {
> +			sg_free_table(sgt);
> +			return -ENOMEM;
> +		}
> +		page_offset = offset_in_page(buf);
> +	} else {
> +		vm_page = NULL;
> +		page_offset = 0;
> +	}

...this is really weird for that case.  

I'd have expected to be allocating a scatterlist that transfers a
specified length to/from a normal page sized kmalloc() buffer in which
case we don't need to worry about vmalloc() or this template stuff.

> +		/*
> +		 * handle the SPI_MASTER_MUST_*_SG
> +		 * note that the situation with tx_buf and rx_buf both NULL
> +		 * is checked and handled inside spi_transfer_one_message
> +		 */
> +		if ((!xfer->tx_buf) && (xfer->rx_buf) &&
> +		    (master->flags & SPI_MASTER_MUST_TX_SG)) {
> +			ret = spi_map_null(master, tx_dev,
> +					   &xfer->tx_sg, &xfer->rx_sg,
> +					   master->page_tx,
> +					   DMA_TO_DEVICE);
> +			if (ret != 0) {
> +				spi_unmap_buf(master, rx_dev, &xfer->rx_sg,
> +					      DMA_FROM_DEVICE);
> +				return ret;
> +			}
> +		}

Why does the transmit handling use a scatterlist allocated for receive
(and vice versa)?  This goes back to the lack of clarity about what
spi_map_null() is doing.

>  		if (max_tx) {
> -			tmp = krealloc(master->dummy_tx, max_tx,
> -				       GFP_KERNEL | GFP_DMA);
> -			if (!tmp)
> -				return -ENOMEM;
> -			master->dummy_tx = tmp;
> -			memset(tmp, 0, max_tx);
> +			if (max_tx > PAGE_SIZE) {
> +				tmp_tx = krealloc(master->dummy_tx, max_tx,
> +						  GFP_KERNEL | GFP_DMA);
> +				if (!tmp_tx)
> +					return -ENOMEM;
> +				master->dummy_tx = tmp_tx;
> +				memset(tmp_tx, 0, max_tx);
> +			} else {
> +				tmp_tx = master->page_tx;
> +			}

This idea is fine but should be a separate patch.

I'd expect it should be about as cheap to change the realloc to be the
max of max_tx and PAGE_SIZE (the code was written on the basis that we'd
basically get that behaviour from the allocator anyway most of the
time though IIRC it will work in chunks smaller than a page) but this
does save us the memset() on the transmit path which is handy.
Martin Sperl May 19, 2015, 2:17 p.m. UTC | #3
> On 19.05.2015, at 14:46, Mark Brown <broonie@kernel.org> wrote:
> 
> Like I say I'm not entirely convinced we need the extra flags over just
> using can_dma().
If you can tell me that:
	(master->flags & (SPI_MASTER_MUST_TX | SPI_MASTER_MUST_RX)) &&
	master->can_dma && master->can_dma(...) 
means that we do NOT need real memory allocated in the first place but
we only expect a scatter list to be created 
and memory allocated in all other cases then that be it.

I am just not convinced that this is necessarily true for all
existing (or future) drivers.

During my discovery questions you mentioned that for some drivers
it may not be as simple as
  tx = (xfer->tx_buf) ? xfer->tx_buf[i] : 0;

So this “extension” of the API is to avoid the potential risk of
breaking other drivers by making the above assumption (with
additional calls to can_dma, which then we should start to cache)

> 
>> It also fixes the insufficient cleanup in case __spi_map_msg returns
>> an error.
> 
> This should be a separate patch.

The problem is that this came up while developing the patch, so
I left it in the patch, assuming there would be some feedback
that requires another version of the patch...

> This is updating a specific driver to use the new API you're adding,
> this should be in a separate patch.

At one occasion I got scolded for separating things out into different
patches (in that case api from implemetation), now the other way around,
but we can do that...

> 
>> #ifdef CONFIG_HAS_DMA
>> +static int spi_map_null(struct spi_master *master, struct device *dev,
>> +			struct sg_table *sgt,
>> +			struct sg_table *sgt_template,
>> +			void *buf,
>> +			enum dma_data_direction dir)
>> +{
> 
> It's not obvious to me what this is supposed to do and it looks awfully
> like it's duplicating the existing DMA mapping code (but if that's the
> case the existing code should be using this not duplicating it so I
> guess not).  I think it's supposed to be for creating a scatterlist that
> repeatedly reused the same dummy page but there's this template thing,
> it's getting a buffer passed in and…

OK, so the idea is that this situation ONLY happens when we have either
tx_buf or rx_buf for which we need to map the other transfer from a
“dummy/null” page.

So maybe instead of spi_map_null it should get called spi_map_dummy.

The template is there to create an (almost) identical scatter lists for
both rx and tx (modulo PAGE_SIZE). So if we have a tx-scatter list of:
4, 8, 4103 bytes, and we need to do a dummy transfer for rx, then the
(basic) identical pattern of 4, 8, 4096, 7 would get created.

This kind avoids some restrictions that I have seen with the spi-bcm2835
hardware.

As for replication - the code is similar in some aspects,
but still serves a different purpose (especially with the templating).

Adding all that into one would mean actually merging tx and rx cases
into one piece of code, and I did not want to go that far.
Keep what is working.

Also if we would do that then we should also add additional features
like creating a list of scatter lists for some cases, when:
* an individual dma transfer to the HW can only be < 65536 bytes in
  length (the spi HW got a counter)
* or when individual entries in the scatter list are not a multiple
  of 4
we would need to add multiple sg lists and handle each one separately.
(again examples from spi-bcm2835)

This would be a much bigger change.

> 
>> +	if (is_vmalloc_addr(buf)) {
>> +		vm_page = vmalloc_to_page(buf);
>> +		if (!vm_page) {
>> +			sg_free_table(sgt);
>> +			return -ENOMEM;
>> +		}
>> +		page_offset = offset_in_page(buf);
>> +	} else {
>> +		vm_page = NULL;
>> +		page_offset = 0;
>> +	}
> 
> ...this is really weird for that case.  
> 
> I'd have expected to be allocating a scatterlist that transfers a
> specified length to/from a normal page sized kmalloc() buffer in which
> case we don't need to worry about vmalloc() or this template stuff.

vm_page actually serves as a flag what call should get used a little
further down - sg_set_page (in case of not null) or sg_set_buf.

This is there as I am not 100% sure that the allocation used as
buffer is ALWAYS of the same identical type on _all_ platforms
in all .config variants (and I want to avoid breaking things...

If you can tell me that it is always of one type (even when used
early during kernel initialization), then we can remove that.

> Why does the transmit handling use a scatterlist allocated for receive
> (and vice versa)?  This goes back to the lack of clarity about what
> spi_map_null() is doing.

See above about “templating"

> 
>> 		if (max_tx) {
>> -			tmp = krealloc(master->dummy_tx, max_tx,
>> -				       GFP_KERNEL | GFP_DMA);
>> -			if (!tmp)
>> -				return -ENOMEM;
>> -			master->dummy_tx = tmp;
>> -			memset(tmp, 0, max_tx);
>> +			if (max_tx > PAGE_SIZE) {
>> +				tmp_tx = krealloc(master->dummy_tx, max_tx,
>> +						  GFP_KERNEL | GFP_DMA);
>> +				if (!tmp_tx)
>> +					return -ENOMEM;
>> +				master->dummy_tx = tmp_tx;
>> +				memset(tmp_tx, 0, max_tx);
>> +			} else {
>> +				tmp_tx = master->page_tx;
>> +			}
> 
> This idea is fine but should be a separate patch.
> 
> I'd expect it should be about as cheap to change the realloc to be the
> max of max_tx and PAGE_SIZE (the code was written on the basis that we'd
> basically get that behaviour from the allocator anyway most of the
> time though IIRC it will work in chunks smaller than a page) but this
> does save us the memset() on the transmit path which is handy.

Well - this comes mostly from the fact that we now have a PAGE that
is cleaned already (and we need it for mapping anyway), so 
just make use of it as long as the size is small enough, which
probably happens in most cases.

Martin

--
To unsubscribe from this list: send the line "unsubscribe linux-spi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mark Brown May 22, 2015, 11:34 a.m. UTC | #4
On Tue, May 19, 2015 at 04:17:33PM +0200, Martin Sperl wrote:
> > On 19.05.2015, at 14:46, Mark Brown <broonie@kernel.org> wrote:

Quite a few process issues here which are rather annoying so I'm just
going to ignore the technical content for now.

> > Like I say I'm not entirely convinced we need the extra flags over just
> > using can_dma().

> If you can tell me that:
> 	(master->flags & (SPI_MASTER_MUST_TX | SPI_MASTER_MUST_RX)) &&
> 	master->can_dma && master->can_dma(...) 
> means that we do NOT need real memory allocated in the first place but
> we only expect a scatter list to be created 
> and memory allocated in all other cases then that be it.

I replied to your last mail on this in the other thread.

> >> It also fixes the insufficient cleanup in case __spi_map_msg returns
> >> an error.

> > This should be a separate patch.

> The problem is that this came up while developing the patch, so
> I left it in the patch, assuming there would be some feedback
> that requires another version of the patch...

There is no barrier to creating a separate patch for unrelated
improvments you happen to notice while working on an issue, it is
something that is well supported by the tooling.  Not doing this
guarantees that a resubmission will be required, means that reviewers
have to review the same code multiple times (which takes their time) and
means that it takes longer for other people to get the benefit of the
change.

> > This is updating a specific driver to use the new API you're adding,
> > this should be in a separate patch.

> At one occasion I got scolded for separating things out into different
> patches (in that case api from implemetation), now the other way around,
> but we can do that...

One patch per change as covered in SubmittingPatches.  Prototypes for a
function with no implementation are not a separate change since they
can't be used without the implementation also being added.  Changing a
driver is a separate change to adding an API since people can use the
API change even if they don't use that driver.

Not doing this causes extra work for people, it makes review harder,
causes resubmissions, and if the randomly mixed changes do end up
getting applied somehow then it makes things like bisection and
backporting harder.
Martin Sperl May 22, 2015, 2:09 p.m. UTC | #5
> On 22.05.2015, at 13:34, Mark Brown <broonie@kernel.org> wrote:
> 
> Quite a few process issues here which are rather annoying so I'm just
> going to ignore the technical content for now.
> 
Accepting those - thanks for the clarifications.

> 
>> If you can tell me that:
>> 	(master->flags & (SPI_MASTER_MUST_TX | SPI_MASTER_MUST_RX)) &&
>> 	master->can_dma && master->can_dma(...) 
>> means that we do NOT need real memory allocated in the first place but
>> we only expect a scatter list to be created 
>> and memory allocated in all other cases then that be it.
> 
> I replied to your last mail on this in the other thread.
Well - I will take that as a yes.

With the logic based only on can_dma we still would be allocating memory
for those cases where can_dma returns false - even if the driver only
requires the scatter/gather list but would not require memory for those
rx/tx_buf == NULL cases (which is the case for the spi-bcm2835).

The extra flags solve that requirement cleanly.

An alternative would be that we provide a spi_map_dummy function
that the driver could call explicitly if it just needs the scatter_gather
list for null-buffers and leave the master->flags = 0.

One other reasons was that with this logic we need to run can_dma several
times in the process at different stages which is a waste of CPU-resources
(especially as the compiler can not inline those function pointer calls).

To reduce that overhead, would it be acceptable if we add a field to
spi_transfer (and possibly to spi_message) - say bool can_dma -  that is
filled during the loop over all the transfers in __spi_validate?

Essentially adding something like this to __spi_validate:
	message->can_dma = false;
	list_for_each_entry(xfer, &message->transfers, transfer_list) {
		/* calculate can_dma once and cache it */
		xfer->can_dma = master->can_dma ? master->can_dma(xfer) : false;
		/* and set it also in the message itself */
		message->can_dma |= xfer->can_dma
	}


--
To unsubscribe from this list: send the line "unsubscribe linux-spi" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin Sperl May 25, 2015, 12:44 p.m. UTC | #6
From: Martin Sperl <kernel@martin.sperl.org>

This patch series implements mapping a single page multiple times
in the sg_list when rx/tx_buf == NULL.

The 1st patch makes this apply to all spi_transfers for which
can_dma returns true, but this means it applies to all spi-masters
with DMA support.

To avoid this there is an api extension in the 3rd patch that allows
drivers to state the requirement in spi_master.flags.

spi-bcm2835 is converted to the new api in patches 2 and 4.

Patch 3 and 4 are optional and reduce impact on drivers that do
not need this feature by extending the API.

All patches apply against for-next, but note that there will be a conflict
if the patch "spi: add missing cleanup in spi_map_msg on error" is
applied already.

Martin Sperl (4):
  spi: dma map a single page multiple times in sg_list for rx/tx_buf ==
    NULL
  spi: bcm2835: no longer requires SPI_MASTER_MUST_RX/TX
  spi: add SPI_MASTER_MUST_*_SG to avoid unnecessary mapping of null
    pages in sg_lists for masters that do not need it
  spi: bcm2835: set SPI_MASTER_MUST_RX_SG/TX_SG flags

 drivers/spi/spi-bcm2835.c |    4 ++--
 drivers/spi/spi.c         |   40 +++++++++++++++++++++++++++++++++-------
 include/linux/spi/spi.h   |    6 ++++++
 3 files changed, 41 insertions(+), 9 deletions(-)
diff mbox

Patch

diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
index ac1760e..ac74456 100644
--- a/drivers/spi/spi-bcm2835.c
+++ b/drivers/spi/spi-bcm2835.c
@@ -463,7 +463,7 @@  void bcm2835_dma_init(struct spi_master *master, struct device *dev)
 	master->can_dma = bcm2835_spi_can_dma;
 	master->max_dma_len = 65535; /* limitation by BCM2835_SPI_DLEN */
 	/* need to do TX AND RX DMA, so we need dummy buffers */
-	master->flags = SPI_MASTER_MUST_RX | SPI_MASTER_MUST_TX;
+	master->flags = SPI_MASTER_MUST_RX_SG | SPI_MASTER_MUST_TX_SG;
 
 	return;
 
diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c
index d35c1a1..c85cf58 100644
--- a/drivers/spi/spi.c
+++ b/drivers/spi/spi.c
@@ -471,6 +471,73 @@  static void spi_set_cs(struct spi_device *spi, bool enable)
 }
 
 #ifdef CONFIG_HAS_DMA
+static int spi_map_null(struct spi_master *master, struct device *dev,
+			struct sg_table *sgt,
+			struct sg_table *sgt_template,
+			void *buf,
+			enum dma_data_direction dir)
+{
+	struct page *vm_page;
+	unsigned long page_offset;
+	int sgs = 0;
+	int i, j, ret;
+	ssize_t len, l;
+
+	if (is_vmalloc_addr(buf)) {
+		vm_page = vmalloc_to_page(buf);
+		if (!vm_page) {
+			sg_free_table(sgt);
+			return -ENOMEM;
+		}
+		page_offset = offset_in_page(buf);
+	} else {
+		vm_page = NULL;
+		page_offset = 0;
+	}
+
+	/* count the number of sgs we will need walking the template */
+	for (i = 0; i < sgt_template->nents; i++) {
+		len = sg_dma_len(&sgt_template->sgl[i]);
+		while (len) {
+			len -= min_t(size_t, len, PAGE_SIZE);
+			sgs++;
+		}
+	}
+
+	/* now allocate it */
+	ret = sg_alloc_table(sgt, sgs, GFP_KERNEL);
+	if (ret)
+		return ret;
+
+	/* and iterate over the template to fill our own table */
+	for (i = 0, j = 0; i < sgt_template->nents; i++) {
+		len = sg_dma_len(&sgt_template->sgl[i]);
+		/* split into multiple transfers if needed */
+		while (len) {
+			l = min_t(size_t, len, PAGE_SIZE);
+			if (vm_page)
+				sg_set_page(&sgt->sgl[i], vm_page,
+					    l, page_offset);
+			else
+				sg_set_buf(&sgt->sgl[j], buf, l);
+			len -= l;
+			j++;
+		}
+	}
+
+	ret = dma_map_sg(dev, sgt->sgl, sgt->nents, dir);
+	if (!ret)
+		ret = -ENOMEM;
+	if (ret < 0) {
+		sg_free_table(sgt);
+		return ret;
+	}
+
+	sgt->nents = ret;
+
+	return 0;
+}
+
 static int spi_map_buf(struct spi_master *master, struct device *dev,
 		       struct sg_table *sgt, void *buf, size_t len,
 		       enum dma_data_direction dir)
@@ -564,6 +631,37 @@  static int __spi_map_msg(struct spi_master *master, struct spi_message *msg)
 				return ret;
 			}
 		}
+
+		/*
+		 * handle the SPI_MASTER_MUST_*_SG
+		 * note that the situation with tx_buf and rx_buf both NULL
+		 * is checked and handled inside spi_transfer_one_message
+		 */
+		if ((!xfer->tx_buf) && (xfer->rx_buf) &&
+		    (master->flags & SPI_MASTER_MUST_TX_SG)) {
+			ret = spi_map_null(master, tx_dev,
+					   &xfer->tx_sg, &xfer->rx_sg,
+					   master->page_tx,
+					   DMA_TO_DEVICE);
+			if (ret != 0) {
+				spi_unmap_buf(master, rx_dev, &xfer->rx_sg,
+					      DMA_FROM_DEVICE);
+				return ret;
+			}
+		}
+
+		if ((!xfer->rx_buf) && (xfer->tx_buf) &&
+		    (master->flags & SPI_MASTER_MUST_RX_SG)) {
+			ret = spi_map_null(master, rx_dev,
+					   &xfer->rx_sg, &xfer->tx_sg,
+					   master->page_rx,
+					   DMA_FROM_DEVICE);
+			if (ret != 0) {
+				spi_unmap_buf(master, tx_dev, &xfer->tx_sg,
+					      DMA_TO_DEVICE);
+				return ret;
+			}
+		}
 	}
 
 	master->cur_msg_mapped = true;
@@ -587,9 +685,11 @@  static int spi_unmap_msg(struct spi_master *master, struct spi_message *msg)
 		 * Restore the original value of tx_buf or rx_buf if they are
 		 * NULL.
 		 */
-		if (xfer->tx_buf == master->dummy_tx)
+		if ((xfer->tx_buf == master->dummy_tx) ||
+		    (xfer->tx_buf == master->page_tx))
 			xfer->tx_buf = NULL;
-		if (xfer->rx_buf == master->dummy_rx)
+		if ((xfer->rx_buf == master->dummy_rx) ||
+		    (xfer->rx_buf == master->page_rx))
 			xfer->rx_buf = NULL;
 
 		if (!master->can_dma(master, msg->spi, xfer))
@@ -618,8 +718,9 @@  static inline int spi_unmap_msg(struct spi_master *master,
 static int spi_map_msg(struct spi_master *master, struct spi_message *msg)
 {
 	struct spi_transfer *xfer;
-	void *tmp;
+	void *tmp_tx, *tmp_rx;
 	unsigned int max_tx, max_rx;
+	int ret;
 
 	if (master->flags & (SPI_MASTER_MUST_RX | SPI_MASTER_MUST_TX)) {
 		max_tx = 0;
@@ -635,34 +736,53 @@  static int spi_map_msg(struct spi_master *master, struct spi_message *msg)
 		}
 
 		if (max_tx) {
-			tmp = krealloc(master->dummy_tx, max_tx,
-				       GFP_KERNEL | GFP_DMA);
-			if (!tmp)
-				return -ENOMEM;
-			master->dummy_tx = tmp;
-			memset(tmp, 0, max_tx);
+			if (max_tx > PAGE_SIZE) {
+				tmp_tx = krealloc(master->dummy_tx, max_tx,
+						  GFP_KERNEL | GFP_DMA);
+				if (!tmp_tx)
+					return -ENOMEM;
+				master->dummy_tx = tmp_tx;
+				memset(tmp_tx, 0, max_tx);
+			} else {
+				tmp_tx = master->page_tx;
+			}
+		} else {
+			tmp_tx = NULL;
 		}
 
-		if (max_rx) {
-			tmp = krealloc(master->dummy_rx, max_rx,
-				       GFP_KERNEL | GFP_DMA);
-			if (!tmp)
-				return -ENOMEM;
-			master->dummy_rx = tmp;
+		if (max_rx)  {
+			if (max_rx > PAGE_SIZE) {
+				tmp_rx = krealloc(master->dummy_rx, max_rx,
+						  GFP_KERNEL | GFP_DMA);
+				if (!tmp_rx)
+					return -ENOMEM;
+				master->dummy_rx = tmp_rx;
+			} else {
+				tmp_rx = master->page_rx;
+			}
+		} else {
+			tmp_tx = NULL;
 		}
 
 		if (max_tx || max_rx) {
 			list_for_each_entry(xfer, &msg->transfers,
 					    transfer_list) {
 				if (!xfer->tx_buf)
-					xfer->tx_buf = master->dummy_tx;
+					xfer->tx_buf = tmp_tx;
 				if (!xfer->rx_buf)
-					xfer->rx_buf = master->dummy_rx;
+					xfer->rx_buf = tmp_rx;
 			}
 		}
 	}
 
-	return __spi_map_msg(master, msg);
+	/* if we fail we need to undo the parial mappings
+	 * and fix up the modified rx_buf/tx_buf
+	 */
+	ret = __spi_map_msg(master, msg);
+	if (ret)
+		spi_unmap_msg(master, msg);
+
+	return ret;
 }
 
 /*
@@ -1555,6 +1675,24 @@  int spi_register_master(struct spi_master *master)
 	if (!master->max_dma_len)
 		master->max_dma_len = INT_MAX;
 
+	/* we need to set max_dma_len to PAGESIZE for MUST_XX_SG */
+	if (master->flags & (SPI_MASTER_MUST_RX_SG | SPI_MASTER_MUST_TX_SG))
+		master->max_dma_len = min_t(size_t,
+					    master->max_dma_len, PAGE_SIZE);
+	/* and allocate some buffers for dma */
+	if (master->flags & (SPI_MASTER_MUST_RX | SPI_MASTER_MUST_RX_SG)) {
+		master->page_rx = devm_kmalloc(&master->dev,
+					       PAGE_SIZE, GFP_DMA);
+		if (!master->page_rx)
+			return -ENOMEM;
+	}
+	if (master->flags & (SPI_MASTER_MUST_TX | SPI_MASTER_MUST_TX_SG)) {
+		master->page_tx = devm_kzalloc(&master->dev,
+					       PAGE_SIZE, GFP_DMA);
+		if (!master->page_tx)
+			return -ENOMEM;
+	}
+
 	/* register the device, then userspace will see it.
 	 * registration fails if the bus ID is in use.
 	 */
diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h
index d673072..1f440ff 100644
--- a/include/linux/spi/spi.h
+++ b/include/linux/spi/spi.h
@@ -353,8 +353,10 @@  struct spi_master {
 #define SPI_MASTER_HALF_DUPLEX	BIT(0)		/* can't do full duplex */
 #define SPI_MASTER_NO_RX	BIT(1)		/* can't do buffer read */
 #define SPI_MASTER_NO_TX	BIT(2)		/* can't do buffer write */
-#define SPI_MASTER_MUST_RX      BIT(3)		/* requires rx */
-#define SPI_MASTER_MUST_TX      BIT(4)		/* requires tx */
+#define SPI_MASTER_MUST_RX      BIT(3)		/* requires rx_buf allocated */
+#define SPI_MASTER_MUST_TX      BIT(4)		/* requires tx_buf allocated */
+#define SPI_MASTER_MUST_RX_SG   BIT(5)		/* requires rx sg list */
+#define SPI_MASTER_MUST_TX_SG   BIT(6)		/* requires tx sg list */
 
 	/* lock and mutex for SPI bus locking */
 	spinlock_t		bus_lock_spinlock;
@@ -459,6 +461,10 @@  struct spi_master {
 	/* dummy data for full duplex devices */
 	void			*dummy_rx;
 	void			*dummy_tx;
+
+	/* pages for dma-transfers */
+	void			*page_rx;
+	void			*page_tx;
 };
 
 static inline void *spi_master_get_devdata(struct spi_master *master)