
[6/7] spi: bcm2835: Overcome sglist entry length limitation

Message ID eb5ce210b06fb68580961038412f9499c3e56a76.1541659680.git.lukas@wunner.de (mailing list archive)
State Accepted
Commit 3bd7f6589f67f05a7901bdc2810631814d89e462
Series Raspberry Pi spi0 improvements

Commit Message

Lukas Wunner Nov. 8, 2018, 7:06 a.m. UTC
When in DMA mode, the BCM2835 SPI controller requires that the FIFO is
accessed in 4 byte chunks.  This rule is not fulfilled if a transfer
consists of multiple sglist entries, one per page, and the first entry
starts in the middle of a page with an offset not a multiple of 4.

The driver currently falls back to programmed I/O for such transfers,
incurring a significant performance penalty.

Overcome this hardware limitation by transferring the first few bytes of
a transfer without DMA such that the remainder of the first sglist entry
becomes a multiple of 4.  Specifics are provided in kerneldoc comments.
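
For illustration, a minimal sketch (not part of the patch) of how the
prologue length follows from the first sglist entry's length, using a
hypothetical 23-byte first TX entry:

	/* illustration only, the entry length is hypothetical */
	unsigned int first_len   = 23;                      /* first TX sglist entry length */
	unsigned int tx_prologue = first_len & 3;           /* 3 bytes written without DMA  */
	unsigned int dma_len     = first_len - tx_prologue; /* 20 bytes, a multiple of 4    */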

An alternative approach would have been to split transfers in the
->prepare_message hook, but this may necessitate two transfers per page,
defeating the goal of clustering multiple pages together in a single
transfer for efficiency.  E.g. if the first TX sglist entry's length is
23 and the first RX's is 40, the first transfer would send and receive
23 bytes, the second 40 - 23 = 17 bytes, the third 4096 - 17 = 4079
bytes, the fourth 4096 - 4079 = 17 bytes and so on.  In other words,
O(n) transfers are necessary (n = number of sglist entries), whereas
the algorithm implemented herein only requires O(1) additional work.

Signed-off-by: Lukas Wunner <lukas@wunner.de>
Cc: Mathias Duckeck <m.duckeck@kunbus.de>
Cc: Frank Pavlic <f.pavlic@kunbus.de>
Cc: Martin Sperl <kernel@martin.sperl.org>
Cc: Noralf Trønnes <noralf@tronnes.org>
---
 drivers/spi/spi-bcm2835.c | 291 +++++++++++++++++++++++++++++++-------
 1 file changed, 242 insertions(+), 49 deletions(-)

Comments

Lukas Wunner Nov. 8, 2018, 7:47 a.m. UTC | #1
On Thu, Nov 08, 2018 at 08:06:10AM +0100, Lukas Wunner wrote:
> When in DMA mode, the BCM2835 SPI controller requires that the FIFO is
> accessed in 4 byte chunks.  This rule is not fulfilled if a transfer
> consists of multiple sglist entries, one per page, and the first entry
> starts in the middle of a page with an offset not a multiple of 4.
> 
> The driver currently falls back to programmed I/O for such transfers,
> incurring a significant performance penalty.
> 
> Overcome this hardware limitation by transferring the first few bytes of
> a transfer without DMA such that the remainder of the first sglist entry
> becomes a multiple of 4.

FWIW the patch below is what I used to test and debug this.

It copies the transfer buffers to a vmalloc'ed area to force the SPI core
to create one sglist entry per page.  The offset within the first page
can be set with the BCM2835_SPI_FIRST_TX_LEN and BCM2835_SPI_FIRST_RX_LEN
macros to test various pathological alignments or force spillover of the
TX prologue to the second sglist entry.

I'm just leaving this here in case anyone else wants to perform further
in-depth testing.
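
For reference, a rough sketch of the alignments these macro values produce,
assuming 4 KiB pages and a transfer longer than 2040 bytes (the derived
numbers are my own, not taken from test output):

	/* TX copy starts at vm + PAGE_SIZE - 2040, so the first TX sglist
	 * entry is 2040 bytes -- already a multiple of 4.
	 */
	unsigned int first_tx_len = 2040;               /* BCM2835_SPI_FIRST_TX_LEN */
	unsigned int tx_prologue  = first_tx_len & 3;   /* 0 */

	/* RX copy starts at vm + 3 * PAGE_SIZE - 43, so the first RX sglist
	 * entry is 43 bytes and 3 of them must be read before DMA takes over.
	 */
	unsigned int first_rx_len = 43;                 /* BCM2835_SPI_FIRST_RX_LEN */
	unsigned int rx_prologue  = first_rx_len & 3;   /* 3 */

	/* Because rx_prologue > tx_prologue, the driver writes 4 extra TX
	 * bytes (tx_prologue becomes 4); spillover to the second TX entry
	 * would only occur if the first TX entry were shorter than 4 bytes.
	 */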

-- >8 --

diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
index fd9a73963f8c..0f87bf0b18b2 100644
--- a/drivers/spi/spi-bcm2835.c
+++ b/drivers/spi/spi-bcm2835.c
@@ -112,6 +112,7 @@ struct bcm2835_spi {
 	int rx_prologue;
 	bool tx_spillover;
 	bool dma_pending;
+	void *vm, *tx_buf_saved, *rx_buf_saved;
 };
 
 static inline u32 bcm2835_rd(struct bcm2835_spi *bs, unsigned reg)
@@ -346,6 +347,22 @@ static int bcm2835_spi_transfer_one_irq(struct spi_master *master,
  * list making it slightly unpractical...
  */
 
+static void dump_sg(struct spi_transfer *tfr)
+{
+	struct scatterlist *sgl;
+	int i;
+
+	for_each_sg(tfr->tx_sg.sgl, sgl, tfr->tx_sg.nents, i)
+		pr_info("tx sgl %d: addr=%#x, len=%u, multiple of 4? %d\n", i,
+			sg_dma_address(sgl), sg_dma_len(sgl),
+			!(sg_dma_len(sgl) % 4));
+
+	for_each_sg(tfr->rx_sg.sgl, sgl, tfr->rx_sg.nents, i)
+		pr_info("rx sgl %d: addr=%#x, len=%u, multiple of 4? %d\n", i,
+			sg_dma_address(sgl), sg_dma_len(sgl),
+			!(sg_dma_len(sgl) % 4));
+}
+
 /**
  * bcm2835_spi_transfer_prologue() - transfer first few bytes without DMA
  * @master: SPI master
@@ -425,6 +442,12 @@ static void bcm2835_spi_transfer_prologue(struct spi_master *master,
 	if (!bs->tx_prologue)
 		return;
 
+
+	pr_info("tx_prologue=%d, rx_prologue=%d, tx_spillover=%d\n",
+		bs->tx_prologue, bs->rx_prologue, bs->tx_spillover);
+	// pr_info("before:\n");
+	// dump_sg(tfr);
+
 	/* Write and read RX prologue.  Adjust first entry in RX sglist. */
 	if (bs->rx_prologue) {
 		bcm2835_wr(bs, BCM2835_SPI_DLEN, bs->rx_prologue);
@@ -494,6 +517,10 @@ static void bcm2835_spi_undo_prologue(struct bcm2835_spi *bs)
 		tfr->tx_sg.sgl[1].dma_address -= 4;
 		tfr->tx_sg.sgl[1].length      += 4;
 	}
+
+	// pr_info("after:\n");
+	// dump_sg(tfr);
+
 }
 
 static void bcm2835_spi_dma_done(void *data)
@@ -514,6 +541,18 @@ static void bcm2835_spi_dma_done(void *data)
 		bcm2835_spi_undo_prologue(bs);
 	}
 
+	/* if (bs->tx_prologue) {
+		struct spi_transfer *tfr = bs->tfr;
+		struct scatterlist *sgl;
+		int i;
+
+		pr_info("rx dump:\n");
+		for_each_sg(tfr->rx_sg.sgl, sgl, tfr->rx_sg.nents, i) {
+			pr_info("rx sgl %d: paddr=%#x addr=%p len=%u\n", i, sg_dma_address(sgl), sg_virt(sgl), sg_dma_len(sgl));
+			print_hex_dump(KERN_INFO, "rx ", DUMP_PREFIX_OFFSET, 16, 1, sg_virt(sgl), sg_dma_len(sgl), true);
+		}
+	} */
+
 	/* and mark as completed */;
 	complete(&master->xfer_completion);
 }
@@ -842,6 +881,8 @@ static int bcm2835_spi_prepare_message(struct spi_master *master,
 	struct spi_device *spi = msg->spi;
 	struct bcm2835_spi *bs = spi_master_get_devdata(master);
 	u32 cs = bcm2835_rd(bs, BCM2835_SPI_CS);
+	struct spi_transfer *xfer;
+	void *ptr;
 
 	cs &= ~(BCM2835_SPI_CS_CPOL | BCM2835_SPI_CS_CPHA);
 
@@ -852,6 +893,54 @@ static int bcm2835_spi_prepare_message(struct spi_master *master,
 
 	bcm2835_wr(bs, BCM2835_SPI_CS, cs);
 
+#define BCM2835_SPI_FIRST_TX_LEN 2040
+#define BCM2835_SPI_FIRST_RX_LEN 43
+
+	list_for_each_entry(xfer, &msg->transfers, transfer_list) {
+		if (xfer->len < BCM2835_SPI_DMA_MIN_LENGTH ||
+		    xfer->len > PAGE_SIZE)
+			continue;
+
+		ptr = bs->vm +      PAGE_SIZE - BCM2835_SPI_FIRST_TX_LEN;
+		if (xfer->tx_buf)
+			memcpy(ptr, xfer->tx_buf, xfer->len);
+		else
+			memset(ptr, 0x00, xfer->len);
+		bs->tx_buf_saved = (void *)xfer->tx_buf;
+		xfer->tx_buf = ptr;
+
+		ptr = bs->vm + (3 * PAGE_SIZE) - BCM2835_SPI_FIRST_RX_LEN;
+		memset(ptr, 0xf4, xfer->len);
+		bs->rx_buf_saved = xfer->rx_buf;
+		xfer->rx_buf = ptr;
+		// pr_info("copied to vmalloc area\n");
+		break;
+	}
+
+	return 0;
+}
+
+static int bcm2835_spi_unprepare_message(struct spi_master *master,
+					 struct spi_message *msg)
+{
+	struct bcm2835_spi *bs = spi_master_get_devdata(master);
+	struct spi_transfer *xfer;
+
+	list_for_each_entry(xfer, &msg->transfers, transfer_list) {
+		if (xfer->len < BCM2835_SPI_DMA_MIN_LENGTH ||
+		    xfer->len > PAGE_SIZE)
+			continue;
+
+		xfer->tx_buf = bs->tx_buf_saved;
+
+		// pr_info("%s:memcpy(%#x, %#x, %u)\n", __func__, bs->rx_buf_saved, xfer->rx_buf, xfer->len);
+		if (bs->rx_buf_saved)
+			memcpy(bs->rx_buf_saved, xfer->rx_buf, xfer->len);
+		xfer->rx_buf = bs->rx_buf_saved;
+		// pr_info("copied from vmalloc area\n");
+		break;
+	}
+
 	return 0;
 }
 
@@ -944,29 +1033,36 @@ static int bcm2835_spi_probe(struct platform_device *pdev)
 	master->transfer_one = bcm2835_spi_transfer_one;
 	master->handle_err = bcm2835_spi_handle_err;
 	master->prepare_message = bcm2835_spi_prepare_message;
+	master->unprepare_message = bcm2835_spi_unprepare_message;
 	master->dev.of_node = pdev->dev.of_node;
 
 	bs = spi_master_get_devdata(master);
 
+	bs->vm = vmalloc(4 * PAGE_SIZE);
+	if (!bs->vm) {
+		err = -ENOMEM;
+		goto out_master_put;
+	}
+
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	bs->regs = devm_ioremap_resource(&pdev->dev, res);
 	if (IS_ERR(bs->regs)) {
 		err = PTR_ERR(bs->regs);
-		goto out_master_put;
+		goto out_free_vm;
 	}
 
 	bs->clk = devm_clk_get(&pdev->dev, NULL);
 	if (IS_ERR(bs->clk)) {
 		err = PTR_ERR(bs->clk);
 		dev_err(&pdev->dev, "could not get clk: %d\n", err);
-		goto out_master_put;
+		goto out_free_vm;
 	}
 
 	bs->irq = platform_get_irq(pdev, 0);
 	if (bs->irq <= 0) {
 		dev_err(&pdev->dev, "could not get IRQ: %d\n", bs->irq);
 		err = bs->irq ? bs->irq : -ENODEV;
-		goto out_master_put;
+		goto out_free_vm;
 	}
 
 	clk_prepare_enable(bs->clk);
@@ -994,6 +1090,8 @@ static int bcm2835_spi_probe(struct platform_device *pdev)
 
 out_clk_disable:
 	clk_disable_unprepare(bs->clk);
+out_free_vm:
+	vfree(bs->vm);
 out_master_put:
 	spi_master_put(master);
 	return err;
@@ -1012,6 +1110,8 @@ static int bcm2835_spi_remove(struct platform_device *pdev)
 
 	bcm2835_dma_release(master);
 
+	vfree(bs->vm);
+
 	return 0;
 }
Stefan Wahren Nov. 9, 2018, 3:43 p.m. UTC | #2
On 08.11.18 at 08:06, Lukas Wunner wrote:
> When in DMA mode, the BCM2835 SPI controller requires that the FIFO is
> accessed in 4 byte chunks.  This rule is not fulfilled if a transfer
> consists of multiple sglist entries, one per page, and the first entry
> starts in the middle of a page with an offset not a multiple of 4.
>
> The driver currently falls back to programmed I/O for such transfers,
> incurring a significant performance penalty.
>
> Overcome this hardware limitation by transferring the first few bytes of
> a transfer without DMA such that the remainder of the first sglist entry
> becomes a multiple of 4.  Specifics are provided in kerneldoc comments.
> [...]
>
> +/**
> + * bcm2835_wait_tx_fifo_empty() - busy-wait for TX FIFO to empty
> + * @bs: BCM2835 SPI controller
> + */
> +static inline void bcm2835_wait_tx_fifo_empty(struct bcm2835_spi *bs)
> +{
> +	while (!(bcm2835_rd(bs, BCM2835_SPI_CS) & BCM2835_SPI_CS_DONE))
> +		cpu_relax();
> +}
Can we have some kind of timeout here, so we never spin forever in case
hw doesn't behave as expected?
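
A rough sketch of the kind of bounded wait being suggested here (the 64 us
budget and the return value are assumptions for illustration, not something
specified in the thread or the datasheet):

	static inline int bcm2835_wait_tx_fifo_empty_timeout(struct bcm2835_spi *bs)
	{
		unsigned int timeout_us = 64;

		while (!(bcm2835_rd(bs, BCM2835_SPI_CS) & BCM2835_SPI_CS_DONE)) {
			if (!timeout_us--)
				return -ETIMEDOUT;
			udelay(1);	/* needs <linux/delay.h> */
		}

		return 0;
	}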
Lukas Wunner Nov. 10, 2018, 8:30 a.m. UTC | #3
On Fri, Nov 09, 2018 at 04:43:43PM +0100, Stefan Wahren wrote:
> On 08.11.18 at 08:06, Lukas Wunner wrote:
> > +/**
> > + * bcm2835_wait_tx_fifo_empty() - busy-wait for TX FIFO to empty
> > + * @bs: BCM2835 SPI controller
> > + */
> > +static inline void bcm2835_wait_tx_fifo_empty(struct bcm2835_spi *bs)
> > +{
> > +	while (!(bcm2835_rd(bs, BCM2835_SPI_CS) & BCM2835_SPI_CS_DONE))
> > +		cpu_relax();
> > +}
> 
> Can we have some kind of timeout here, so we never spin forever in case
> hw doesn't behave as expected?

The only way for this to happen is if the RX FIFO cannot accommodate
as many bytes as were written to the TX FIFO:  Transmission is halted
once the RX FIFO becomes full and then indeed the function would spin
forever.  Let me amend the function's kerneldoc to spell out this
responsibility of the caller.

The DONE flag is otherwise set reliably by the hardware; I honestly
don't see a need to supervise that.

Thanks,

Lukas

Patch

diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
index 9b9b9926a956..36719d2cc12d 100644
--- a/drivers/spi/spi-bcm2835.c
+++ b/drivers/spi/spi-bcm2835.c
@@ -85,20 +85,30 @@ 
  * @regs: base address of register map
  * @clk: core clock, divided to calculate serial clock
  * @irq: interrupt, signals TX FIFO empty or RX FIFO ¾ full
+ * @tfr: SPI transfer currently processed
  * @tx_buf: pointer whence next transmitted byte is read
  * @rx_buf: pointer where next received byte is written
  * @tx_len: remaining bytes to transmit
  * @rx_len: remaining bytes to receive
+ * @tx_prologue: bytes transmitted without DMA if first TX sglist entry's
+ *	length is not a multiple of 4 (to overcome hardware limitation)
+ * @rx_prologue: bytes received without DMA if first RX sglist entry's
+ *	length is not a multiple of 4 (to overcome hardware limitation)
+ * @tx_spillover: whether @tx_prologue spills over to second TX sglist entry
  * @dma_pending: whether a DMA transfer is in progress
  */
 struct bcm2835_spi {
 	void __iomem *regs;
 	struct clk *clk;
 	int irq;
+	struct spi_transfer *tfr;
 	const u8 *tx_buf;
 	u8 *rx_buf;
 	int tx_len;
 	int rx_len;
+	int tx_prologue;
+	int rx_prologue;
+	bool tx_spillover;
 	bool dma_pending;
 };
 
@@ -137,6 +147,72 @@  static inline void bcm2835_wr_fifo(struct bcm2835_spi *bs)
 	}
 }
 
+/**
+ * bcm2835_rd_fifo_count() - blindly read exactly @count bytes from RX FIFO
+ * @bs: BCM2835 SPI controller
+ * @count: bytes to read from RX FIFO
+ *
+ * The caller must ensure that @bs->rx_len is greater than or equal to @count,
+ * that the RX FIFO contains at least @count bytes and that the DMA Enable flag
+ * in the CS register is set (such that a read from the FIFO register receives
+ * 32-bit instead of just 8-bit).
+ */
+static inline void bcm2835_rd_fifo_count(struct bcm2835_spi *bs, int count)
+{
+	u32 val;
+
+	bs->rx_len -= count;
+
+	while (count > 0) {
+		val = bcm2835_rd(bs, BCM2835_SPI_FIFO);
+		if (bs->rx_buf) {
+			int len = min(count, 4);
+			memcpy(bs->rx_buf, &val, len);
+			bs->rx_buf += len;
+		}
+		count -= 4;
+	}
+}
+
+/**
+ * bcm2835_wr_fifo_count() - blindly write exactly @count bytes to TX FIFO
+ * @bs: BCM2835 SPI controller
+ * @count: bytes to write to TX FIFO
+ *
+ * The caller must ensure that @bs->tx_len is greater than or equal to @count,
+ * that the TX FIFO can accommodate @count bytes and that the DMA Enable flag
+ * in the CS register is set (such that a write to the FIFO register transmits
+ * 32-bit instead of just 8-bit).
+ */
+static inline void bcm2835_wr_fifo_count(struct bcm2835_spi *bs, int count)
+{
+	u32 val;
+
+	bs->tx_len -= count;
+
+	while (count > 0) {
+		if (bs->tx_buf) {
+			int len = min(count, 4);
+			memcpy(&val, bs->tx_buf, len);
+			bs->tx_buf += len;
+		} else {
+			val = 0;
+		}
+		bcm2835_wr(bs, BCM2835_SPI_FIFO, val);
+		count -= 4;
+	}
+}
+
+/**
+ * bcm2835_wait_tx_fifo_empty() - busy-wait for TX FIFO to empty
+ * @bs: BCM2835 SPI controller
+ */
+static inline void bcm2835_wait_tx_fifo_empty(struct bcm2835_spi *bs)
+{
+	while (!(bcm2835_rd(bs, BCM2835_SPI_CS) & BCM2835_SPI_CS_DONE))
+		cpu_relax();
+}
+
 static void bcm2835_spi_reset_hw(struct spi_master *master)
 {
 	struct bcm2835_spi *bs = spi_master_get_devdata(master);
@@ -209,15 +285,161 @@  static int bcm2835_spi_transfer_one_irq(struct spi_master *master,
  * the main one being that DMA transfers are limited to 16 bit
  * (so 0 to 65535 bytes) by the SPI HW due to BCM2835_SPI_DLEN
  *
- * also we currently assume that the scatter-gather fragments are
- * all multiple of 4 (except the last) - otherwise we would need
- * to reset the FIFO before subsequent transfers...
- * this also means that tx/rx transfers sg's need to be of equal size!
- *
  * there may be a few more border-cases we may need to address as well
  * but unfortunately this would mean splitting up the scatter-gather
  * list making it slightly unpractical...
  */
+
+/**
+ * bcm2835_spi_transfer_prologue() - transfer first few bytes without DMA
+ * @master: SPI master
+ * @tfr: SPI transfer
+ * @bs: BCM2835 SPI controller
+ * @cs: CS register
+ *
+ * A limitation in DMA mode is that the FIFO must be accessed in 4 byte chunks.
+ * Only the final write access is permitted to transmit less than 4 bytes, the
+ * SPI controller deduces its intended size from the DLEN register.
+ *
+ * If a TX or RX sglist contains multiple entries, one per page, and the first
+ * entry starts in the middle of a page, that first entry's length may not be
+ * a multiple of 4.  Subsequent entries are fine because they span an entire
+ * page, hence do have a length that's a multiple of 4.
+ *
+ * This cannot happen with kmalloc'ed buffers (which is what most clients use)
+ * because they are contiguous in physical memory and therefore not split on
+ * page boundaries by spi_map_buf().  But it *can* happen with vmalloc'ed
+ * buffers.
+ *
+ * The DMA engine is incapable of combining sglist entries into a continuous
+ * stream of 4 byte chunks, it treats every entry separately:  A TX entry is
+ * rounded up a to a multiple of 4 bytes by transmitting surplus bytes, an RX
+ * entry is rounded up by throwing away received bytes.
+ *
+ * Overcome this limitation by transferring the first few bytes without DMA:
+ * E.g. if the first TX sglist entry's length is 23 and the first RX's is 42,
+ * write 3 bytes to the TX FIFO but read only 2 bytes from the RX FIFO.
+ * The residue of 1 byte in the RX FIFO is picked up by DMA.  Together with
+ * the rest of the first RX sglist entry it makes up a multiple of 4 bytes.
+ *
+ * Should the RX prologue be larger, say, 3 vis-à-vis a TX prologue of 1,
+ * write 1 + 4 = 5 bytes to the TX FIFO and read 3 bytes from the RX FIFO.
+ * Caution, the additional 4 bytes spill over to the second TX sglist entry
+ * if the length of the first is *exactly* 1.
+ *
+ * At most 6 bytes are written and at most 3 bytes read.  Do we know the
+ * transfer has this many bytes?  Yes, see BCM2835_SPI_DMA_MIN_LENGTH.
+ *
+ * The FIFO is normally accessed with 8-bit width by the CPU and 32-bit width
+ * by the DMA engine.  Toggling the DMA Enable flag in the CS register switches
+ * the width but also garbles the FIFO's contents.  The prologue must therefore
+ * be transmitted in 32-bit width to ensure that the following DMA transfer can
+ * pick up the residue in the RX FIFO in ungarbled form.
+ */
+static void bcm2835_spi_transfer_prologue(struct spi_master *master,
+					  struct spi_transfer *tfr,
+					  struct bcm2835_spi *bs,
+					  u32 cs)
+{
+	int tx_remaining;
+
+	bs->tfr		 = tfr;
+	bs->tx_prologue  = 0;
+	bs->rx_prologue  = 0;
+	bs->tx_spillover = false;
+
+	if (!sg_is_last(&tfr->tx_sg.sgl[0]))
+		bs->tx_prologue = sg_dma_len(&tfr->tx_sg.sgl[0]) & 3;
+
+	if (!sg_is_last(&tfr->rx_sg.sgl[0])) {
+		bs->rx_prologue = sg_dma_len(&tfr->rx_sg.sgl[0]) & 3;
+
+		if (bs->rx_prologue > bs->tx_prologue) {
+			if (sg_is_last(&tfr->tx_sg.sgl[0])) {
+				bs->tx_prologue  = bs->rx_prologue;
+			} else {
+				bs->tx_prologue += 4;
+				bs->tx_spillover =
+					!(sg_dma_len(&tfr->tx_sg.sgl[0]) & ~3);
+			}
+		}
+	}
+
+	/* rx_prologue > 0 implies tx_prologue > 0, so check only the latter */
+	if (!bs->tx_prologue)
+		return;
+
+	/* Write and read RX prologue.  Adjust first entry in RX sglist. */
+	if (bs->rx_prologue) {
+		bcm2835_wr(bs, BCM2835_SPI_DLEN, bs->rx_prologue);
+		bcm2835_wr(bs, BCM2835_SPI_CS, cs | BCM2835_SPI_CS_TA
+						  | BCM2835_SPI_CS_DMAEN);
+		bcm2835_wr_fifo_count(bs, bs->rx_prologue);
+		bcm2835_wait_tx_fifo_empty(bs);
+		bcm2835_rd_fifo_count(bs, bs->rx_prologue);
+		bcm2835_spi_reset_hw(master);
+
+		dma_sync_sg_for_device(master->dma_rx->device->dev,
+				       tfr->rx_sg.sgl, 1, DMA_FROM_DEVICE);
+
+		tfr->rx_sg.sgl[0].dma_address += bs->rx_prologue;
+		tfr->rx_sg.sgl[0].length      -= bs->rx_prologue;
+	}
+
+	/*
+	 * Write remaining TX prologue.  Adjust first entry in TX sglist.
+	 * Also adjust second entry if prologue spills over to it.
+	 */
+	tx_remaining = bs->tx_prologue - bs->rx_prologue;
+	if (tx_remaining) {
+		bcm2835_wr(bs, BCM2835_SPI_DLEN, tx_remaining);
+		bcm2835_wr(bs, BCM2835_SPI_CS, cs | BCM2835_SPI_CS_TA
+						  | BCM2835_SPI_CS_DMAEN);
+		bcm2835_wr_fifo_count(bs, tx_remaining);
+		bcm2835_wait_tx_fifo_empty(bs);
+		bcm2835_wr(bs, BCM2835_SPI_CS, cs | BCM2835_SPI_CS_CLEAR_TX);
+	}
+
+	if (likely(!bs->tx_spillover)) {
+		tfr->tx_sg.sgl[0].dma_address += bs->tx_prologue;
+		tfr->tx_sg.sgl[0].length      -= bs->tx_prologue;
+	} else {
+		tfr->tx_sg.sgl[0].length       = 0;
+		tfr->tx_sg.sgl[1].dma_address += 4;
+		tfr->tx_sg.sgl[1].length      -= 4;
+	}
+}
+
+/**
+ * bcm2835_spi_undo_prologue() - reconstruct original sglist state
+ * @bs: BCM2835 SPI controller
+ *
+ * Undo changes which were made to an SPI transfer's sglist when transmitting
+ * the prologue.  This is necessary to ensure the same memory ranges are
+ * unmapped that were originally mapped.
+ */
+static void bcm2835_spi_undo_prologue(struct bcm2835_spi *bs)
+{
+	struct spi_transfer *tfr = bs->tfr;
+
+	if (!bs->tx_prologue)
+		return;
+
+	if (bs->rx_prologue) {
+		tfr->rx_sg.sgl[0].dma_address -= bs->rx_prologue;
+		tfr->rx_sg.sgl[0].length      += bs->rx_prologue;
+	}
+
+	if (likely(!bs->tx_spillover)) {
+		tfr->tx_sg.sgl[0].dma_address -= bs->tx_prologue;
+		tfr->tx_sg.sgl[0].length      += bs->tx_prologue;
+	} else {
+		tfr->tx_sg.sgl[0].length       = bs->tx_prologue - 4;
+		tfr->tx_sg.sgl[1].dma_address -= 4;
+		tfr->tx_sg.sgl[1].length      += 4;
+	}
+}
+
 static void bcm2835_spi_dma_done(void *data)
 {
 	struct spi_master *master = data;
@@ -233,6 +455,7 @@  static void bcm2835_spi_dma_done(void *data)
 	 */
 	if (cmpxchg(&bs->dma_pending, true, false)) {
 		dmaengine_terminate_all(master->dma_tx);
+		bcm2835_spi_undo_prologue(bs);
 	}
 
 	/* and mark as completed */;
@@ -283,20 +506,6 @@  static int bcm2835_spi_prepare_sg(struct spi_master *master,
 	return dma_submit_error(cookie);
 }
 
-static inline int bcm2835_check_sg_length(struct sg_table *sgt)
-{
-	int i;
-	struct scatterlist *sgl;
-
-	/* check that the sg entries are word-sized (except for last) */
-	for_each_sg(sgt->sgl, sgl, (int)sgt->nents - 1, i) {
-		if (sg_dma_len(sgl) % 4)
-			return -EFAULT;
-	}
-
-	return 0;
-}
-
 static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 					struct spi_device *spi,
 					struct spi_transfer *tfr,
@@ -305,18 +514,16 @@  static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 	struct bcm2835_spi *bs = spi_master_get_devdata(master);
 	int ret;
 
-	/* check that the scatter gather segments are all a multiple of 4 */
-	if (bcm2835_check_sg_length(&tfr->tx_sg) ||
-	    bcm2835_check_sg_length(&tfr->rx_sg)) {
-		dev_warn_once(&spi->dev,
-			      "scatter gather segment length is not a multiple of 4 - falling back to interrupt mode\n");
-		return bcm2835_spi_transfer_one_irq(master, spi, tfr, cs);
-	}
+	/*
+	 * Transfer first few bytes without DMA if length of first TX or RX
+	 * sglist entry is not a multiple of 4 bytes (hardware limitation).
+	 */
+	bcm2835_spi_transfer_prologue(master, tfr, bs, cs);
 
 	/* setup tx-DMA */
 	ret = bcm2835_spi_prepare_sg(master, tfr, true);
 	if (ret)
-		return ret;
+		goto err_reset_hw;
 
 	/* start TX early */
 	dma_async_issue_pending(master->dma_tx);
@@ -325,7 +532,7 @@  static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 	bs->dma_pending = 1;
 
 	/* set the DMA length */
-	bcm2835_wr(bs, BCM2835_SPI_DLEN, tfr->len);
+	bcm2835_wr(bs, BCM2835_SPI_DLEN, bs->tx_len);
 
 	/* start the HW */
 	bcm2835_wr(bs, BCM2835_SPI_CS,
@@ -340,8 +547,7 @@  static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 		/* need to reset on errors */
 		dmaengine_terminate_all(master->dma_tx);
 		bs->dma_pending = false;
-		bcm2835_spi_reset_hw(master);
-		return ret;
+		goto err_reset_hw;
 	}
 
 	/* start rx dma late */
@@ -349,6 +555,11 @@  static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 
 	/* wait for wakeup in framework */
 	return 1;
+
+err_reset_hw:
+	bcm2835_spi_reset_hw(master);
+	bcm2835_spi_undo_prologue(bs);
+	return ret;
 }
 
 static bool bcm2835_spi_can_dma(struct spi_master *master,
@@ -372,25 +583,6 @@  static bool bcm2835_spi_can_dma(struct spi_master *master,
 		return false;
 	}
 
-	/* if we run rx/tx_buf with word aligned addresses then we are OK */
-	if ((((size_t)tfr->rx_buf & 3) == 0) &&
-	    (((size_t)tfr->tx_buf & 3) == 0))
-		return true;
-
-	/* otherwise we only allow transfers within the same page
-	 * to avoid wasting time on dma_mapping when it is not practical
-	 */
-	if (((size_t)tfr->tx_buf & (PAGE_SIZE - 1)) + tfr->len > PAGE_SIZE) {
-		dev_warn_once(&spi->dev,
-			      "Unaligned spi tx-transfer bridging page\n");
-		return false;
-	}
-	if (((size_t)tfr->rx_buf & (PAGE_SIZE - 1)) + tfr->len > PAGE_SIZE) {
-		dev_warn_once(&spi->dev,
-			      "Unaligned spi rx-transfer bridging page\n");
-		return false;
-	}
-
 	/* return OK */
 	return true;
 }
@@ -614,6 +806,7 @@  static void bcm2835_spi_handle_err(struct spi_master *master,
 	if (cmpxchg(&bs->dma_pending, true, false)) {
 		dmaengine_terminate_all(master->dma_tx);
 		dmaengine_terminate_all(master->dma_rx);
+		bcm2835_spi_undo_prologue(bs);
 	}
 	/* and reset */
 	bcm2835_spi_reset_hw(master);