Message ID | 146706898537.188649.13633369547142674206.stgit@djiang5-desk3.ch.intel.com (mailing list archive) |
---|---|
State | Superseded |
Headers | show |
On Mon, Jun 27, 2016 at 7:09 PM, Dave Jiang <dave.jiang@intel.com> wrote: > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > --- > drivers/dma/ioat/dma.c | 136 ++++++++++++++++++++++++++++++++++++------ > drivers/dma/ioat/registers.h | 2 + > 2 files changed, 120 insertions(+), 18 deletions(-) > > diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c > index bd09961..1bbc005 100644 > --- a/drivers/dma/ioat/dma.c > +++ b/drivers/dma/ioat/dma.c > @@ -622,7 +622,8 @@ static void ioat_cleanup(struct ioatdma_chan *ioat_chan) > if (is_ioat_halted(*ioat_chan->completion)) { > u32 chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET); > > - if (chanerr & IOAT_CHANERR_HANDLE_MASK) { > + if (chanerr & > + (IOAT_CHANERR_HANDLE_MASK | IOAT_CHANERR_RECOVER_MASK)) { > mod_timer(&ioat_chan->timer, jiffies + IDLE_TIMEOUT); > ioat_eh(ioat_chan); > } > @@ -652,6 +653,60 @@ static void ioat_restart_channel(struct ioatdma_chan *ioat_chan) > __ioat_restart_chan(ioat_chan); > } > > + > +static void ioat_abort_descs(struct ioatdma_chan *ioat_chan) > +{ > + struct ioatdma_device *ioat_dma = ioat_chan->ioat_dma; > + struct ioat_ring_ent *desc; > + u16 active; > + int idx = ioat_chan->tail, i; > + > + /* > + * We assume that the failed descriptor has been processed. > + * Now we are just returning all the remaining submitted > + * descriptors to abort. > + */ > + active = ioat_ring_active(ioat_chan); > + > + /* we skip the failed descriptor that tail points to */ > + for (i = 1; i < active; i++) { > + struct dma_async_tx_descriptor *tx; > + > + smp_read_barrier_depends(); > + prefetch(ioat_get_ring_ent(ioat_chan, idx + i + 1)); > + desc = ioat_get_ring_ent(ioat_chan, idx + i); > + desc->txd.result |= ERR_DMA_ABORT; See below, txd.result doesn't seem to be a bit mask. 
> + > + tx = &desc->txd; > + if (tx->cookie) { > + dma_cookie_complete(tx); > + dma_descriptor_unmap(tx); > + if (tx->callback) { > + tx->callback(tx->callback_param); > + tx->callback = NULL; > + } > + } > + > + /* skip extended descriptors */ > + if (desc_has_ext(desc)) { > + WARN_ON(i + 1 >= active); > + i++; Is it always true that extended descriptors (note: descriptors is plural) mean "exactly one?" Otherwise, ++ ought to be += however many. I didn't notice anything in dma_prep.c that would use more than one extra. > + } > + > + /* cleanup super extended descriptors */ > + if (desc->sed) { > + ioat_free_sed(ioat_dma, desc->sed); > + desc->sed = NULL; > + } > + } > + > + smp_mb(); /* finish all descriptor reads before incrementing tail */ > + ioat_chan->tail = idx + i; Would it also be correct to say `idx + active` here? If so, that might be more clear. > + > + desc = ioat_get_ring_ent(ioat_chan, ioat_chan->tail); > + ioat_chan->last_completion = *ioat_chan->completion = desc->txd.phys; > +} > + > static void ioat_eh(struct ioatdma_chan *ioat_chan) > { > struct pci_dev *pdev = to_pdev(ioat_chan); > @@ -662,6 +717,7 @@ static void ioat_eh(struct ioatdma_chan *ioat_chan) > u32 err_handled = 0; > u32 chanerr_int; > u32 chanerr; > + bool abort = false; > > /* cleanup so tail points to descriptor that caused the error */ > if (ioat_cleanup_preamble(ioat_chan, &phys_complete)) > @@ -697,30 +753,50 @@ static void ioat_eh(struct ioatdma_chan *ioat_chan) > break; > } > > + if (chanerr & IOAT_CHANERR_RECOVER_MASK) { > + if (chanerr & IOAT_CHANERR_READ_DATA_ERR) { > + desc->txd.result |= ERR_DMA_READ; In [PATCH 1/5] ERR_DMA_XYZ values do not look appropriate for use as a bit mask. +enum err_result_flags { + ERR_DMA_NONE = 0, + ERR_DMA_READ, + ERR_DMA_WRITE, + ERR_DMA_ABORT, +}; And in [PATCH 4/5] and [PATCH 5/5] txd.result is not used as a bit mask. 
+ switch (txd->result) { + case ERR_DMA_READ: + case ERR_DMA_WRITE: + entry->errors++; + case ERR_DMA_ABORT: + flags = DESC_ABORT_FLAG; + break; + case ERR_DMA_NONE: + default: + break; + } > + err_handled |= IOAT_CHANERR_READ_DATA_ERR; > + } else if (chanerr & IOAT_CHANERR_WRITE_DATA_ERR) { > + desc->txd.result |= ERR_DMA_WRITE; > + err_handled |= IOAT_CHANERR_WRITE_DATA_ERR; > + } > + > + abort = true; > + } > + > /* fault on unhandled error or spurious halt */ > if (chanerr ^ err_handled || chanerr == 0) { Not part of your change, but chanerr == 0 is confusing. Doesn't that mean "no errors?" No errors would make it hard to justify BUG() that follows. > dev_err(to_dev(ioat_chan), "%s: fatal error (%x:%x)\n", > __func__, chanerr, err_handled); > BUG(); > - } else { /* cleanup the faulty descriptor */ > - tx = &desc->txd; > - if (tx->cookie) { > - dma_cookie_complete(tx); > - dma_descriptor_unmap(tx); > - if (tx->callback) { > - tx->callback(tx->callback_param); > - tx->callback = NULL; > - } > - } > } > > - writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET); > - pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, chanerr_int); > + /* cleanup the faulty descriptor since we are continuing */ > + tx = &desc->txd; > + if (tx->cookie) { > + dma_cookie_complete(tx); > + dma_descriptor_unmap(tx); > + if (tx->callback) { > + tx->callback(tx->callback_param); > + tx->callback = NULL; > + } > + } > > /* mark faulting descriptor as complete */ > *ioat_chan->completion = desc->txd.phys; > > spin_lock_bh(&ioat_chan->prep_lock); > + /* we need abort all descriptors */ > + if (abort) { > + ioat_abort_descs(ioat_chan); > + /* clean up the channel, we could be in weird state */ > + ioat_reset_hw(ioat_chan); > + } > + > + writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET); > + pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, chanerr_int); > + > ioat_restart_channel(ioat_chan); > spin_unlock_bh(&ioat_chan->prep_lock); > } > @@ -753,10 +829,25 @@ void 
ioat_timer_event(unsigned long data) > chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET); > dev_err(to_dev(ioat_chan), "%s: Channel halted (%x)\n", > __func__, chanerr); > - if (test_bit(IOAT_RUN, &ioat_chan->state)) > - BUG_ON(is_ioat_bug(chanerr)); > - else /* we never got off the ground */ > - return; > + if (test_bit(IOAT_RUN, &ioat_chan->state)) { > + spin_lock_bh(&ioat_chan->cleanup_lock); > + spin_lock_bh(&ioat_chan->prep_lock); > + set_bit(IOAT_CHAN_DOWN, &ioat_chan->state); > + spin_unlock_bh(&ioat_chan->prep_lock); > + > + ioat_abort_descs(ioat_chan); > + dev_warn(to_dev(ioat_chan), "Reset channel...\n"); > + ioat_reset_hw(ioat_chan); > + dev_warn(to_dev(ioat_chan), "Restart channel...\n"); > + ioat_restart_channel(ioat_chan); > + > + spin_lock_bh(&ioat_chan->prep_lock); > + clear_bit(IOAT_CHAN_DOWN, &ioat_chan->state); > + spin_unlock_bh(&ioat_chan->prep_lock); > + spin_unlock_bh(&ioat_chan->cleanup_lock); > + } > + > + return; > } > > spin_lock_bh(&ioat_chan->cleanup_lock); > @@ -780,14 +871,23 @@ void ioat_timer_event(unsigned long data) > u32 chanerr; > > chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET); > - dev_warn(to_dev(ioat_chan), "Restarting channel...\n"); > dev_warn(to_dev(ioat_chan), "CHANSTS: %#Lx CHANERR: %#x\n", > status, chanerr); > dev_warn(to_dev(ioat_chan), "Active descriptors: %d\n", > ioat_ring_active(ioat_chan)); > > spin_lock_bh(&ioat_chan->prep_lock); > + set_bit(IOAT_CHAN_DOWN, &ioat_chan->state); > + spin_unlock_bh(&ioat_chan->prep_lock); > + > + ioat_abort_descs(ioat_chan); > + dev_warn(to_dev(ioat_chan), "Resetting channel...\n"); > + ioat_reset_hw(ioat_chan); > + dev_warn(to_dev(ioat_chan), "Restarting channel...\n"); > ioat_restart_channel(ioat_chan); > + > + spin_lock_bh(&ioat_chan->prep_lock); > + clear_bit(IOAT_CHAN_DOWN, &ioat_chan->state); > spin_unlock_bh(&ioat_chan->prep_lock); > spin_unlock_bh(&ioat_chan->cleanup_lock); > return; > diff --git a/drivers/dma/ioat/registers.h 
b/drivers/dma/ioat/registers.h > index 4994a36..faf20fa 100644 > --- a/drivers/dma/ioat/registers.h > +++ b/drivers/dma/ioat/registers.h > @@ -233,6 +233,8 @@ > #define IOAT_CHANERR_DESCRIPTOR_COUNT_ERR 0x40000 > > #define IOAT_CHANERR_HANDLE_MASK (IOAT_CHANERR_XOR_P_OR_CRC_ERR | IOAT_CHANERR_XOR_Q_ERR) > +#define IOAT_CHANERR_RECOVER_MASK (IOAT_CHANERR_READ_DATA_ERR | \ > + IOAT_CHANERR_WRITE_DATA_ERR) > > #define IOAT_CHANERR_MASK_OFFSET 0x2C /* 32-bit Channel Error Register */ > > -- To unsubscribe from this list: send the line "unsubscribe dmaengine" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
On Tue, 2016-06-28 at 01:08 +0000, Allen Hubbe wrote: > On Mon, Jun 27, 2016 at 7:09 PM, Dave Jiang <dave.jiang@intel.com> > wrote: > > Signed-off-by: Dave Jiang <dave.jiang@intel.com> > > --- > > drivers/dma/ioat/dma.c | 136 > > ++++++++++++++++++++++++++++++++++++------ > > drivers/dma/ioat/registers.h | 2 + > > 2 files changed, 120 insertions(+), 18 deletions(-) > > > > diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c > > index bd09961..1bbc005 100644 > > --- a/drivers/dma/ioat/dma.c > > +++ b/drivers/dma/ioat/dma.c > > @@ -622,7 +622,8 @@ static void ioat_cleanup(struct ioatdma_chan > > *ioat_chan) > > if (is_ioat_halted(*ioat_chan->completion)) { > > u32 chanerr = readl(ioat_chan->reg_base + > > IOAT_CHANERR_OFFSET); > > > > - if (chanerr & IOAT_CHANERR_HANDLE_MASK) { > > + if (chanerr & > > + (IOAT_CHANERR_HANDLE_MASK | > > IOAT_CHANERR_RECOVER_MASK)) { > > mod_timer(&ioat_chan->timer, jiffies + > > IDLE_TIMEOUT); > > ioat_eh(ioat_chan); > > } > > @@ -652,6 +653,60 @@ static void ioat_restart_channel(struct > > ioatdma_chan *ioat_chan) > > __ioat_restart_chan(ioat_chan); > > } > > > > + > > +static void ioat_abort_descs(struct ioatdma_chan *ioat_chan) > > +{ > > + struct ioatdma_device *ioat_dma = ioat_chan->ioat_dma; > > + struct ioat_ring_ent *desc; > > + u16 active; > > + int idx = ioat_chan->tail, i; > > + > > + /* > > + * We assume that the failed descriptor has been processed. > > + * Now we are just returning all the remaining submitted > > + * descriptors to abort. > > + */ > > + active = ioat_ring_active(ioat_chan); > > + > > + /* we skip the failed descriptor that tail points to */ > > + for (i = 1; i < active; i++) { > > + struct dma_async_tx_descriptor *tx; > > + > > + smp_read_barrier_depends(); > > + prefetch(ioat_get_ring_ent(ioat_chan, idx + i + > > 1)); > > + desc = ioat_get_ring_ent(ioat_chan, idx + i); > > + desc->txd.result |= ERR_DMA_ABORT; > > See below, txd.result doesn't seem to be a bit mask. I'll fix that. 
It shouldn't be a bit mask. > > > + > > + tx = &desc->txd; > > + if (tx->cookie) { > > + dma_cookie_complete(tx); > > + dma_descriptor_unmap(tx); > > + if (tx->callback) { > > + tx->callback(tx->callback_param); > > + tx->callback = NULL; > > + } > > + } > > + > > + /* skip extended descriptors */ > > + if (desc_has_ext(desc)) { > > + WARN_ON(i + 1 >= active); > > + i++; > > Is it always true that extended descriptors (note: descriptors is > plural) means "exactly one?" Otherwise, ++ aught to be += how how > many. I didn't notice anything in dma_prep.c that would use more > than > one extra. There can only be 1 extended descriptor always. > > > + } > > + > > + /* cleanup super extended descriptors */ > > + if (desc->sed) { > > + ioat_free_sed(ioat_dma, desc->sed); > > + desc->sed = NULL; > > + } > > + } > > + > > + smp_mb(); /* finish all descriptor reads before > > incrementing tail */ > > + ioat_chan->tail = idx + i; > > Would it also be correct to say `idx + active` here? If so, that > might be more clear. ok > > > + > > + desc = ioat_get_ring_ent(ioat_chan, ioat_chan->tail); > > + ioat_chan->last_completion = *ioat_chan->completion = desc- > > >txd.phys; > > +} > > + > > static void ioat_eh(struct ioatdma_chan *ioat_chan) > > { > > struct pci_dev *pdev = to_pdev(ioat_chan); > > @@ -662,6 +717,7 @@ static void ioat_eh(struct ioatdma_chan > > *ioat_chan) > > u32 err_handled = 0; > > u32 chanerr_int; > > u32 chanerr; > > + bool abort = false; > > > > /* cleanup so tail points to descriptor that caused the > > error */ > > if (ioat_cleanup_preamble(ioat_chan, &phys_complete)) > > @@ -697,30 +753,50 @@ static void ioat_eh(struct ioatdma_chan > > *ioat_chan) > > break; > > } > > > > + if (chanerr & IOAT_CHANERR_RECOVER_MASK) { > > + if (chanerr & IOAT_CHANERR_READ_DATA_ERR) { > > + desc->txd.result |= ERR_DMA_READ; > > In [PATCH 1/5] ERR_DMA_XYZ values do not look appropriate for use as > a bit mask. 
> > +enum err_result_flags { > + ERR_DMA_NONE = 0, > + ERR_DMA_READ, > + ERR_DMA_WRITE, > + ERR_DMA_ABORT, > +}; > > And in [PATCH 4/5] and [PATCH 5/5] txd.result is not used as a bit > mask. > > + switch (txd->result) { > + case ERR_DMA_READ: > + case ERR_DMA_WRITE: > + entry->errors++; > + case ERR_DMA_ABORT: > + flags = DESC_ABORT_FLAG; > + break; > + case ERR_DMA_NONE: > + default: > + break; > + } > > > + err_handled |= IOAT_CHANERR_READ_DATA_ERR; > > + } else if (chanerr & IOAT_CHANERR_WRITE_DATA_ERR) { > > + desc->txd.result |= ERR_DMA_WRITE; > > + err_handled |= IOAT_CHANERR_WRITE_DATA_ERR; > > + } > > + > > + abort = true; > > + } > > + > > /* fault on unhandled error or spurious halt */ > > if (chanerr ^ err_handled || chanerr == 0) { > > Not part of your change, but chanerr == 0 is confusing. Doesn't that > mean "no errors?" No errors would make it hard to justify BUG() that > follows. Legacy code. This is the beginning of me cleaning out all the BUG() in the driver and putting in proper error handling. 
> > > dev_err(to_dev(ioat_chan), "%s: fatal error > > (%x:%x)\n", > > __func__, chanerr, err_handled); > > BUG(); > > - } else { /* cleanup the faulty descriptor */ > > - tx = &desc->txd; > > - if (tx->cookie) { > > - dma_cookie_complete(tx); > > - dma_descriptor_unmap(tx); > > - if (tx->callback) { > > - tx->callback(tx->callback_param); > > - tx->callback = NULL; > > - } > > - } > > } > > > > - writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET); > > - pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, > > chanerr_int); > > + /* cleanup the faulty descriptor since we are continuing */ > > + tx = &desc->txd; > > + if (tx->cookie) { > > + dma_cookie_complete(tx); > > + dma_descriptor_unmap(tx); > > + if (tx->callback) { > > + tx->callback(tx->callback_param); > > + tx->callback = NULL; > > + } > > + } > > > > /* mark faulting descriptor as complete */ > > *ioat_chan->completion = desc->txd.phys; > > > > spin_lock_bh(&ioat_chan->prep_lock); > > + /* we need abort all descriptors */ > > + if (abort) { > > + ioat_abort_descs(ioat_chan); > > + /* clean up the channel, we could be in weird state > > */ > > + ioat_reset_hw(ioat_chan); > > + } > > + > > + writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET); > > + pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, > > chanerr_int); > > + > > ioat_restart_channel(ioat_chan); > > spin_unlock_bh(&ioat_chan->prep_lock); > > } > > @@ -753,10 +829,25 @@ void ioat_timer_event(unsigned long data) > > chanerr = readl(ioat_chan->reg_base + > > IOAT_CHANERR_OFFSET); > > dev_err(to_dev(ioat_chan), "%s: Channel halted > > (%x)\n", > > __func__, chanerr); > > - if (test_bit(IOAT_RUN, &ioat_chan->state)) > > - BUG_ON(is_ioat_bug(chanerr)); > > - else /* we never got off the ground */ > > - return; > > + if (test_bit(IOAT_RUN, &ioat_chan->state)) { > > + spin_lock_bh(&ioat_chan->cleanup_lock); > > + spin_lock_bh(&ioat_chan->prep_lock); > > + set_bit(IOAT_CHAN_DOWN, &ioat_chan->state); > > + 
spin_unlock_bh(&ioat_chan->prep_lock); > > + > > + ioat_abort_descs(ioat_chan); > > + dev_warn(to_dev(ioat_chan), "Reset > > channel...\n"); > > + ioat_reset_hw(ioat_chan); > > + dev_warn(to_dev(ioat_chan), "Restart > > channel...\n"); > > + ioat_restart_channel(ioat_chan); > > + > > + spin_lock_bh(&ioat_chan->prep_lock); > > + clear_bit(IOAT_CHAN_DOWN, &ioat_chan- > > >state); > > + spin_unlock_bh(&ioat_chan->prep_lock); > > + spin_unlock_bh(&ioat_chan->cleanup_lock); > > + } > > + > > + return; > > } > > > > spin_lock_bh(&ioat_chan->cleanup_lock); > > @@ -780,14 +871,23 @@ void ioat_timer_event(unsigned long data) > > u32 chanerr; > > > > chanerr = readl(ioat_chan->reg_base + > > IOAT_CHANERR_OFFSET); > > - dev_warn(to_dev(ioat_chan), "Restarting > > channel...\n"); > > dev_warn(to_dev(ioat_chan), "CHANSTS: %#Lx CHANERR: > > %#x\n", > > status, chanerr); > > dev_warn(to_dev(ioat_chan), "Active descriptors: > > %d\n", > > ioat_ring_active(ioat_chan)); > > > > spin_lock_bh(&ioat_chan->prep_lock); > > + set_bit(IOAT_CHAN_DOWN, &ioat_chan->state); > > + spin_unlock_bh(&ioat_chan->prep_lock); > > + > > + ioat_abort_descs(ioat_chan); > > + dev_warn(to_dev(ioat_chan), "Resetting > > channel...\n"); > > + ioat_reset_hw(ioat_chan); > > + dev_warn(to_dev(ioat_chan), "Restarting > > channel...\n"); > > ioat_restart_channel(ioat_chan); > > + > > + spin_lock_bh(&ioat_chan->prep_lock); > > + clear_bit(IOAT_CHAN_DOWN, &ioat_chan->state); > > spin_unlock_bh(&ioat_chan->prep_lock); > > spin_unlock_bh(&ioat_chan->cleanup_lock); > > return; > > diff --git a/drivers/dma/ioat/registers.h > > b/drivers/dma/ioat/registers.h > > index 4994a36..faf20fa 100644 > > --- a/drivers/dma/ioat/registers.h > > +++ b/drivers/dma/ioat/registers.h > > @@ -233,6 +233,8 @@ > > #define IOAT_CHANERR_DESCRIPTOR_COUNT_ERR 0x40000 > > > > #define IOAT_CHANERR_HANDLE_MASK (IOAT_CHANERR_XOR_P_OR_CRC_ERR | > > IOAT_CHANERR_XOR_Q_ERR) > > +#define IOAT_CHANERR_RECOVER_MASK (IOAT_CHANERR_READ_DATA_ERR | \ > > + 
IOAT_CHANERR_WRITE_DATA_ERR) > > > > #define IOAT_CHANERR_MASK_OFFSET 0x2C /* 32-bit > > Channel Error Register */ > > > > > > --
diff --git a/drivers/dma/ioat/dma.c b/drivers/dma/ioat/dma.c index bd09961..1bbc005 100644 --- a/drivers/dma/ioat/dma.c +++ b/drivers/dma/ioat/dma.c @@ -622,7 +622,8 @@ static void ioat_cleanup(struct ioatdma_chan *ioat_chan) if (is_ioat_halted(*ioat_chan->completion)) { u32 chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET); - if (chanerr & IOAT_CHANERR_HANDLE_MASK) { + if (chanerr & + (IOAT_CHANERR_HANDLE_MASK | IOAT_CHANERR_RECOVER_MASK)) { mod_timer(&ioat_chan->timer, jiffies + IDLE_TIMEOUT); ioat_eh(ioat_chan); } @@ -652,6 +653,60 @@ static void ioat_restart_channel(struct ioatdma_chan *ioat_chan) __ioat_restart_chan(ioat_chan); } + +static void ioat_abort_descs(struct ioatdma_chan *ioat_chan) +{ + struct ioatdma_device *ioat_dma = ioat_chan->ioat_dma; + struct ioat_ring_ent *desc; + u16 active; + int idx = ioat_chan->tail, i; + + /* + * We assume that the failed descriptor has been processed. + * Now we are just returning all the remaining submitted + * descriptors to abort. 
+ */ + active = ioat_ring_active(ioat_chan); + + /* we skip the failed descriptor that tail points to */ + for (i = 1; i < active; i++) { + struct dma_async_tx_descriptor *tx; + + smp_read_barrier_depends(); + prefetch(ioat_get_ring_ent(ioat_chan, idx + i + 1)); + desc = ioat_get_ring_ent(ioat_chan, idx + i); + desc->txd.result |= ERR_DMA_ABORT; + + tx = &desc->txd; + if (tx->cookie) { + dma_cookie_complete(tx); + dma_descriptor_unmap(tx); + if (tx->callback) { + tx->callback(tx->callback_param); + tx->callback = NULL; + } + } + + /* skip extended descriptors */ + if (desc_has_ext(desc)) { + WARN_ON(i + 1 >= active); + i++; + } + + /* cleanup super extended descriptors */ + if (desc->sed) { + ioat_free_sed(ioat_dma, desc->sed); + desc->sed = NULL; + } + } + + smp_mb(); /* finish all descriptor reads before incrementing tail */ + ioat_chan->tail = idx + i; + + desc = ioat_get_ring_ent(ioat_chan, ioat_chan->tail); + ioat_chan->last_completion = *ioat_chan->completion = desc->txd.phys; +} + static void ioat_eh(struct ioatdma_chan *ioat_chan) { struct pci_dev *pdev = to_pdev(ioat_chan); @@ -662,6 +717,7 @@ static void ioat_eh(struct ioatdma_chan *ioat_chan) u32 err_handled = 0; u32 chanerr_int; u32 chanerr; + bool abort = false; /* cleanup so tail points to descriptor that caused the error */ if (ioat_cleanup_preamble(ioat_chan, &phys_complete)) @@ -697,30 +753,50 @@ static void ioat_eh(struct ioatdma_chan *ioat_chan) break; } + if (chanerr & IOAT_CHANERR_RECOVER_MASK) { + if (chanerr & IOAT_CHANERR_READ_DATA_ERR) { + desc->txd.result |= ERR_DMA_READ; + err_handled |= IOAT_CHANERR_READ_DATA_ERR; + } else if (chanerr & IOAT_CHANERR_WRITE_DATA_ERR) { + desc->txd.result |= ERR_DMA_WRITE; + err_handled |= IOAT_CHANERR_WRITE_DATA_ERR; + } + + abort = true; + } + /* fault on unhandled error or spurious halt */ if (chanerr ^ err_handled || chanerr == 0) { dev_err(to_dev(ioat_chan), "%s: fatal error (%x:%x)\n", __func__, chanerr, err_handled); BUG(); - } else { /* cleanup the 
faulty descriptor */ - tx = &desc->txd; - if (tx->cookie) { - dma_cookie_complete(tx); - dma_descriptor_unmap(tx); - if (tx->callback) { - tx->callback(tx->callback_param); - tx->callback = NULL; - } - } } - writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET); - pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, chanerr_int); + /* cleanup the faulty descriptor since we are continuing */ + tx = &desc->txd; + if (tx->cookie) { + dma_cookie_complete(tx); + dma_descriptor_unmap(tx); + if (tx->callback) { + tx->callback(tx->callback_param); + tx->callback = NULL; + } + } /* mark faulting descriptor as complete */ *ioat_chan->completion = desc->txd.phys; spin_lock_bh(&ioat_chan->prep_lock); + /* we need abort all descriptors */ + if (abort) { + ioat_abort_descs(ioat_chan); + /* clean up the channel, we could be in weird state */ + ioat_reset_hw(ioat_chan); + } + + writel(chanerr, ioat_chan->reg_base + IOAT_CHANERR_OFFSET); + pci_write_config_dword(pdev, IOAT_PCI_CHANERR_INT_OFFSET, chanerr_int); + ioat_restart_channel(ioat_chan); spin_unlock_bh(&ioat_chan->prep_lock); } @@ -753,10 +829,25 @@ void ioat_timer_event(unsigned long data) chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET); dev_err(to_dev(ioat_chan), "%s: Channel halted (%x)\n", __func__, chanerr); - if (test_bit(IOAT_RUN, &ioat_chan->state)) - BUG_ON(is_ioat_bug(chanerr)); - else /* we never got off the ground */ - return; + if (test_bit(IOAT_RUN, &ioat_chan->state)) { + spin_lock_bh(&ioat_chan->cleanup_lock); + spin_lock_bh(&ioat_chan->prep_lock); + set_bit(IOAT_CHAN_DOWN, &ioat_chan->state); + spin_unlock_bh(&ioat_chan->prep_lock); + + ioat_abort_descs(ioat_chan); + dev_warn(to_dev(ioat_chan), "Reset channel...\n"); + ioat_reset_hw(ioat_chan); + dev_warn(to_dev(ioat_chan), "Restart channel...\n"); + ioat_restart_channel(ioat_chan); + + spin_lock_bh(&ioat_chan->prep_lock); + clear_bit(IOAT_CHAN_DOWN, &ioat_chan->state); + spin_unlock_bh(&ioat_chan->prep_lock); + 
spin_unlock_bh(&ioat_chan->cleanup_lock); + } + + return; } spin_lock_bh(&ioat_chan->cleanup_lock); @@ -780,14 +871,23 @@ void ioat_timer_event(unsigned long data) u32 chanerr; chanerr = readl(ioat_chan->reg_base + IOAT_CHANERR_OFFSET); - dev_warn(to_dev(ioat_chan), "Restarting channel...\n"); dev_warn(to_dev(ioat_chan), "CHANSTS: %#Lx CHANERR: %#x\n", status, chanerr); dev_warn(to_dev(ioat_chan), "Active descriptors: %d\n", ioat_ring_active(ioat_chan)); spin_lock_bh(&ioat_chan->prep_lock); + set_bit(IOAT_CHAN_DOWN, &ioat_chan->state); + spin_unlock_bh(&ioat_chan->prep_lock); + + ioat_abort_descs(ioat_chan); + dev_warn(to_dev(ioat_chan), "Resetting channel...\n"); + ioat_reset_hw(ioat_chan); + dev_warn(to_dev(ioat_chan), "Restarting channel...\n"); ioat_restart_channel(ioat_chan); + + spin_lock_bh(&ioat_chan->prep_lock); + clear_bit(IOAT_CHAN_DOWN, &ioat_chan->state); spin_unlock_bh(&ioat_chan->prep_lock); spin_unlock_bh(&ioat_chan->cleanup_lock); return; diff --git a/drivers/dma/ioat/registers.h b/drivers/dma/ioat/registers.h index 4994a36..faf20fa 100644 --- a/drivers/dma/ioat/registers.h +++ b/drivers/dma/ioat/registers.h @@ -233,6 +233,8 @@ #define IOAT_CHANERR_DESCRIPTOR_COUNT_ERR 0x40000 #define IOAT_CHANERR_HANDLE_MASK (IOAT_CHANERR_XOR_P_OR_CRC_ERR | IOAT_CHANERR_XOR_Q_ERR) +#define IOAT_CHANERR_RECOVER_MASK (IOAT_CHANERR_READ_DATA_ERR | \ + IOAT_CHANERR_WRITE_DATA_ERR) #define IOAT_CHANERR_MASK_OFFSET 0x2C /* 32-bit Channel Error Register */
Signed-off-by: Dave Jiang <dave.jiang@intel.com> --- drivers/dma/ioat/dma.c | 136 ++++++++++++++++++++++++++++++++++++------ drivers/dma/ioat/registers.h | 2 + 2 files changed, 120 insertions(+), 18 deletions(-) -- To unsubscribe from this list: send the line "unsubscribe dmaengine" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html