@@ -56,6 +56,55 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM
+/*
+ * Per-fault context wrapped around the hmm_event handed to hmm_mirror_fault().
+ * The fault handler receives only the event pointer and recovers this
+ * structure from it with container_of().
+ */
+struct mlx5_hmm_pfault {
+ /* MR being faulted; passed to mlx5_ib_update_mtt() by the handler. */
+ struct mlx5_ib_mr *mlx5_ib_mr;
+ /* Page index of the fault start relative to the page-aligned MR iova. */
+ u64 start_idx;
+ /* ODP_READ_ALLOWED_BIT, optionally or-ed with ODP_WRITE_ALLOWED_BIT. */
+ dma_addr_t access_mask;
+ /* Incremented by the handler once per page it processed. */
+ unsigned npages;
+ /* Embedded event; its start/end/etype are filled in by the caller. */
+ struct hmm_event event;
+};
+
+/*
+ * Handle a device page fault event for an HMM mirror.
+ *
+ * Walks the mirror page table over [event->start, event->end) one page at a
+ * time, checks that every entry holds a valid DMA mapping (and is writable
+ * when write access was requested), sets the ODP read/write permission bits
+ * on each entry and finally hands the range to mlx5_ib_update_mtt().
+ *
+ * @mlx5_ib_dev: device the fault belongs to (not used by this function).
+ * @mirror: HMM mirror whose page table (mirror->pt) is walked.
+ * @event: fault event; it must be the one embedded in a
+ *         struct mlx5_hmm_pfault, which is recovered with container_of().
+ *
+ * Returns the value of mlx5_ib_update_mtt() when the walk succeeds, or
+ * -EINVAL when an entry is missing, has no valid DMA address or lacks the
+ * required write permission. The page table iterator is always finalized
+ * before returning, including on the error paths.
+ */
+static int mlx5_hmm_pfault(struct mlx5_ib_dev *mlx5_ib_dev,
+			   struct hmm_mirror *mirror,
+			   const struct hmm_event *event)
+{
+	struct mlx5_hmm_pfault *pfault;
+	struct hmm_pt_iter iter;
+	unsigned long addr, cnt;
+	int ret;
+
+	pfault = container_of(event, struct mlx5_hmm_pfault, event);
+	hmm_pt_iter_init(&iter);
+
+	for (addr = event->start, cnt = 0; addr < event->end;
+	     addr += PAGE_SIZE, ++cnt) {
+		dma_addr_t *ptep;
+
+		/* Get and lock pointer to mirror page table. */
+		ptep = hmm_pt_iter_update(&iter, &mirror->pt, addr);
+		/* This could be BUG_ON() as it can not happen. */
+		if (!ptep || !hmm_pte_test_valid_dma(ptep)) {
+			pr_warn("got empty mirror page table on pagefault.\n");
+			ret = -EINVAL;
+			goto out;
+		}
+		if ((pfault->access_mask & ODP_WRITE_ALLOWED_BIT)) {
+			if (!hmm_pte_test_write(ptep)) {
+				pr_warn("got wrong protection permission on "
+					"pagefault.\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			hmm_pte_set_bit(ptep, ODP_WRITE_ALLOWED_SHIFT);
+		}
+		hmm_pte_set_bit(ptep, ODP_READ_ALLOWED_SHIFT);
+		pfault->npages++;
+	}
+	ret = mlx5_ib_update_mtt(pfault->mlx5_ib_mr,
+				 pfault->start_idx,
+				 cnt, 0, &iter);
+out:
+	/*
+	 * Single exit point: the iterator was initialized (and may hold a
+	 * locked page table directory), so it must be finalized on the error
+	 * paths as well, not only after a successful walk.
+	 */
+	hmm_pt_iter_fini(&iter, &mirror->pt);
+	return ret;
+}
+
int mlx5_ib_umem_invalidate(struct ib_umem *umem, u64 start,
u64 end, void *cookie)
{
@@ -178,12 +227,19 @@ static int mlx5_hmm_update(struct hmm_mirror *mirror,
const struct hmm_event *event)
{
struct device *device = mirror->device->dev;
+ struct mlx5_ib_dev *mlx5_ib_dev;
+ struct ib_device *ib_device;
int ret = 0;
+ ib_device = container_of(mirror->device, struct ib_device, hmm_dev);
+ mlx5_ib_dev = to_mdev(ib_device);
+
switch (event->etype) {
case HMM_DEVICE_RFAULT:
case HMM_DEVICE_WFAULT:
- /* FIXME implement. */
+ ret = mlx5_hmm_pfault(mlx5_ib_dev, mirror, event);
+ if (ret)
+ return ret;
break;
case HMM_ISDIRTY:
hmm_mirror_range_dirty(mirror, event->start, event->end);
@@ -228,6 +284,95 @@ void mlx5_dev_fini_odp_hmm(struct ib_device *ib_device)
hmm_device_unregister(&ib_device->hmm_dev);
}
+/*
+ * Handle a single data segment in a page-fault WQE.
+ *
+ * Looks up the MR for @key under the mr_srcu read lock, validates it against
+ * the QP, then asks HMM to fault in the page-aligned range covering
+ * [io_virt, io_virt + bcnt), adjusted by the bytes already committed for this
+ * fault. When @bytes_mapped is non-NULL it is advanced by the number of bytes
+ * of the segment that are now mapped.
+ *
+ * Returns number of pages retrieved on success (0 for a non ODP MR, which is
+ * skipped). The caller will continue to the next data segment.
+ * Can return the following error codes:
+ * -EAGAIN to designate a temporary error. The caller will abort handling the
+ *  page fault and resolve it.
+ * -EFAULT when there's an error mapping the requested pages. The caller will
+ *  abort the page fault handling and possibly move the QP to an error state.
+ * On other errors the QP should also be closed with an error.
+ *
+ * pfault->mpfault.bytes_committed is reset to 0 on every exit path.
+ */
+static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
+					 struct mlx5_ib_pfault *pfault,
+					 u32 key, u64 io_virt, size_t bcnt,
+					 u32 *bytes_mapped)
+{
+	struct mlx5_ib_dev *mlx5_ib_dev = to_mdev(qp->ibqp.pd->device);
+	struct ib_mirror *ib_mirror;
+	struct mlx5_hmm_pfault hmm_pfault;
+	int srcu_key;
+	int ret = 0;
+
+	/*
+	 * The exit path returns npages whenever ret == 0, and the non ODP MR
+	 * case below bails out with ret == 0 before any fault is issued, so
+	 * the counter must be zeroed before the first goto.
+	 */
+	hmm_pfault.npages = 0;
+
+	srcu_key = srcu_read_lock(&mlx5_ib_dev->mr_srcu);
+	hmm_pfault.mlx5_ib_mr = mlx5_ib_odp_find_mr_lkey(mlx5_ib_dev, key);
+	/*
+	 * If we didn't find the MR, it means the MR was closed while we were
+	 * handling the ODP event. In this case we return -EFAULT so that the
+	 * QP will be closed.
+	 */
+	if (!hmm_pfault.mlx5_ib_mr || !hmm_pfault.mlx5_ib_mr->ibmr.pd) {
+		pr_err("Failed to find relevant mr for lkey=0x%06x, probably "
+		       "the MR was destroyed\n", key);
+		ret = -EFAULT;
+		goto srcu_unlock;
+	}
+	if (!hmm_pfault.mlx5_ib_mr->umem->odp_data) {
+		pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault "
+			 "handler.\n", key);
+		/* Nothing to fault in: count the whole remainder as mapped. */
+		if (bytes_mapped)
+			*bytes_mapped +=
+				(bcnt - pfault->mpfault.bytes_committed);
+		goto srcu_unlock;
+	}
+	if (hmm_pfault.mlx5_ib_mr->ibmr.pd != qp->ibqp.pd) {
+		pr_err("Page-fault with different PDs for QP and MR.\n");
+		ret = -EFAULT;
+		goto srcu_unlock;
+	}
+
+	ib_mirror = hmm_pfault.mlx5_ib_mr->umem->odp_data->ib_mirror;
+	if (ib_mirror->base.hmm == NULL) {
+		/* Somehow the mirror was killed from under us. */
+		ret = -EFAULT;
+		goto srcu_unlock;
+	}
+
+	/*
+	 * Avoid branches - this code will perform correctly
+	 * in all iterations (in iteration 2 and above,
+	 * bytes_committed == 0).
+	 */
+	io_virt += pfault->mpfault.bytes_committed;
+	bcnt -= pfault->mpfault.bytes_committed;
+
+	/* Page index of the fault start relative to the page-aligned iova. */
+	hmm_pfault.start_idx = (io_virt - (hmm_pfault.mlx5_ib_mr->mmr.iova &
+					   PAGE_MASK)) >> PAGE_SHIFT;
+	hmm_pfault.access_mask = ODP_READ_ALLOWED_BIT;
+	hmm_pfault.access_mask |= hmm_pfault.mlx5_ib_mr->umem->writable ?
+				  ODP_WRITE_ALLOWED_BIT : 0;
+	/* Fault whole pages: round the start down and the end up. */
+	hmm_pfault.event.start = io_virt & PAGE_MASK;
+	hmm_pfault.event.end = PAGE_ALIGN(io_virt + bcnt);
+	hmm_pfault.event.etype = hmm_pfault.mlx5_ib_mr->umem->writable ?
+				 HMM_DEVICE_WFAULT : HMM_DEVICE_RFAULT;
+	ret = hmm_mirror_fault(&ib_mirror->base, &hmm_pfault.event);
+
+	if (!ret && hmm_pfault.npages && bytes_mapped) {
+		/*
+		 * Bytes covered by the faulted pages, minus the offset of
+		 * io_virt inside its first page, capped at the segment size.
+		 */
+		u32 new_mappings = hmm_pfault.npages * PAGE_SIZE -
+			(io_virt - round_down(io_virt, PAGE_SIZE));
+		*bytes_mapped += min_t(u32, new_mappings, bcnt);
+	}
+
+srcu_unlock:
+	srcu_read_unlock(&mlx5_ib_dev->mr_srcu, srcu_key);
+	pfault->mpfault.bytes_committed = 0;
+	return ret ? ret : hmm_pfault.npages;
+}
+
#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING_HMM */