@@ -1186,7 +1186,9 @@ struct ptlrpc_bulk_desc {
/** completed with failure */
unsigned long bd_failure:1;
/** client side */
- unsigned long bd_registered:1;
+ unsigned long bd_registered:1,
+ /** bulk request is an RDMA transfer, use page->host as the real address */
+ bd_is_rdma:1;
/** For serialization with callback */
spinlock_t bd_lock;
/** {put,get}{source,sink}{kiov} */
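The new bd_is_rdma bit shares an unsigned long with bd_registered, so both flags live in the same storage unit; updates presumably rely on the descriptor's existing serialization (bd_lock and the single-writer registration path) rather than on any atomicity of their own. A minimal stand-alone sketch of that packing, using mock field names rather than the real ptlrpc_bulk_desc:

/* Mock descriptor: two one-bit flags packed into one unsigned long,
 * mirroring bd_registered/bd_is_rdma.  Writes to bits that share a
 * storage unit must not race with each other. */
#include <stdio.h>

struct bulk_desc_sketch {
	unsigned long	registered:1,	/* bulk registered with LNet */
			is_rdma:1;	/* payload moves by RDMA (GPU memory) */
};

int main(void)
{
	struct bulk_desc_sketch d = { .registered = 1, .is_rdma = 1 };

	printf("sizeof=%zu registered=%u is_rdma=%u\n", sizeof(d),
	       (unsigned int)d.registered, (unsigned int)d.is_rdma);
	return 0;
}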
@@ -1416,6 +1416,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,
const char *obd_name = cli->cl_import->imp_obd->obd_name;
struct inode *inode = NULL;
bool directio = false;
+ bool gpu = false;
bool enable_checksum = true;
struct cl_page *clpage;
@@ -1581,6 +1582,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,
if (brw_page2oap(pga[0])->oap_brw_flags & OBD_BRW_RDMA_ONLY) {
enable_checksum = false;
short_io_size = 0;
+ gpu = true;
}
/* Check if read/write is small enough to be a short io. */
@@ -1632,6 +1634,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,
goto out;
}
/* NB request now owns desc and will free it when it gets freed */
+ desc->bd_is_rdma = gpu;
no_bulk:
body = req_capsule_client_get(pill, &RMF_OST_BODY);
ioobj = req_capsule_client_get(pill, &RMF_OBD_IOOBJ);
@@ -58,6 +58,9 @@ void ptlrpc_fill_bulk_md(struct lnet_md *md, struct ptlrpc_bulk_desc *desc,
return;
}
+ if (desc->bd_is_rdma)
+ md->options |= LNET_MD_GPU_ADDR;
+
if (mdidx == (desc->bd_md_count - 1))
md->length = desc->bd_iov_count - start;
else
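Read together with the osc_brw_prep_request() hunks above, this completes a flag relay: OBD_BRW_RDMA_ONLY on the first brw page disables checksums and short I/O and tags the bulk descriptor (bd_is_rdma), and ptlrpc_fill_bulk_md() turns that tag into LNET_MD_GPU_ADDR in the MD options handed to LNet. A compilable user-space sketch of the relay, with mock structs and illustrative flag values standing in for the real Lustre definitions:

/* Sketch only: mock flags and structs approximating the relay from
 * OBD_BRW_RDMA_ONLY -> bd_is_rdma -> LNET_MD_GPU_ADDR. */
#include <stdbool.h>
#include <stdio.h>

#define MOCK_OBD_BRW_RDMA_ONLY	0x1000		/* stands in for OBD_BRW_RDMA_ONLY */
#define MOCK_LNET_MD_GPU_ADDR	(1 << 13)	/* stands in for LNET_MD_GPU_ADDR */

struct mock_bulk_desc { unsigned long is_rdma:1; };
struct mock_lnet_md   { unsigned int options; };

/* analogue of osc_brw_prep_request(): RDMA-only pages disable checksums
 * and short I/O and tag the descriptor */
static void prep_request(unsigned int brw_flags, struct mock_bulk_desc *desc,
			 bool *checksum, unsigned int *short_io_size)
{
	if (brw_flags & MOCK_OBD_BRW_RDMA_ONLY) {
		*checksum = false;
		*short_io_size = 0;
		desc->is_rdma = 1;
	}
}

/* analogue of ptlrpc_fill_bulk_md(): descriptor bit becomes an MD option bit */
static void fill_bulk_md(const struct mock_bulk_desc *desc, struct mock_lnet_md *md)
{
	if (desc->is_rdma)
		md->options |= MOCK_LNET_MD_GPU_ADDR;
}

int main(void)
{
	struct mock_bulk_desc desc = { 0 };
	struct mock_lnet_md md = { 0 };
	bool checksum = true;
	unsigned int short_io = 16384;

	prep_request(MOCK_OBD_BRW_RDMA_ONLY, &desc, &checksum, &short_io);
	fill_bulk_md(&desc, &md);
	printf("checksum=%d short_io=%u gpu_md=%d\n",
	       checksum, short_io, !!(md.options & MOCK_LNET_MD_GPU_ADDR));
	return 0;
}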
@@ -138,8 +138,6 @@ struct lnet_msg {
enum lnet_msg_hstatus msg_health_status;
/* This is a recovery message */
bool msg_recovery;
- /* force an RDMA even if the message size is < 4K */
- bool msg_rdma_force;
/* the number of times a transmission has been retried */
int msg_retry_count;
/* flag to indicate that we do not want to resend this message */
@@ -245,6 +243,7 @@ struct lnet_libmd {
*/
#define LNET_MD_FLAG_HANDLING BIT(3)
#define LNET_MD_FLAG_DISCARD BIT(4)
+#define LNET_MD_FLAG_GPU BIT(5) /**< Special mapping needs */
struct lnet_test_peer {
/* info about peers we are trying to fail */
@@ -467,6 +467,8 @@ struct lnet_md {
#define LNET_MD_TRACK_RESPONSE (1 << 10)
/** See struct lnet_md::options. */
#define LNET_MD_NO_TRACK_RESPONSE (1 << 11)
+/** Special page mapping handling */
+#define LNET_MD_GPU_ADDR (1 << 13)
/** Infinite threshold on MD operations. See lnet_md::threshold */
#define LNET_MD_THRESH_INF (-1)
@@ -401,8 +401,9 @@ struct kib_tx { /* transmit message */
struct kib_tx_pool *tx_pool; /* pool I'm from */
struct kib_conn *tx_conn; /* owning conn */
short tx_sending; /* # tx callbacks outstanding */
- short tx_queued; /* queued for sending */
- short tx_waiting; /* waiting for peer_ni */
+ unsigned long tx_queued:1, /* queued for sending */
+ tx_waiting:1, /* waiting for peer_ni */
+ tx_gpu:1; /* force DMA */
int tx_status; /* LNET completion status */
enum lnet_msg_hstatus tx_hstatus; /* health status of the transmit */
ktime_t tx_deadline; /* completion deadline */
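tx_queued and tx_waiting were shorts holding boolean state; the patch turns them into one-bit fields and adds tx_gpu alongside them. One consequence, visible in the later o2iblnd_cb.c hunks, is the !!(...) normalization used when a masked flag word is stored into such a field: assigning the raw mask would keep only its lowest bit. A small stand-alone demonstration (mock types, illustrative flag value):

/* Sketch: why "!!(md_flags & LNET_MD_FLAG_GPU)" is needed when storing
 * into a one-bit field.  The raw masked value would be truncated. */
#include <stdio.h>

#define MOCK_FLAG_GPU	(1 << 5)	/* stands in for LNET_MD_FLAG_GPU */

struct mock_tx {
	unsigned long	queued:1,
			waiting:1,
			gpu:1;		/* one-bit flag, like tx_gpu */
};

int main(void)
{
	unsigned int md_flags = MOCK_FLAG_GPU;
	struct mock_tx a = { 0 }, b = { 0 };

	a.gpu = md_flags & MOCK_FLAG_GPU;	/* 0x20 truncated to 1 bit -> 0 */
	b.gpu = !!(md_flags & MOCK_FLAG_GPU);	/* normalized -> 1 */

	printf("raw=%u normalized=%u\n",
	       (unsigned int)a.gpu, (unsigned int)b.gpu);
	return 0;
}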
@@ -861,17 +862,23 @@ static inline void kiblnd_dma_unmap_single(struct ib_device *dev,
#define KIBLND_UNMAP_ADDR_SET(p, m, a) do {} while (0)
#define KIBLND_UNMAP_ADDR(p, m, a) (a)
-static inline int kiblnd_dma_map_sg(struct kib_hca_dev *hdev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
+static inline
+int kiblnd_dma_map_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
+ struct scatterlist *sg = tx->tx_frags;
+ int nents = tx->tx_nfrags;
+ enum dma_data_direction direction = tx->tx_dmadir;
+
return ib_dma_map_sg(hdev->ibh_ibdev, sg, nents, direction);
}
-static inline void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev,
- struct scatterlist *sg, int nents,
- enum dma_data_direction direction)
+static inline
+void kiblnd_dma_unmap_sg(struct kib_hca_dev *hdev, struct kib_tx *tx)
{
+ struct scatterlist *sg = tx->tx_frags;
+ int nents = tx->tx_nfrags;
+ enum dma_data_direction direction = tx->tx_dmadir;
+
ib_dma_unmap_sg(hdev->ibh_ibdev, sg, nents, direction);
}
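The scatterlist mapping helpers now take the whole struct kib_tx rather than (sg, nents, direction), presumably so they can consult per-transmit state such as tx_gpu when deciding how to map the buffer; as shown, the in-tree helpers still call ib_dma_map_sg()/ib_dma_unmap_sg() unconditionally. A sketch of the wrapper pattern with mock types and mock mapping functions (the GPU branch below is an assumption for illustration, not code from this patch):

/* Sketch: once the helper receives the whole transmit descriptor it can
 * branch on per-tx state when choosing how to map the scatterlist. */
#include <stdio.h>

enum dma_dir { TO_DEVICE, FROM_DEVICE };

struct mock_tx {
	void		*frags;		/* stands in for tx_frags */
	int		nfrags;		/* stands in for tx_nfrags */
	enum dma_dir	dmadir;		/* stands in for tx_dmadir */
	unsigned long	gpu:1;		/* stands in for tx_gpu */
};

static int map_host(void *sg, int nents, enum dma_dir dir)
{
	printf("host mapping: %d frags at %p, dir=%d\n", nents, sg, (int)dir);
	return nents;
}

static int map_gpu(void *sg, int nents, enum dma_dir dir)
{
	printf("GPU peer mapping: %d frags at %p, dir=%d\n", nents, sg, (int)dir);
	return nents;
}

/* analogue of kiblnd_dma_map_sg(hdev, tx): unpack the tx fields itself */
static int mock_dma_map_sg(struct mock_tx *tx)
{
	if (tx->gpu)
		return map_gpu(tx->frags, tx->nfrags, tx->dmadir);
	return map_host(tx->frags, tx->nfrags, tx->dmadir);
}

int main(void)
{
	struct mock_tx tx = { .nfrags = 4, .dmadir = TO_DEVICE, .gpu = 1 };

	return mock_dma_map_sg(&tx) == 4 ? 0 : 1;
}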
@@ -623,8 +623,7 @@ static void kiblnd_unmap_tx(struct kib_tx *tx)
kiblnd_fmr_pool_unmap(&tx->tx_fmr, tx->tx_status);
if (tx->tx_nfrags) {
- kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev,
- tx->tx_frags, tx->tx_nfrags, tx->tx_dmadir);
+ kiblnd_dma_unmap_sg(tx->tx_pool->tpo_hdev, tx);
tx->tx_nfrags = 0;
}
}
@@ -644,9 +643,7 @@ static int kiblnd_map_tx(struct lnet_ni *ni, struct kib_tx *tx,
tx->tx_dmadir = (rd != tx->tx_rd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
tx->tx_nfrags = nfrags;
- rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx->tx_frags,
- tx->tx_nfrags, tx->tx_dmadir);
-
+ rd->rd_nfrags = kiblnd_dma_map_sg(hdev, tx);
for (i = 0, nob = 0; i < rd->rd_nfrags; i++) {
rd->rd_frags[i].rf_nob = kiblnd_sg_dma_len(
hdev->ibh_ibdev, &tx->tx_frags[i]);
@@ -1076,7 +1073,8 @@ static int kiblnd_init_rdma(struct kib_conn *conn, struct kib_tx *tx, int type,
int prev = dstidx;
if (srcidx >= srcrd->rd_nfrags) {
- CERROR("Src buffer exhausted: %d frags\n", srcidx);
+ CERROR("Src buffer exhausted: %d frags %px\n",
+ srcidx, tx);
rc = -EPROTO;
break;
}
@@ -1540,10 +1538,12 @@ static int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
struct bio_vec *payload_kiov = lntmsg->msg_kiov;
unsigned int payload_offset = lntmsg->msg_offset;
unsigned int payload_nob = lntmsg->msg_len;
+ struct lnet_libmd *msg_md = lntmsg->msg_md;
struct iov_iter from;
struct kib_msg *ibmsg;
struct kib_rdma_desc *rd;
struct kib_tx *tx;
+ bool gpu;
int nob;
int rc;
@@ -1571,6 +1571,7 @@ static int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
return -ENOMEM;
}
ibmsg = tx->tx_msg;
+ gpu = msg_md ? (msg_md->md_flags & LNET_MD_FLAG_GPU) : false;
switch (type) {
default:
@@ -1586,11 +1587,13 @@ static int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
break; /* send IMMEDIATE */
/* is the REPLY message too small for RDMA? */
- nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[lntmsg->msg_md->md_length]);
- if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
+ nob = offsetof(struct kib_msg,
+ ibm_u.immediate.ibim_payload[msg_md->md_length]);
+ if (nob <= IBLND_MSG_SIZE && !gpu)
break; /* send IMMEDIATE */
rd = &ibmsg->ibm_u.get.ibgm_rd;
+ tx->tx_gpu = gpu;
rc = kiblnd_setup_rd_kiov(ni, tx, rd,
payload_niov, payload_kiov,
payload_offset, payload_nob);
@@ -1626,9 +1629,11 @@ static int kiblnd_send(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg)
case LNET_MSG_PUT:
/* Is the payload small enough not to need RDMA? */
nob = offsetof(struct kib_msg, ibm_u.immediate.ibim_payload[payload_nob]);
- if (nob <= IBLND_MSG_SIZE && !lntmsg->msg_rdma_force)
+ if (nob <= IBLND_MSG_SIZE && !gpu)
break; /* send IMMEDIATE */
+ tx->tx_gpu = gpu;
+
rc = kiblnd_setup_rd_kiov(ni, tx, tx->tx_rd,
payload_niov, payload_kiov,
payload_offset, payload_nob);
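In both the GET and PUT paths the old msg_rdma_force test is replaced by the gpu flag derived from the MD: a payload that would fit in an IMMEDIATE message is still sent by RDMA when it lives in GPU memory, presumably because the immediate path copies the payload through the CPU. A sketch of that decision with mock constants (IBLND_MSG_SIZE is 4 KiB in the real driver; the header overhead below is illustrative):

/* Sketch of the IMMEDIATE-vs-RDMA decision with the GPU flag folded in. */
#include <stdbool.h>
#include <stdio.h>

#define MOCK_IBLND_MSG_SIZE	4096
#define MOCK_MSG_HDR_OVERHEAD	128	/* illustrative offsetof(...ibim_payload[nob]) */

static bool send_immediate(unsigned int payload_nob, bool gpu)
{
	unsigned int nob = MOCK_MSG_HDR_OVERHEAD + payload_nob;

	/* small host payloads are inlined; GPU payloads always use RDMA */
	return nob <= MOCK_IBLND_MSG_SIZE && !gpu;
}

int main(void)
{
	printf("1KiB host: %s\n", send_immediate(1024, false) ? "IMMEDIATE" : "RDMA");
	printf("1KiB GPU : %s\n", send_immediate(1024, true)  ? "IMMEDIATE" : "RDMA");
	printf("1MiB host: %s\n", send_immediate(1 << 20, false) ? "IMMEDIATE" : "RDMA");
	return 0;
}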
@@ -1712,6 +1717,7 @@ static void kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
struct bio_vec *kiov = lntmsg->msg_kiov;
unsigned int offset = lntmsg->msg_offset;
unsigned int nob = lntmsg->msg_len;
+ struct lnet_libmd *payload_md = lntmsg->msg_md;
struct kib_tx *tx;
int rc;
@@ -1722,6 +1728,7 @@ static void kiblnd_reply(struct lnet_ni *ni, struct kib_rx *rx, struct lnet_msg *lntmsg)
goto failed_0;
}
+ tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
if (!nob)
rc = 0;
else
@@ -1784,7 +1791,7 @@ static int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
struct kib_tx *tx;
int nob;
int post_credit = IBLND_POSTRX_PEER_CREDIT;
- u64 ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
+ u64 ibprm_cookie;
int rc = 0;
LASSERT(iov_iter_count(to) <= rlen);
@@ -1819,6 +1826,9 @@ static int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
case IBLND_MSG_PUT_REQ: {
struct kib_msg *txmsg;
struct kib_rdma_desc *rd;
+ struct lnet_libmd *payload_md = lntmsg ? lntmsg->msg_md : NULL;
+
+ ibprm_cookie = rxmsg->ibm_u.putreq.ibprm_cookie;
if (!iov_iter_count(to)) {
lnet_finalize(lntmsg, 0);
@@ -1836,6 +1846,7 @@ static int kiblnd_recv(struct lnet_ni *ni, void *private, struct lnet_msg *lntmsg,
break;
}
+ tx->tx_gpu = !!(payload_md->md_flags & LNET_MD_FLAG_GPU);
txmsg = tx->tx_msg;
rd = &txmsg->ibm_u.putack.ibpam_rd;
rc = kiblnd_setup_rd_kiov(ni, tx, rd,
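The ibprm_cookie read moves from the declarations into the IBLND_MSG_PUT_REQ case because ibm_u is a union: rxmsg->ibm_u.putreq only holds a valid cookie for PUT_REQ messages, so reading it unconditionally inspected storage that was filled in through a different member. A small sketch of the pattern (mock message layout, not the real kib_msg):

/* Sketch: read a union member only where the message type says it is valid. */
#include <stdint.h>
#include <stdio.h>

enum mock_type { MOCK_PUT_REQ, MOCK_IMMEDIATE };

struct mock_msg {
	enum mock_type type;
	union {
		struct { uint64_t cookie; }	putreq;
		struct { char payload[64]; }	immediate;
	} u;
};

static void handle(const struct mock_msg *msg)
{
	uint64_t cookie;	/* declared up front, read only when valid */

	switch (msg->type) {
	case MOCK_PUT_REQ:
		cookie = msg->u.putreq.cookie;	/* safe: sender filled putreq */
		printf("PUT_REQ cookie=%llu\n", (unsigned long long)cookie);
		break;
	case MOCK_IMMEDIATE:
		printf("IMMEDIATE payload[0]=%d\n", msg->u.immediate.payload[0]);
		break;
	}
}

int main(void)
{
	struct mock_msg m = { .type = MOCK_PUT_REQ, .u.putreq.cookie = 42 };

	handle(&m);
	return 0;
}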
@@ -192,6 +192,9 @@ struct page *
lmd->md_flags = (unlink == LNET_UNLINK) ? LNET_MD_FLAG_AUTO_UNLINK : 0;
lmd->md_bulk_handle = umd->bulk_handle;
+ if (umd->options & LNET_MD_GPU_ADDR)
+ lmd->md_flags |= LNET_MD_FLAG_GPU;
+
if (umd->options & LNET_MD_KIOV) {
niov = umd->length;
lmd->md_niov = umd->length;
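This hunk, in lnet_md_build(), is the single point where the user-visible option bit (LNET_MD_GPU_ADDR in umd->options) becomes the internal flag (LNET_MD_FLAG_GPU in lmd->md_flags) that the LNDs and the NI selection code test; the two constants live in different bit namespaces, so their numeric values need not line up. A minimal sketch of that translation with mock types (flag values mirror the defines above):

/* Sketch: translate the MD option namespace into the libmd flag namespace. */
#include <stdio.h>

#define MOCK_LNET_MD_GPU_ADDR	(1 << 13)	/* umd->options namespace */
#define MOCK_LNET_MD_FLAG_GPU	(1 << 5)	/* lmd->md_flags namespace */

struct mock_umd   { unsigned int options; };
struct mock_libmd { unsigned int md_flags; };

static void md_build(const struct mock_umd *umd, struct mock_libmd *lmd)
{
	if (umd->options & MOCK_LNET_MD_GPU_ADDR)
		lmd->md_flags |= MOCK_LNET_MD_FLAG_GPU;
}

int main(void)
{
	struct mock_umd umd = { .options = MOCK_LNET_MD_GPU_ADDR };
	struct mock_libmd lmd = { 0 };

	md_build(&umd, &lmd);
	printf("md_flags=0x%x gpu=%d\n", lmd.md_flags,
	       !!(lmd.md_flags & MOCK_LNET_MD_FLAG_GPU));
	return 0;
}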
@@ -1450,11 +1450,13 @@ static struct lnet_ni *lnet_get_best_ni(struct lnet_net *local_net,
u32 best_sel_prio;
unsigned int best_dev_prio;
unsigned int dev_idx = UINT_MAX;
- struct page *page = lnet_get_first_page(md, offset);
+ bool gpu = md ? (md->md_flags & LNET_MD_FLAG_GPU) : false;
+
+ if (gpu) {
+ struct page *page = lnet_get_first_page(md, offset);
- msg->msg_rdma_force = lnet_is_rdma_only_page(page);
- if (msg->msg_rdma_force)
dev_idx = lnet_get_dev_idx(page);
+ }
/* If there is no peer_ni that we can send to on this network,
* then there is no point in looking for a new best_ni here.
@@ -1505,7 +1507,7 @@ static struct lnet_ni *lnet_get_best_ni(struct lnet_net *local_net,
* All distances smaller than the NUMA range
* are treated equally.
*/
- if (distance < lnet_numa_range)
+ if (!gpu && distance < lnet_numa_range)
distance = lnet_numa_range;
/* * Select on health, selection policy, direct dma prio,
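For host I/O every interface within lnet_numa_range is treated as equally close; with a GPU MD the flattening is skipped, so the raw distance (together with the device index taken from the first page of the MD) can presumably steer the message to the NI nearest the GPU's memory. A stand-alone sketch of the distance comparison (mock values; the dev_idx-based device-priority check is not shown here):

/* Sketch of the distance flattening above: host I/O treats every NI within
 * lnet_numa_range as equally close, GPU I/O keeps the raw distance so the
 * nearer NI can win. */
#include <stdbool.h>
#include <stdio.h>

static unsigned int effective_distance(unsigned int distance,
				       unsigned int numa_range, bool gpu)
{
	if (!gpu && distance < numa_range)
		distance = numa_range;	/* flatten: "close enough" for host I/O */
	return distance;
}

int main(void)
{
	unsigned int numa_range = 4;

	/* two candidate NIs at NUMA distance 1 and 3 */
	printf("host: %u vs %u\n",
	       effective_distance(1, numa_range, false),
	       effective_distance(3, numa_range, false));	/* 4 vs 4: tie */
	printf("gpu : %u vs %u\n",
	       effective_distance(1, numa_range, true),
	       effective_distance(3, numa_range, true));	/* 1 vs 3: nearer NI wins */
	return 0;
}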