
[RFC,rdma-next,08/10] RDMA/rxe: Implement flush execution in responder side

Message ID 20211228080717.10666-9-lizhijian@cn.fujitsu.com (mailing list archive)
State Superseded
Series RDMA/rxe: Add RDMA FLUSH operation

Commit Message

Li Zhijian Dec. 28, 2021, 8:07 a.m. UTC
In contrast to other opcodes, the FLUSH opcode performs a Placement Type
check after the usual series of sanity checks and before it actually
executes the flush. The responder replies with a NAK "Remote Access Error"
if it finds a placement type violation.

Data is persisted via arch_wb_cache_pmem(), which is architecture-specific.

After execution, the responder reports success by replying with an RDMA
READ response of zero size.

Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
---
 drivers/infiniband/sw/rxe/rxe_hdr.h  |  28 ++++++
 drivers/infiniband/sw/rxe/rxe_loc.h  |   2 +
 drivers/infiniband/sw/rxe/rxe_mr.c   |   4 +-
 drivers/infiniband/sw/rxe/rxe_resp.c | 131 ++++++++++++++++++++++++++-
 include/uapi/rdma/ib_user_verbs.h    |  10 ++
 5 files changed, 169 insertions(+), 6 deletions(-)

Comments

Tom Talpey Dec. 30, 2021, 10:18 p.m. UTC | #1
On 12/28/2021 3:07 AM, Li Zhijian wrote:
> In contrast to other opcodes, after a series of sanity checking, FLUSH
> opcode will do a Placement Type checking before it really do the FLUSH
> operation. Responder will also reply NAK "Remote Access Error" if it
> found a placement type violation.
> 
> We will persist data via arch_wb_cache_pmem(), which could be
> architecture specific.
> 
> After the execution, responder would reply a responded successfully by
> RDMA READ response of zero size.
> 
> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
> ---
>   drivers/infiniband/sw/rxe/rxe_hdr.h  |  28 ++++++
>   drivers/infiniband/sw/rxe/rxe_loc.h  |   2 +
>   drivers/infiniband/sw/rxe/rxe_mr.c   |   4 +-
>   drivers/infiniband/sw/rxe/rxe_resp.c | 131 ++++++++++++++++++++++++++-
>   include/uapi/rdma/ib_user_verbs.h    |  10 ++
>   5 files changed, 169 insertions(+), 6 deletions(-)
> 

<snip>

> +static int nvdimm_flush_iova(struct rxe_mr *mr, u64 iova, int length)
> +{
> +	int			err;
> +	int			bytes;
> +	u8			*va;
> +	struct rxe_map		**map;
> +	struct rxe_phys_buf	*buf;
> +	int			m;
> +	int			i;
> +	size_t			offset;
> +
> +	if (length == 0)
> +		return 0;

The length is only relevant when the flush type is "Memory Region
Range".

When the flush type is "Memory Region", the entire region must be
flushed successfully before completing the operation.

> +
> +	if (mr->type == IB_MR_TYPE_DMA) {
> +		arch_wb_cache_pmem((void *)iova, length);
> +		return 0;
> +	}

Are dmamr's supported for remote access? I thought that was
prevented on first principles now. I might suggest not allowing
them to be flushed in any event. There is no length restriction,
and it's a VERY costly operation. At a minimum, protect this
closely.
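
If they remain reachable at all, a minimal guard might look like this (a
sketch only; the -EFAULT return is illustrative, not a decision):

	/* sketch: refuse to service a wire FLUSH against a DMA MR */
	if (mr->type == IB_MR_TYPE_DMA)
		return -EFAULT;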

> +
> +	WARN_ON_ONCE(!mr->cur_map_set);

The WARN is rather pointless because the code will crash just
seven lines below.

> +
> +	err = mr_check_range(mr, iova, length);
> +	if (err) {
> +		err = -EFAULT;
> +		goto err1;
> +	}
> +
> +	lookup_iova(mr, iova, &m, &i, &offset);
> +
> +	map = mr->cur_map_set->map + m;
> +	buf	= map[0]->buf + i;
> +
> +	while (length > 0) {
> +		va	= (u8 *)(uintptr_t)buf->addr + offset;
> +		bytes	= buf->size - offset;
> +
> +		if (bytes > length)
> +			bytes = length;
> +
> +		arch_wb_cache_pmem(va, bytes);
> +
> +		length	-= bytes;
> +
> +		offset	= 0;
> +		buf++;
> +		i++;
> +
> +		if (i == RXE_BUF_PER_MAP) {
> +			i = 0;
> +			map++;
> +			buf = map[0]->buf;
> +		}
> +	}
> +
> +	return 0;
> +
> +err1:
> +	return err;
> +}
> +
> +static enum resp_states process_flush(struct rxe_qp *qp,
> +				       struct rxe_pkt_info *pkt)
> +{
> +	u64 length = 0, start = qp->resp.va;
> +	u32 sel = feth_sel(pkt);
> +	u32 plt = feth_plt(pkt);
> +	struct rxe_mr *mr = qp->resp.mr;
> +
> +	if (sel == IB_EXT_SEL_MR_RANGE)
> +		length = qp->resp.length;
> +	else if (sel == IB_EXT_SEL_MR_WHOLE)
> +		length = mr->cur_map_set->length;

I'm going to have to think about these
> +
> +	if (plt == IB_EXT_PLT_PERSIST) {
> +		nvdimm_flush_iova(mr, start, length);
> +		wmb(); // clwb follows by a sfence
> +	} else if (plt == IB_EXT_PLT_GLB_VIS)
> +		wmb(); // sfence is enough

The persistence and global visibility bits are not mutually
exclusive, and in fact persistence does not imply global
visibility in some platforms. They must be tested and
processed individually.

	if (plt & IB_EXT_PLT_PERSIST)
		...
	else if (plt & IB_EXT_PLT_GLB_VIS)
		..

Second, the "clwb" and "sfence" comments are completely
Intel-specific. What processing will be done on other
processor platforms???
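
In other words, something along these lines (a sketch only, sticking to the
generic helpers the patch already uses; whether wmb() alone is a sufficient
global-visibility guarantee on every architecture is exactly the open
question):

	if (plt & IB_EXT_PLT_PERSIST) {
		/* write the range back toward the persistence domain */
		nvdimm_flush_iova(mr, start, length);
		wmb();
	}
	if (plt & IB_EXT_PLT_GLB_VIS)
		wmb();	/* order prior placements before responding */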

Tom.
Zhijian Li (Fujitsu) Dec. 31, 2021, 1:37 a.m. UTC | #2
On 31/12/2021 06:18, Tom Talpey wrote:
> On 12/28/2021 3:07 AM, Li Zhijian wrote:
>> In contrast to other opcodes, after a series of sanity checking, FLUSH
>> opcode will do a Placement Type checking before it really do the FLUSH
>> operation. Responder will also reply NAK "Remote Access Error" if it
>> found a placement type violation.
>>
>> We will persist data via arch_wb_cache_pmem(), which could be
>> architecture specific.
>>
>> After the execution, responder would reply a responded successfully by
>> RDMA READ response of zero size.
>>
>> Signed-off-by: Li Zhijian <lizhijian@cn.fujitsu.com>
>> ---
>>   drivers/infiniband/sw/rxe/rxe_hdr.h  |  28 ++++++
>>   drivers/infiniband/sw/rxe/rxe_loc.h  |   2 +
>>   drivers/infiniband/sw/rxe/rxe_mr.c   |   4 +-
>>   drivers/infiniband/sw/rxe/rxe_resp.c | 131 ++++++++++++++++++++++++++-
>>   include/uapi/rdma/ib_user_verbs.h    |  10 ++
>>   5 files changed, 169 insertions(+), 6 deletions(-)
>>
>
> <snip>
>
>> +static int nvdimm_flush_iova(struct rxe_mr *mr, u64 iova, int length)
>> +{
>> +    int            err;
>> +    int            bytes;
>> +    u8            *va;
>> +    struct rxe_map        **map;
>> +    struct rxe_phys_buf    *buf;
>> +    int            m;
>> +    int            i;
>> +    size_t            offset;
>> +
>> +    if (length == 0)
>> +        return 0;
>
> The length is only relevant when the flush type is "Memory Region
> Range".
>
> When the flush type is "Memory Region", the entire region must be
> flushed successfully before completing the operation.

Yes, currently the length is expanded to the MR's length in that case.


>
>> +
>> +    if (mr->type == IB_MR_TYPE_DMA) {
>> +        arch_wb_cache_pmem((void *)iova, length);
>> +        return 0;
>> +    }
>
> Are dmamr's supported for remote access? I thought that was
> prevented on first principles now. I might suggest not allowing
> them to be flushed in any event. There is no length restriction,
> and it's a VERY costly operation. At a minimum, protect this
> closely.
Indeed, I wasn't confident about this; the main logic comes from rxe_mr_copy().
Thanks for the suggestion.



>
>> +
>> +    WARN_ON_ONCE(!mr->cur_map_set);
>
> The WARN is rather pointless because the code will crash just
> seven lines below.
>
>> +
>> +    err = mr_check_range(mr, iova, length);
>> +    if (err) {
>> +        err = -EFAULT;
>> +        goto err1;
>> +    }
>> +
>> +    lookup_iova(mr, iova, &m, &i, &offset);
>> +
>> +    map = mr->cur_map_set->map + m;
>> +    buf    = map[0]->buf + i;
>> +
>> +    while (length > 0) {
>> +        va    = (u8 *)(uintptr_t)buf->addr + offset;
>> +        bytes    = buf->size - offset;
>> +
>> +        if (bytes > length)
>> +            bytes = length;
>> +
>> +        arch_wb_cache_pmem(va, bytes);
>> +
>> +        length    -= bytes;
>> +
>> +        offset    = 0;
>> +        buf++;
>> +        i++;
>> +
>> +        if (i == RXE_BUF_PER_MAP) {
>> +            i = 0;
>> +            map++;
>> +            buf = map[0]->buf;
>> +        }
>> +    }
>> +
>> +    return 0;
>> +
>> +err1:
>> +    return err;
>> +}
>> +
>> +static enum resp_states process_flush(struct rxe_qp *qp,
>> +                       struct rxe_pkt_info *pkt)
>> +{
>> +    u64 length = 0, start = qp->resp.va;
>> +    u32 sel = feth_sel(pkt);
>> +    u32 plt = feth_plt(pkt);
>> +    struct rxe_mr *mr = qp->resp.mr;
>> +
>> +    if (sel == IB_EXT_SEL_MR_RANGE)
>> +        length = qp->resp.length;
>> +    else if (sel == IB_EXT_SEL_MR_WHOLE)
>> +        length = mr->cur_map_set->length;
>
> I'm going to have to think about these

Yes, you've made me realize that we should also consider adjusting the start iova to the MR's start.


>> +
>> +    if (plt == IB_EXT_PLT_PERSIST) {
>> +        nvdimm_flush_iova(mr, start, length);
>> +        wmb(); // clwb follows by a sfence
>> +    } else if (plt == IB_EXT_PLT_GLB_VIS)
>> +        wmb(); // sfence is enough
>
> The persistence and global visibility bits are not mutually
> exclusive,
My bad, it never crossed my mind. o(╯□╰)o




> and in fact persistence does not imply global
> visibility in some platforms. 
If so, and per the SPEC, why not
    if (plt & IB_EXT_PLT_PERSIST)
       do_somethingA();
    if (plt & IB_EXT_PLT_GLB_VIS)
       do_somethingB();



> They must be tested and
> processed individually.
>
>     if (plt & IB_EXT_PLT_PERSIST)
>         ...
>     else if (plt & IB_EXT_PLT_GLB_VIS)
>         ..
>
> Second, the "clwb" and "sfence" comments are completely
> Intel-specific. 
good catch.


> What processing will be done on other
> processor platforms???

I haven't dug into other architectures yet, only Intel.
In this function, I think calling the higher-level wrappers, like wmb() and
arch_wb_cache_pmem(), should be enough, right?

Again, thank you.

Thanks.




>
> Tom.
Tom Talpey Dec. 31, 2021, 2:32 a.m. UTC | #3
On 12/30/2021 8:37 PM, lizhijian@fujitsu.com wrote
>>> +static int nvdimm_flush_iova(struct rxe_mr *mr, u64 iova, int length)
>>> +{
>>> +    int            err;
>>> +    int            bytes;
>>> +    u8            *va;
>>> +    struct rxe_map        **map;
>>> +    struct rxe_phys_buf    *buf;
>>> +    int            m;
>>> +    int            i;
>>> +    size_t            offset;
>>> +
>>> +    if (length == 0)
>>> +        return 0;
>>
>> The length is only relevant when the flush type is "Memory Region
>> Range".
>>
>> When the flush type is "Memory Region", the entire region must be
>> flushed successfully before completing the operation.
> 
> Yes, currently, the length has been expanded to the MR's length in such case.

I'll dig deeper, but this is not immediately clear in this context.
A zero-length operation is, however, valid; even though it's a no-op, the
application may use it to ensure certain ordering constraints.
Will comment later if I have a specific concern.

>>> +
>>> +    if (mr->type == IB_MR_TYPE_DMA) {
>>> +        arch_wb_cache_pmem((void *)iova, length);
>>> +        return 0;
>>> +    }
>>
>> Are dmamr's supported for remote access? I thought that was
>> prevented on first principles now. I might suggest not allowing
>> them to be flushed in any event. There is no length restriction,
>> and it's a VERY costly operation. At a minimum, protect this
>> closely.
> Indeed, I didn't confidence about this, the main logic comes from rxe_mr_copy()
> Thanks for the suggestion.

Well, be careful when following semantics from local behaviors. When you
are processing a command from the wire, extreme caution is needed.
ESPECIALLY WHEN IT'S A DMA_MR, WHICH PROVIDES ACCESS TO ALL PHYSICAL!!!

Sorry for shouting. :)

>>> +
>>> +    WARN_ON_ONCE(!mr->cur_map_set);
>>
>> The WARN is rather pointless because the code will crash just
>> seven lines below.
>>
>>> +
>>> +    err = mr_check_range(mr, iova, length);
>>> +    if (err) {
>>> +        err = -EFAULT;
>>> +        goto err1;
>>> +    }
>>> +
>>> +    lookup_iova(mr, iova, &m, &i, &offset);
>>> +
>>> +    map = mr->cur_map_set->map + m;
>>> +    buf    = map[0]->buf + i;
>>> +
>>> +    while (length > 0) {
>>> +        va    = (u8 *)(uintptr_t)buf->addr + offset;
>>> +        bytes    = buf->size - offset;
>>> +
>>> +        if (bytes > length)
>>> +            bytes = length;
>>> +
>>> +        arch_wb_cache_pmem(va, bytes);
>>> +
>>> +        length    -= bytes;
>>> +
>>> +        offset    = 0;
>>> +        buf++;
>>> +        i++;
>>> +
>>> +        if (i == RXE_BUF_PER_MAP) {
>>> +            i = 0;
>>> +            map++;
>>> +            buf = map[0]->buf;
>>> +        }
>>> +    }
>>> +
>>> +    return 0;
>>> +
>>> +err1:
>>> +    return err;
>>> +}
>>> +
>>> +static enum resp_states process_flush(struct rxe_qp *qp,
>>> +                       struct rxe_pkt_info *pkt)
>>> +{
>>> +    u64 length = 0, start = qp->resp.va;
>>> +    u32 sel = feth_sel(pkt);
>>> +    u32 plt = feth_plt(pkt);
>>> +    struct rxe_mr *mr = qp->resp.mr;
>>> +
>>> +    if (sel == IB_EXT_SEL_MR_RANGE)
>>> +        length = qp->resp.length;
>>> +    else if (sel == IB_EXT_SEL_MR_WHOLE)
>>> +        length = mr->cur_map_set->length;
>>
>> I'm going to have to think about these
> 
> Yes, you inspire me that we should consider to adjust the start of iova to the MR's start as well.

I'll review again when you decide.
>>> +
>>> +    if (plt == IB_EXT_PLT_PERSIST) {
>>> +        nvdimm_flush_iova(mr, start, length);
>>> +        wmb(); // clwb follows by a sfence
>>> +    } else if (plt == IB_EXT_PLT_GLB_VIS)
>>> +        wmb(); // sfence is enough
>>
>> The persistence and global visibility bits are not mutually
>> exclusive,
> My bad, it ever appeared in my mind. o(╯□╰)o
> 
> 
> 
> 
>> and in fact persistence does not imply global
>> visibility in some platforms.
> If so, and per the SPEC, why not
>      if (plt & IB_EXT_PLT_PERSIST)
>         do_somethingA();
>      if (plt & IB_EXT_PLT_GLB_VIS)
>         do_somethingB();

At the simplest, yes. But there are many subtle other possibilities.

The global visibility is oriented toward supporting distributed
shared memory workloads, or publish/subscribe on high scale systems.
It's mainly about ensuring that all devices and CPUs see the data,
something ordinary RDMA Write does not guarantee. This often means
flushing write pipelines, possibly followed by invalidating caches.

The persistence, of course, is about ensuring the data is stored. This
is entirely different from making it visible.

In fact, you often want to optimize the two scenarios together, since
these operations are all about optimizing latency. The choice is going
to depend greatly on the behavior of the platform itself. For example,
certain caches provide persistence when batteries are present. So,
providing persistence and providing visibility are one and the same.
No need to do that twice.

On the other hand, some systems may provide persistence only after a
significant cost, such as writing into flash from a DRAM buffer, or
when an Optane DIMM becomes overloaded from continuous writes and
begins to throttle them. The flush may need some rather intricate
processing, to avoid deadlock.

Your code does not appear to envision these, so the simple way might
be best for now. But you should consider other situations.

>> Second, the "clwb" and "sfence" comments are completely
>> Intel-specific.
> good catch.
> 
> 
>> What processing will be done on other
>> processor platforms???
> 
> I didn't dig other ARCH yet but INTEL.
> In this function, i think i just need to call the higher level wrapper, like wmb() and
> arch_wb_cache_pmem are enough, right ?

Well, higher level wrappers may signal errors, for example if they're
not available or unreliable, and you will need to handle them. Also,
they may block. Is that ok in this context?

You need to get this right on all platforms which will run this code.
It is not acceptable to guess at whether the data is in the required
state before completing the RDMA_FLUSH. If this is not guaranteed,
the operation must raise the required error. To do anything else will
violate the protocol contract, and the remote application may fail.

Tom.
Zhijian Li (Fujitsu) Jan. 4, 2022, 8:51 a.m. UTC | #4
On 31/12/2021 10:32, Tom Talpey wrote:
>
> On 12/30/2021 8:37 PM, lizhijian@fujitsu.com wrote
>>>> +static int nvdimm_flush_iova(struct rxe_mr *mr, u64 iova, int length)
>>>> +{
>>>> +    int            err;
>>>> +    int            bytes;
>>>> +    u8            *va;
>>>> +    struct rxe_map        **map;
>>>> +    struct rxe_phys_buf    *buf;
>>>> +    int            m;
>>>> +    int            i;
>>>> +    size_t            offset;
>>>> +
>>>> +    if (length == 0)
>>>> +        return 0;
>>>
>>> The length is only relevant when the flush type is "Memory Region
>>> Range".
>>>
>>> When the flush type is "Memory Region", the entire region must be
>>> flushed successfully before completing the operation.
>>
>> Yes, currently, the length has been expanded to the MR's length in such case.
>
> I'll dig deeper, but this is not immediately clear in this context.
> A zero-length operation is however valid, even though it's a no-op,

If it's a no-op, what shall we do in this case?


> the application may use it to ensure certain ordering constraints.
> Will comment later if I have a specific concern.

That would be kindly welcome :)




>
>>>> +
>>>> +    if (mr->type == IB_MR_TYPE_DMA) {
>>>> +        arch_wb_cache_pmem((void *)iova, length);
>>>> +        return 0;
>>>> +    }
>>>
>>> Are dmamr's supported for remote access? I thought that was
>>> prevented on first principles now. I might suggest not allowing
>>> them to be flushed in any event. There is no length restriction,
>>> and it's a VERY costly operation. At a minimum, protect this
>>> closely.
>> Indeed, I didn't confidence about this, the main logic comes from rxe_mr_copy()
>> Thanks for the suggestion.
>
> Well, be careful when following semantics from local behaviors. When you
> are processing a command from the wire, extreme caution is needed.
> ESPECIALLY WHEN IT'S A DMA_MR, WHICH PROVIDES ACCESS TO ALL PHYSICAL!!!
>
> Sorry for shouting. :)

Never mind :)





>
>>>> +
>>>> +static enum resp_states process_flush(struct rxe_qp *qp,
>>>> +                       struct rxe_pkt_info *pkt)
>>>> +{
>>>> +    u64 length = 0, start = qp->resp.va;
>>>> +    u32 sel = feth_sel(pkt);
>>>> +    u32 plt = feth_plt(pkt);
>>>> +    struct rxe_mr *mr = qp->resp.mr;
>>>> +
>>>> +    if (sel == IB_EXT_SEL_MR_RANGE)
>>>> +        length = qp->resp.length;
>>>> +    else if (sel == IB_EXT_SEL_MR_WHOLE)
>>>> +        length = mr->cur_map_set->length;
>>>
>>> I'm going to have to think about these
>>
>> Yes, you inspire me that we should consider to adjust the start of iova to the MR's start as well.
>
> I'll review again when you decide.
>>>> +
>>>> +    if (plt == IB_EXT_PLT_PERSIST) {
>>>> +        nvdimm_flush_iova(mr, start, length);
>>>> +        wmb(); // clwb follows by a sfence
>>>> +    } else if (plt == IB_EXT_PLT_GLB_VIS)
>>>> +        wmb(); // sfence is enough
>>>
>>> The persistence and global visibility bits are not mutually
>>> exclusive,
>> My bad, it ever appeared in my mind. o(╯□╰)o
>>
>>
>>
>>
>>> and in fact persistence does not imply global
>>> visibility in some platforms.
>> If so, and per the SPEC, why not
>>      if (plt & IB_EXT_PLT_PERSIST)
>>         do_somethingA();
>>      if (plt & IB_EXT_PLT_GLB_VIS)
>>         do_somethingB();
>
> At the simplest, yes. But there are many subtle other possibilities.
>
> The global visibility is oriented toward supporting distributed
> shared memory workloads, or publish/subscribe on high scale systems.
> It's mainly about ensuring that all devices and CPUs see the data,
> something ordinary RDMA Write does not guarantee. This often means
> flushing write pipelines, possibly followed by invalidating caches.
>
> The persistence, of course, is about ensuring the data is stored. This
> is entirely different from making it visible.
>
> In fact, you often want to optimize the two scenarios together, since
> these operations are all about optimizing latency. The choice is going
> to depend greatly on the behavior of the platform itself. For example,
> certain caches provide persistence when batteries are present. So,
> providing persistence and providing visibility are one and the same.
> No need to do that twice.
>
> On the other hand, some systems may provide persistence only after a
> significant cost, such as writing into flash from a DRAM buffer, or
> when an Optane DIMM becomes overloaded from continuous writes and
> begins to throttle them. The flush may need some rather intricate
> processing, to avoid deadlock.
>
> Your code does not appear to envision these, so the simple way might
> be best for now. But you should consider other situations.

Thanks a lot for your detailed explanation.



>
>>> Second, the "clwb" and "sfence" comments are completely
>>> Intel-specific.
>> good catch.
>>
>>
>>> What processing will be done on other
>>> processor platforms???
>>
>> I didn't dig other ARCH yet but INTEL.
>> In this function, i think i just need to call the higher level wrapper, like wmb() and
>> arch_wb_cache_pmem are enough, right ?
>
> Well, higher level wrappers may signal errors, for example if they're
> not available or unreliable, and you will need to handle them. Also,
> they may block. Is that ok in this context?

Good question.

My bad, I previously forgot to check the return value of nvdimm_flush_iova().

But if you mean we should guarantee arch_wb_cache_pmem() and wmb() themselves,
I have no idea yet.

arch_wb_cache_pmem() and wmb(), which I'm currently using, just hide the
assembly instructions on different architectures, and they return void.

I wonder if we can always assume they work once the code reaches them, since
all current local flushing to pmem does something like that AFAIK (feel free
to correct me if I'm wrong).
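
For the return value itself, the minimum I have in mind is something like
this (just a sketch; RESPST_ERR_RKEY_VIOLATION is only a placeholder until
the proper NAK for a failed flush is decided):

	int err = nvdimm_flush_iova(mr, start, length);

	if (err)
		return RESPST_ERR_RKEY_VIOLATION; /* placeholder NAK */
	wmb();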

Thanks
Zhijian

>
> You need to get this right on all platforms which will run this code.
> It is not acceptable to guess at whether the data is in the required
> state before completing the RDMA_FLUSH. If this is not guaranteed,
> the operation must raise the required error. To do anything else will
> violate the protocol contract, and the remote application may fail.


>
> Tom.
Tom Talpey Jan. 4, 2022, 4:02 p.m. UTC | #5
On 1/4/2022 3:51 AM, lizhijian@fujitsu.com wrote:
> 
> 
> On 31/12/2021 10:32, Tom Talpey wrote:
>>
>> On 12/30/2021 8:37 PM, lizhijian@fujitsu.com wrote
>>>>> +static int nvdimm_flush_iova(struct rxe_mr *mr, u64 iova, int length)
>>>>> +{
>>>>> +    int            err;
>>>>> +    int            bytes;
>>>>> +    u8            *va;
>>>>> +    struct rxe_map        **map;
>>>>> +    struct rxe_phys_buf    *buf;
>>>>> +    int            m;
>>>>> +    int            i;
>>>>> +    size_t            offset;
>>>>> +
>>>>> +    if (length == 0)
>>>>> +        return 0;
>>>>
>>>> The length is only relevant when the flush type is "Memory Region
>>>> Range".
>>>>
>>>> When the flush type is "Memory Region", the entire region must be
>>>> flushed successfully before completing the operation.
>>>
>>> Yes, currently, the length has been expanded to the MR's length in such case.
>>
>> I'll dig deeper, but this is not immediately clear in this context.
>> A zero-length operation is however valid, even though it's a no-op,
> 
> If it's no-op, what shall we do in this case.

It's a no-op only in the case that the flush has the MRR (range)
flag set. When the Region flag is set, the length is ignored. As
you point out, the length is determined in process_flush().

I think the confusion arises because the routine is being used
for both cases, and the zero-length case is only valid for one.
To me, it would make more sense to make this test before calling
it. Something like:

+static enum resp_states process_flush(struct rxe_qp *qp,
+				       struct rxe_pkt_info *pkt)
+{
+	u64 length = 0, start = qp->resp.va;
+	u32 sel = feth_sel(pkt);
+	u32 plt = feth_plt(pkt);
+	struct rxe_mr *mr = qp->resp.mr;
+
+	if (sel == IB_EXT_SEL_MR_RANGE)
+		length = qp->resp.length;
+	else if (sel == IB_EXT_SEL_MR_WHOLE)
	{
+		length = mr->cur_map_set->length;
		WARN_ON(length == 0);
	}

	if (length > 0) {
		// there may be optimizations possible here...
		if (plt & IB_EXT_PLT_PERSIST)
			(flush to persistence)
		if (plt & IB_EXT_PLT_GLB_VIS)
			(flush to global visibility)
	}

+	/* set RDMA READ response of zero */
+	qp->resp.resid = 0;
+
+	return RESPST_READ_REPLY;
+}

Tom.

> 
> 
>> the application may use it to ensure certain ordering constraints.
>> Will comment later if I have a specific concern.
> 
> kindly welcome :)
> 
> 
> 
> 
>>
>>>>> +
>>>>> +    if (mr->type == IB_MR_TYPE_DMA) {
>>>>> +        arch_wb_cache_pmem((void *)iova, length);
>>>>> +        return 0;
>>>>> +    }
>>>>
>>>> Are dmamr's supported for remote access? I thought that was
>>>> prevented on first principles now. I might suggest not allowing
>>>> them to be flushed in any event. There is no length restriction,
>>>> and it's a VERY costly operation. At a minimum, protect this
>>>> closely.
>>> Indeed, I didn't confidence about this, the main logic comes from rxe_mr_copy()
>>> Thanks for the suggestion.
>>
>> Well, be careful when following semantics from local behaviors. When you
>> are processing a command from the wire, extreme caution is needed.
>> ESPECIALLY WHEN IT'S A DMA_MR, WHICH PROVIDES ACCESS TO ALL PHYSICAL!!!
>>
>> Sorry for shouting. :)
> 
> Never mind :)
> 
> 
> 
> 
> 
>>
>>>>> +
>>>>> +static enum resp_states process_flush(struct rxe_qp *qp,
>>>>> +                       struct rxe_pkt_info *pkt)
>>>>> +{
>>>>> +    u64 length = 0, start = qp->resp.va;
>>>>> +    u32 sel = feth_sel(pkt);
>>>>> +    u32 plt = feth_plt(pkt);
>>>>> +    struct rxe_mr *mr = qp->resp.mr;
>>>>> +
>>>>> +    if (sel == IB_EXT_SEL_MR_RANGE)
>>>>> +        length = qp->resp.length;
>>>>> +    else if (sel == IB_EXT_SEL_MR_WHOLE)
>>>>> +        length = mr->cur_map_set->length;
>>>>
>>>> I'm going to have to think about these
>>>
>>> Yes, you inspire me that we should consider to adjust the start of iova to the MR's start as well.
>>
>> I'll review again when you decide.
>>>>> +
>>>>> +    if (plt == IB_EXT_PLT_PERSIST) {
>>>>> +        nvdimm_flush_iova(mr, start, length);
>>>>> +        wmb(); // clwb follows by a sfence
>>>>> +    } else if (plt == IB_EXT_PLT_GLB_VIS)
>>>>> +        wmb(); // sfence is enough
>>>>
>>>> The persistence and global visibility bits are not mutually
>>>> exclusive,
>>> My bad, it ever appeared in my mind. o(╯□╰)o
>>>
>>>
>>>
>>>
>>>> and in fact persistence does not imply global
>>>> visibility in some platforms.
>>> If so, and per the SPEC, why not
>>>       if (plt & IB_EXT_PLT_PERSIST)
>>>          do_somethingA();
>>>       if (plt & IB_EXT_PLT_GLB_VIS)
>>>          do_somethingB();
>>
>> At the simplest, yes. But there are many subtle other possibilities.
>>
>> The global visibility is oriented toward supporting distributed
>> shared memory workloads, or publish/subscribe on high scale systems.
>> It's mainly about ensuring that all devices and CPUs see the data,
>> something ordinary RDMA Write does not guarantee. This often means
>> flushing write pipelines, possibly followed by invalidating caches.
>>
>> The persistence, of course, is about ensuring the data is stored. This
>> is entirely different from making it visible.
>>
>> In fact, you often want to optimize the two scenarios together, since
>> these operations are all about optimizing latency. The choice is going
>> to depend greatly on the behavior of the platform itself. For example,
>> certain caches provide persistence when batteries are present. So,
>> providing persistence and providing visibility are one and the same.
>> No need to do that twice.
>>
>> On the other hand, some systems may provide persistence only after a
>> significant cost, such as writing into flash from a DRAM buffer, or
>> when an Optane DIMM becomes overloaded from continuous writes and
>> begins to throttle them. The flush may need some rather intricate
>> processing, to avoid deadlock.
>>
>> Your code does not appear to envision these, so the simple way might
>> be best for now. But you should consider other situations.
> 
> Thanks a lot for your detailed explanation.
> 
> 
> 
>>
>>>> Second, the "clwb" and "sfence" comments are completely
>>>> Intel-specific.
>>> good catch.
>>>
>>>
>>>> What processing will be done on other
>>>> processor platforms???
>>>
>>> I didn't dig other ARCH yet but INTEL.
>>> In this function, i think i just need to call the higher level wrapper, like wmb() and
>>> arch_wb_cache_pmem are enough, right ?
>>
>> Well, higher level wrappers may signal errors, for example if they're
>> not available or unreliable, and you will need to handle them. Also,
>> they may block. Is that ok in this context?
> 
> Good question.
> 
> My bad, i forgot to check to return value of nvdimm_flush_iova() previously.
> 
> But if you mean we should guarantee arch_wb_cache_pmem() and wmb(), i have not
> idea yet.
> 
> arch_wb_cache_pmem() and wmb(), What i'm currently using are just to hide
> the assembly instructions on different architectures. And they are void return.
> 
> I wonder if we can always assume they work always When the code reaches them.
> Since all current local flushing to pmem do something like that AFAIK(Feel free
> to correct me if i'm wrong)
> 
> Thanks
> Zhijian
> 
>>
>> You need to get this right on all platforms which will run this code.
>> It is not acceptable to guess at whether the data is in the required
>> state before completing the RDMA_FLUSH. If this is not guaranteed,
>> the operation must raise the required error. To do anything else will
>> violate the protocol contract, and the remote application may fail.
> 
> 
>>
>> Tom.
Tom Talpey Jan. 4, 2022, 4:40 p.m. UTC | #6
On 12/28/2021 3:07 AM, Li Zhijian wrote:

> +static enum resp_states process_flush(struct rxe_qp *qp,
> +				       struct rxe_pkt_info *pkt)
> +{
> +	u64 length = 0, start = qp->resp.va;
> +	u32 sel = feth_sel(pkt);
> +	u32 plt = feth_plt(pkt);
> +	struct rxe_mr *mr = qp->resp.mr;
> +
> +	if (sel == IB_EXT_SEL_MR_RANGE)
> +		length = qp->resp.length;
> +	else if (sel == IB_EXT_SEL_MR_WHOLE)
> +		length = mr->cur_map_set->length;

I noticed another issue in this. When the "Memory Region" flag is
set, the RETH:VA field in the request is ignored, in addition to
the RETH:DMALEN. Therefore, both the start and length must be set
from the map.

Is the mr->cur_map_set[0]->addr field a virtual address, or a
physical? I can't find the cur_map_set in any patch. The virtual
(mapped) address will be needed, to pass to arch_wb_cache_pmem().

Something like this?

	if (sel == IB_EXT_SEL_MR_WHOLE) {
		length = mr->cur_map_set->length;
		start = mr->cur_map_set[0]->addr;
	}

(in addition to my other review suggestions about 0-length, etc...)

Tom.
Zhijian Li (Fujitsu) Jan. 5, 2022, 1:43 a.m. UTC | #7
On 05/01/2022 00:40, Tom Talpey wrote:
>
> On 12/28/2021 3:07 AM, Li Zhijian wrote:
>
>> +static enum resp_states process_flush(struct rxe_qp *qp,
>> +                       struct rxe_pkt_info *pkt)
>> +{
>> +    u64 length = 0, start = qp->resp.va;
>> +    u32 sel = feth_sel(pkt);
>> +    u32 plt = feth_plt(pkt);
>> +    struct rxe_mr *mr = qp->resp.mr;
>> +
>> +    if (sel == IB_EXT_SEL_MR_RANGE)
>> +        length = qp->resp.length;
>> +    else if (sel == IB_EXT_SEL_MR_WHOLE)
>> +        length = mr->cur_map_set->length;
>
> I noticed another issue in this. When the "Memory Region" flag is
> set, the RETH:VA field in the request is ignored, in addition to
> the RETH:DMALEN. Therefore, both the start and length must be set
> from the map.
Yes, that's true

Actually, I drafted a new version last week when you first pointed this out:
https://github.com/zhijianli88/linux/commit/a368d821b163f74836d527638625d414c7a62d00#diff-1b675c68d2f5f664abefea05cbf415f6307c9920b31a13bed41105c4a84e0259R641
The latest code looks like:
     if (sel == IB_EXT_SEL_MR_RANGE) {
         start = qp->resp.va;
         length = qp->resp.length;
     } else { /* sel == IB_EXT_SEL_MR_WHOLE */
         start = mr->cur_map_set->iova;
         length = mr->cur_map_set->length;
     }


>
> Is the mr->cur_map_set[0]->addr field a virtual address, or a
> physical? 
mr->cur_map_set[0]->addr is a virtual address in kernel space.

nvdimm_flush_iova() accepts a user-space address (alloc()'d or mmap()'d by
applications); this address is translated to a kernel virtual address by
lookup_iova(), which refers to mr->cur_map_set[n]->addr.

So I think we should use qp->resp.va and mr->cur_map_set->iova, which are
both user-space addresses.



> I can't find the cur_map_set in any patch. 
cur_map_set will be filled in rxe_mr_init_user(),
rxe_mr_init_user()
  -> ib_umem_get()
    -> pin_user_pages_fast()
  -> update cur_map_set


BTW: some of your comments are already addressed in
https://github.com/zhijianli88/linux/tree/rdma-flush
which may be force-updated irregularly.


Thanks
Zhijian

> The virtual
> (mapped) address will be needed, to pass to arch_wb_cache_pmem().
>
> Something like this?
>
>     if (sel == IB_EXT_SEL_MR_WHOLE) {
>         length = mr->cur_map_set->length;
>         start = mr->cur_map_set[0]->addr;
>     }
>
> (in addition to my other review suggestions about 0-length, etc...)
>
> Tom.
Jason Gunthorpe Jan. 6, 2022, 12:28 a.m. UTC | #8
On Tue, Dec 28, 2021 at 04:07:15PM +0800, Li Zhijian wrote:
> +	while (length > 0) {
> +		va	= (u8 *)(uintptr_t)buf->addr + offset;
> +		bytes	= buf->size - offset;
> +
> +		if (bytes > length)
> +			bytes = length;
> +
> +		arch_wb_cache_pmem(va, bytes);

So why did we need to check that the va was pmem to call this?

Jason
Jason Gunthorpe Jan. 6, 2022, 12:35 a.m. UTC | #9
On Thu, Dec 30, 2021 at 09:32:06PM -0500, Tom Talpey wrote:

> The global visibility is oriented toward supporting distributed
> shared memory workloads, or publish/subscribe on high scale systems.
> It's mainly about ensuring that all devices and CPUs see the data,
> something ordinary RDMA Write does not guarantee. This often means
> flushing write pipelines, possibly followed by invalidating caches.

Isn't that what that new ATOMIC_WRITE does? Why do I need to flush if
ATOMIC WRITE was specified as a release? All I need to do is acquire
on the ATOMIC_WRITE site and I'm good?

So what does FLUSH do here, and how does a CPU 'acquire' against this
kind of flush? The flush doesn't imply any ordering right, so how is
it useful on its own?

The write to persistance aspect I understand, but this notion of
global viability seems peculiar.

> Well, higher level wrappers may signal errors, for example if they're
> not available or unreliable, and you will need to handle them. Also,
> they may block. Is that ok in this context?

I'm not sure we have any other tools here beyond a release barrier
like wmb() or the special pmem cache flush. Neither are blocking or
can fail.

In pmem systems storage failure is handled via the memory failure
stuff asynchronously.

Jason
Zhijian Li (Fujitsu) Jan. 6, 2022, 6:42 a.m. UTC | #10
On 06/01/2022 08:28, Jason Gunthorpe wrote:
> On Tue, Dec 28, 2021 at 04:07:15PM +0800, Li Zhijian wrote:
>> +	while (length > 0) {
>> +		va	= (u8 *)(uintptr_t)buf->addr + offset;
>> +		bytes	= buf->size - offset;
>> +
>> +		if (bytes > length)
>> +			bytes = length;
>> +
>> +		arch_wb_cache_pmem(va, bytes);
> So why did we need to check that the va was pmem to call this?
Sorry, I didn't get you.

I don't check whether the va is pmem, since only an MR registered with
PERSISTENCE (only pmem can register this access flag) can reach here.

Thanks
Zhijian


>
> Jason
Jason Gunthorpe Jan. 6, 2022, 5:33 p.m. UTC | #11
On Thu, Jan 06, 2022 at 06:42:57AM +0000, lizhijian@fujitsu.com wrote:
> 
> 
> On 06/01/2022 08:28, Jason Gunthorpe wrote:
> > On Tue, Dec 28, 2021 at 04:07:15PM +0800, Li Zhijian wrote:
> >> +	while (length > 0) {
> >> +		va	= (u8 *)(uintptr_t)buf->addr + offset;
> >> +		bytes	= buf->size - offset;
> >> +
> >> +		if (bytes > length)
> >> +			bytes = length;
> >> +
> >> +		arch_wb_cache_pmem(va, bytes);
> > So why did we need to check that the va was pmem to call this?
> Sorry, i didn't get you.
> 
> I didn't check whether va is pmem, since only MR registered with PERSISTENCE(only pmem can
> register this access flag) can reach here.

Yes, that is what I mean, why did we need to check anything to call
this API - it should work on any CPU mapped address.

Jason
Zhijian Li (Fujitsu) Jan. 10, 2022, 5:45 a.m. UTC | #12
Hi Jason


On 07/01/2022 01:33, Jason Gunthorpe wrote:
> On Thu, Jan 06, 2022 at 06:42:57AM +0000, lizhijian@fujitsu.com wrote:
>>
>> On 06/01/2022 08:28, Jason Gunthorpe wrote:
>>> On Tue, Dec 28, 2021 at 04:07:15PM +0800, Li Zhijian wrote:
>>>> +	while (length > 0) {
>>>> +		va	= (u8 *)(uintptr_t)buf->addr + offset;
>>>> +		bytes	= buf->size - offset;
>>>> +
>>>> +		if (bytes > length)
>>>> +			bytes = length;
>>>> +
>>>> +		arch_wb_cache_pmem(va, bytes);
>>> So why did we need to check that the va was pmem to call this?
>> Sorry, i didn't get you.
>>
>> I didn't check whether va is pmem, since only MR registered with PERSISTENCE(only pmem can
>> register this access flag) can reach here.
> Yes, that is what I mean,

I'm not sure I understand the *check* you mentioned above.

The current code just does something like:

if (!sanity_check())
     return;

if (requested_plt == PERSISTENCE) {
     va = iova_to_va(iova);
     arch_wb_cache_pmem(va, bytes);
     wmb();
} else if (requested_plt == GLOBAL_VISIBILITY) {
     wmb();
}


> why did we need to check anything to call
> this API
As in the pseudo code above, it doesn't *check* anything in the sense you describe, I think.


> - it should work on any CPU mapped address.
Of course, arch_wb_cache_pmem(va, bytes) works on a CPU-mapped address backed by either a
DIMM or an NVDIMM, but not on an iova that may refer to a user-space address.


Thanks
Zhijian



>
> Jason
Jason Gunthorpe Jan. 10, 2022, 2:34 p.m. UTC | #13
On Mon, Jan 10, 2022 at 05:45:47AM +0000, lizhijian@fujitsu.com wrote:
> Hi Jason
> 
> 
> On 07/01/2022 01:33, Jason Gunthorpe wrote:
> > On Thu, Jan 06, 2022 at 06:42:57AM +0000, lizhijian@fujitsu.com wrote:
> >>
> >> On 06/01/2022 08:28, Jason Gunthorpe wrote:
> >>> On Tue, Dec 28, 2021 at 04:07:15PM +0800, Li Zhijian wrote:
> >>>> +	while (length > 0) {
> >>>> +		va	= (u8 *)(uintptr_t)buf->addr + offset;
> >>>> +		bytes	= buf->size - offset;
> >>>> +
> >>>> +		if (bytes > length)
> >>>> +			bytes = length;
> >>>> +
> >>>> +		arch_wb_cache_pmem(va, bytes);
> >>> So why did we need to check that the va was pmem to call this?
> >> Sorry, i didn't get you.
> >>
> >> I didn't check whether va is pmem, since only MR registered with PERSISTENCE(only pmem can
> >> register this access flag) can reach here.
> > Yes, that is what I mean,
> 
> I'm not sure I understand the *check* you mentioned above.
> 
> Current code just dose something like:
> 
> if (!sanity_check())
>      return;
> if (requested_plt == PERSISTENCE)
>      va = iova_to_va(iova);
>      arch_wb_cache_pmem(va, bytes);
>      wmb;
> else if (requested_plt == GLOBAL_VISIBILITY)
>      wmb();
> 
> 
> > why did we need to check anything to call
> > this API
> As above pseudo code,  it didn't *check* anything as what you said i think.

I mean that when you created the MR in the first place you checked for pmem
before even allowing the persistent access flag.

Jason
Zhijian Li (Fujitsu) Jan. 11, 2022, 5:34 a.m. UTC | #14
On 10/01/2022 22:34, Jason Gunthorpe wrote:
> On Mon, Jan 10, 2022 at 05:45:47AM +0000, lizhijian@fujitsu.com wrote:
>> Hi Jason
>>
>>
>> On 07/01/2022 01:33, Jason Gunthorpe wrote:
>>> On Thu, Jan 06, 2022 at 06:42:57AM +0000, lizhijian@fujitsu.com wrote:
>>>> On 06/01/2022 08:28, Jason Gunthorpe wrote:
>>>>> On Tue, Dec 28, 2021 at 04:07:15PM +0800, Li Zhijian wrote:
>>>>>> +	while (length > 0) {
>>>>>> +		va	= (u8 *)(uintptr_t)buf->addr + offset;
>>>>>> +		bytes	= buf->size - offset;
>>>>>> +
>>>>>> +		if (bytes > length)
>>>>>> +			bytes = length;
>>>>>> +
>>>>>> +		arch_wb_cache_pmem(va, bytes);
>>>>> So why did we need to check that the va was pmem to call this?
>>>> Sorry, i didn't get you.
>>>>
>>>> I didn't check whether va is pmem, since only MR registered with PERSISTENCE(only pmem can
>>>> register this access flag) can reach here.
>>> Yes, that is what I mean,
>> I'm not sure I understand the *check* you mentioned above.
>>
>> Current code just dose something like:
>>
>> if (!sanity_check())
>>       return;
>> if (requested_plt == PERSISTENCE)
>>       va = iova_to_va(iova);
>>       arch_wb_cache_pmem(va, bytes);
>>       wmb;
>> else if (requested_plt == GLOBAL_VISIBILITY)
>>       wmb();
>>
>>
>>> why did we need to check anything to call
>>> this API
>> As above pseudo code,  it didn't *check* anything as what you said i think.
> I mean when you created the MR in the first place you checked for pmem
> before even allowing the persitent access flag.

Yes, that's true; only pmem has the ability to persist data.
So do you mean we don't need to prevent users from registering a persistent
access flag on a non-pmem MR? That would be a bit confusing.


>
> Jason
Jason Gunthorpe Jan. 11, 2022, 8:48 p.m. UTC | #15
On Tue, Jan 11, 2022 at 05:34:36AM +0000, lizhijian@fujitsu.com wrote:

> Yes, that's true. that's because only pmem has ability to persist data.
> So do you mean we don't need to prevent user to create/register a persistent
> access flag to a non-pmem MR? it would be a bit confusing if so.

Since these extensions seem to have a mode that is unrelated to
persistent memory, I'm not sure it makes sense to link the two things.

Jason
Zhijian Li (Fujitsu) Jan. 12, 2022, 9:50 a.m. UTC | #16
On 12/01/2022 04:48, Jason Gunthorpe wrote:
> On Tue, Jan 11, 2022 at 05:34:36AM +0000, lizhijian@fujitsu.com wrote:
>
>> Yes, that's true. that's because only pmem has ability to persist data.
>> So do you mean we don't need to prevent user to create/register a persistent
>> access flag to a non-pmem MR? it would be a bit confusing if so.
> Since these extensions seem to have a mode that is unrelated to
> persistent memory,
I can only agree with part of that, since the extensions also say:

oA19-1: Responder shall successfully respond on FLUSH operation only
after providing the placement guarantees, as specified in the packet, of
preceding memory updates (for example: RDMA WRITE, Atomics and
ATOMIC WRITE) towards the memory region.

It says the responder *shall successfully respond on FLUSH operation only
after providing the placement guarantees*. If users request persistent
placement against a non-pmem MR and get no error, then from their point of
view the request was *successfully responded* to, which doesn't reflect the
truth (the data was not persisted).

So I think we should return an error to the requester in such a case.


Furthermore, if we add a check when the new MR is created/registered, the
user who registers the MR can know whether it supports the persistent access
flag. They can then convey this to the requesting side so that it can request
a valid placement type later; this is similar to the current librpma behavior.
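
The registration-time check I have in mind is roughly this (just a sketch;
`access` stands for the requested access flags, and the exact place in
rxe_mr_init_user() as well as the error code are still open):

	/* sketch: reject the persistent flush flag on a non-pmem MR */
	if ((access & IB_ACCESS_FLUSH_PERSISTENT) && !mr->ibmr.is_pmem)
		return -EOPNOTSUPP;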


Thanks
Zhijian

>   I'm not sure it makes sense to link the two things.
>
> Jason
Jason Gunthorpe Jan. 12, 2022, 1:12 p.m. UTC | #17
On Wed, Jan 12, 2022 at 09:50:38AM +0000, lizhijian@fujitsu.com wrote:
> 
> 
> On 12/01/2022 04:48, Jason Gunthorpe wrote:
> > On Tue, Jan 11, 2022 at 05:34:36AM +0000, lizhijian@fujitsu.com wrote:
> >
> >> Yes, that's true. that's because only pmem has ability to persist data.
> >> So do you mean we don't need to prevent user to create/register a persistent
> >> access flag to a non-pmem MR? it would be a bit confusing if so.
> > Since these extensions seem to have a mode that is unrelated to
> > persistent memory,
> I can only agree with part of them, since the extensions also say that:
> 
> oA19-1: Responder shall successfully respond on FLUSH operation only
> after providing the placement guarantees, as specified in the packet, of
> preceding memory updates (for example: RDMA WRITE, Atomics and
> ATOMIC WRITE) towards the memory region.
> 
> it mentions *shall successfully respond on FLUSH operation only
> after providing the placement guarantees*. If users request a
> persistent placement to a non-pmem MR without errors,  from view
> of the users, they will think of their request had been *successfully responded*
> that doesn't reflect the true(data was persisted).

The "placement guarantees" should obviously vary depending on the type of
memory being targeted.

> Further more, If we have a checking during the new MR creating/registering,
> user who registers this MR can know if the target MR supports persistent access flag.
> Then they can tell this information to the request side so that request side can
> request a valid placement type later. that is similar behavior with current librpma.

Then you can't use ATOMIC_WRITE with non-nvdimm memory, which is
nonsense

Jason
Zhijian Li (Fujitsu) Jan. 13, 2022, 6:29 a.m. UTC | #18
On 12/01/2022 21:12, Jason Gunthorpe wrote:
> On Wed, Jan 12, 2022 at 09:50:38AM +0000, lizhijian@fujitsu.com wrote:
>>
>> On 12/01/2022 04:48, Jason Gunthorpe wrote:
>>> On Tue, Jan 11, 2022 at 05:34:36AM +0000, lizhijian@fujitsu.com wrote:
>>>
>>>> Yes, that's true. that's because only pmem has ability to persist data.
>>>> So do you mean we don't need to prevent user to create/register a persistent
>>>> access flag to a non-pmem MR? it would be a bit confusing if so.
>>> Since these extensions seem to have a mode that is unrelated to
>>> persistent memory,
>> I can only agree with part of them, since the extensions also say that:
>>
>> oA19-1: Responder shall successfully respond on FLUSH operation only
>> after providing the placement guarantees, as specified in the packet, of
>> preceding memory updates (for example: RDMA WRITE, Atomics and
>> ATOMIC WRITE) towards the memory region.
>>
>> it mentions *shall successfully respond on FLUSH operation only
>> after providing the placement guarantees*. If users request a
>> persistent placement to a non-pmem MR without errors,  from view
>> of the users, they will think of their request had been *successfully responded*
>> that doesn't reflect the true(data was persisted).
> The "placement guarentees" should obviously be variable depending on
> the type of memory being targeted.

> Since these extensions seem to have a mode that is unrelated to
> persistent memory,


Although the spec doesn't link the extensions to persistent memory directly,
*Persistence* does tie the indicated memory region to pmem. See below:

A19.2 GLOSSARY

Global Visibility
   Ensuring the placement of the preceding data accesses in the indicated
   memory region is visible for reading by the responder platform.
Persistence
   Ensuring the placement of the preceding data accesses in the indicated
   memory region is persistent and the data will be preserved across power
   cycle or other failure of the responder platform.

Thanks
Zhijian


>
>> Further more, If we have a checking during the new MR creating/registering,
>> user who registers this MR can know if the target MR supports persistent access flag.
>> Then they can tell this information to the request side so that request side can
>> request a valid placement type later. that is similar behavior with current librpma.
> Then you can't use ATOMIC_WRITE with non-nvdimm memory, which is
> nonsense
>
> Jason

Patch

diff --git a/drivers/infiniband/sw/rxe/rxe_hdr.h b/drivers/infiniband/sw/rxe/rxe_hdr.h
index e37aa1944b18..cdfd393b8bd8 100644
--- a/drivers/infiniband/sw/rxe/rxe_hdr.h
+++ b/drivers/infiniband/sw/rxe/rxe_hdr.h
@@ -630,6 +630,34 @@  static inline void feth_init(struct rxe_pkt_info *pkt, u32 type, u32 level)
 	*p = cpu_to_be32(feth);
 }
 
+static inline u32 __feth_plt(void *arg)
+{
+	u32 *fethp = arg;
+	u32 feth = be32_to_cpu(*fethp);
+
+	return (feth & FETH_PLT_MASK) >> FETH_PLT_SHIFT;
+}
+
+static inline u32 __feth_sel(void *arg)
+{
+	u32 *fethp = arg;
+	u32 feth = be32_to_cpu(*fethp);
+
+	return (feth & FETH_SEL_MASK) >> FETH_SEL_SHIFT;
+}
+
+static inline u32 feth_plt(struct rxe_pkt_info *pkt)
+{
+	return __feth_plt(pkt->hdr +
+		rxe_opcode[pkt->opcode].offset[RXE_FETH]);
+}
+
+static inline u32 feth_sel(struct rxe_pkt_info *pkt)
+{
+	return __feth_sel(pkt->hdr +
+		rxe_opcode[pkt->opcode].offset[RXE_FETH]);
+}
+
 /******************************************************************************
  * Atomic Extended Transport Header
  ******************************************************************************/
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index b1e174afb1d4..73c39ff11e28 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -80,6 +80,8 @@  int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
 		enum rxe_mr_copy_dir dir);
 int copy_data(struct rxe_pd *pd, int access, struct rxe_dma_info *dma,
 	      void *addr, int length, enum rxe_mr_copy_dir dir);
+void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
+		 size_t *offset_out);
 void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length);
 struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
 			 enum rxe_mr_lookup_type type);
diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
index 21616d058f29..2cb530305392 100644
--- a/drivers/infiniband/sw/rxe/rxe_mr.c
+++ b/drivers/infiniband/sw/rxe/rxe_mr.c
@@ -325,8 +325,8 @@  int rxe_mr_init_fast(struct rxe_pd *pd, int max_pages, struct rxe_mr *mr)
 	return err;
 }
 
-static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
-			size_t *offset_out)
+void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
+		 size_t *offset_out)
 {
 	struct rxe_map_set *set = mr->cur_map_set;
 	size_t offset = iova - set->iova + set->offset;
diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c
index e8f435fa6e4d..6730336037d1 100644
--- a/drivers/infiniband/sw/rxe/rxe_resp.c
+++ b/drivers/infiniband/sw/rxe/rxe_resp.c
@@ -5,6 +5,7 @@ 
  */
 
 #include <linux/skbuff.h>
+#include <linux/libnvdimm.h>
 
 #include "rxe.h"
 #include "rxe_loc.h"
@@ -19,6 +20,7 @@  enum resp_states {
 	RESPST_CHK_RESOURCE,
 	RESPST_CHK_LENGTH,
 	RESPST_CHK_RKEY,
+	RESPST_CHK_PLT,
 	RESPST_EXECUTE,
 	RESPST_READ_REPLY,
 	RESPST_COMPLETE,
@@ -35,6 +37,7 @@  enum resp_states {
 	RESPST_ERR_TOO_MANY_RDMA_ATM_REQ,
 	RESPST_ERR_RNR,
 	RESPST_ERR_RKEY_VIOLATION,
+	RESPST_ERR_PLT_VIOLATION,
 	RESPST_ERR_INVALIDATE_RKEY,
 	RESPST_ERR_LENGTH,
 	RESPST_ERR_CQ_OVERFLOW,
@@ -53,6 +56,7 @@  static char *resp_state_name[] = {
 	[RESPST_CHK_RESOURCE]			= "CHK_RESOURCE",
 	[RESPST_CHK_LENGTH]			= "CHK_LENGTH",
 	[RESPST_CHK_RKEY]			= "CHK_RKEY",
+	[RESPST_CHK_PLT]			= "CHK_PLACEMENT_TYPE",
 	[RESPST_EXECUTE]			= "EXECUTE",
 	[RESPST_READ_REPLY]			= "READ_REPLY",
 	[RESPST_COMPLETE]			= "COMPLETE",
@@ -69,6 +73,7 @@  static char *resp_state_name[] = {
 	[RESPST_ERR_TOO_MANY_RDMA_ATM_REQ]	= "ERR_TOO_MANY_RDMA_ATM_REQ",
 	[RESPST_ERR_RNR]			= "ERR_RNR",
 	[RESPST_ERR_RKEY_VIOLATION]		= "ERR_RKEY_VIOLATION",
+	[RESPST_ERR_PLT_VIOLATION]		= "ERR_PLACEMENT_TYPE_VIOLATION",
 	[RESPST_ERR_INVALIDATE_RKEY]		= "ERR_INVALIDATE_RKEY_VIOLATION",
 	[RESPST_ERR_LENGTH]			= "ERR_LENGTH",
 	[RESPST_ERR_CQ_OVERFLOW]		= "ERR_CQ_OVERFLOW",
@@ -400,6 +405,30 @@  static enum resp_states check_length(struct rxe_qp *qp,
 	}
 }
 
+static enum resp_states check_placement_type(struct rxe_qp *qp,
+					     struct rxe_pkt_info *pkt)
+{
+	struct rxe_mr *mr = qp->resp.mr;
+	u32 plt = feth_plt(pkt);
+
+	// no FLUSH access flag MR
+	if ((mr->access & IB_ACCESS_FLUSH) == 0) {
+		pr_err("MR is not registered with any flush access permission\n");
+		return RESPST_ERR_PLT_VIOLATION;
+	}
+
+	if ((plt & IB_EXT_PLT_GLB_VIS &&
+	    !(mr->access & IB_ACCESS_FLUSH_GLOBAL_VISIBILITY)) ||
+	    (plt & IB_EXT_PLT_PERSIST &&
+	    !(mr->access & IB_ACCESS_FLUSH_PERSISTENT))) {
+		pr_err("Target MR doesn't allow this placement type, is_pmem: %d, register flag: %x, request flag: %x\n",
+		       mr->ibmr.is_pmem, mr->access >> 8, plt);
+		return RESPST_ERR_PLT_VIOLATION;
+	}
+
+	return RESPST_EXECUTE;
+}
+
 static enum resp_states check_rkey(struct rxe_qp *qp,
 				   struct rxe_pkt_info *pkt)
 {
@@ -413,7 +442,7 @@  static enum resp_states check_rkey(struct rxe_qp *qp,
 	enum resp_states state;
 	int access;
 
-	if (pkt->mask & RXE_READ_OR_WRITE_MASK) {
+	if (pkt->mask & (RXE_READ_OR_WRITE_MASK | RXE_FLUSH_MASK)) {
 		if (pkt->mask & RXE_RETH_MASK) {
 			qp->resp.va = reth_va(pkt);
 			qp->resp.offset = 0;
@@ -434,8 +463,10 @@  static enum resp_states check_rkey(struct rxe_qp *qp,
 	}
 
 	/* A zero-byte op is not required to set an addr or rkey. */
+	/* RXE_FETH_MASK carries a zero-byte payload */
 	if ((pkt->mask & RXE_READ_OR_WRITE_MASK) &&
 	    (pkt->mask & RXE_RETH_MASK) &&
+	    !(pkt->mask & RXE_FETH_MASK) &&
 	    reth_len(pkt) == 0) {
 		return RESPST_EXECUTE;
 	}
@@ -503,7 +534,7 @@  static enum resp_states check_rkey(struct rxe_qp *qp,
 	WARN_ON_ONCE(qp->resp.mr);
 
 	qp->resp.mr = mr;
-	return RESPST_EXECUTE;
+	return pkt->mask & RXE_FETH_MASK ? RESPST_CHK_PLT : RESPST_EXECUTE;
 
 err:
 	if (mr)
@@ -549,6 +580,91 @@  static enum resp_states write_data_in(struct rxe_qp *qp,
 	return rc;
 }
 
+static int nvdimm_flush_iova(struct rxe_mr *mr, u64 iova, int length)
+{
+	int			err;
+	int			bytes;
+	u8			*va;
+	struct rxe_map		**map;
+	struct rxe_phys_buf	*buf;
+	int			m;
+	int			i;
+	size_t			offset;
+
+	if (length == 0)
+		return 0;
+
+	if (mr->type == IB_MR_TYPE_DMA) {
+		arch_wb_cache_pmem((void *)iova, length);
+		return 0;
+	}
+
+	WARN_ON_ONCE(!mr->cur_map_set);
+
+	err = mr_check_range(mr, iova, length);
+	if (err) {
+		err = -EFAULT;
+		goto err1;
+	}
+
+	lookup_iova(mr, iova, &m, &i, &offset);
+
+	map = mr->cur_map_set->map + m;
+	buf	= map[0]->buf + i;
+
+	while (length > 0) {
+		va	= (u8 *)(uintptr_t)buf->addr + offset;
+		bytes	= buf->size - offset;
+
+		if (bytes > length)
+			bytes = length;
+
+		arch_wb_cache_pmem(va, bytes);
+
+		length	-= bytes;
+
+		offset	= 0;
+		buf++;
+		i++;
+
+		if (i == RXE_BUF_PER_MAP) {
+			i = 0;
+			map++;
+			buf = map[0]->buf;
+		}
+	}
+
+	return 0;
+
+err1:
+	return err;
+}
+
+static enum resp_states process_flush(struct rxe_qp *qp,
+				       struct rxe_pkt_info *pkt)
+{
+	u64 length = 0, start = qp->resp.va;
+	u32 sel = feth_sel(pkt);
+	u32 plt = feth_plt(pkt);
+	struct rxe_mr *mr = qp->resp.mr;
+
+	if (sel == IB_EXT_SEL_MR_RANGE)
+		length = qp->resp.length;
+	else if (sel == IB_EXT_SEL_MR_WHOLE)
+		length = mr->cur_map_set->length;
+
+	if (plt == IB_EXT_PLT_PERSIST) {
+		nvdimm_flush_iova(mr, start, length);
+		wmb(); // clwb follows by a sfence
+	} else if (plt == IB_EXT_PLT_GLB_VIS)
+		wmb(); // sfence is enough
+
+	/* set RDMA READ response of zero */
+	qp->resp.resid = 0;
+
+	return RESPST_READ_REPLY;
+}
+
 /* Guarantee atomicity of atomic operations at the machine level. */
 static DEFINE_SPINLOCK(atomic_ops_lock);
 
@@ -801,6 +917,8 @@  static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt)
 		err = process_atomic(qp, pkt);
 		if (err)
 			return err;
+	} else if (pkt->mask & RXE_FLUSH_MASK) {
+		return process_flush(qp, pkt);
 	} else {
 		/* Unreachable */
 		WARN_ON_ONCE(1);
@@ -1059,7 +1177,7 @@  static enum resp_states duplicate_request(struct rxe_qp *qp,
 		/* SEND. Ack again and cleanup. C9-105. */
 		send_ack(qp, pkt, AETH_ACK_UNLIMITED, prev_psn);
 		return RESPST_CLEANUP;
-	} else if (pkt->mask & RXE_READ_MASK) {
+	} else if (pkt->mask & RXE_READ_MASK || pkt->mask & RXE_FLUSH_MASK) {
 		struct resp_res *res;
 
 		res = find_resource(qp, pkt->psn);
@@ -1098,7 +1216,7 @@  static enum resp_states duplicate_request(struct rxe_qp *qp,
 			/* Reset the resource, except length. */
 			res->read.va_org = iova;
 			res->read.va = iova;
-			res->read.resid = resid;
+			res->read.resid = pkt->mask & RXE_FLUSH_MASK ? 0 : resid;
 
 			/* Replay the RDMA read reply. */
 			qp->resp.res = res;
@@ -1244,6 +1362,9 @@  int rxe_responder(void *arg)
 		case RESPST_CHK_RKEY:
 			state = check_rkey(qp, pkt);
 			break;
+		case RESPST_CHK_PLT:
+			state = check_placement_type(qp, pkt);
+			break;
 		case RESPST_EXECUTE:
 			state = execute(qp, pkt);
 			break;
@@ -1298,6 +1419,8 @@  int rxe_responder(void *arg)
 			break;
 
 		case RESPST_ERR_RKEY_VIOLATION:
+		/* oA19-13 8 */
+		case RESPST_ERR_PLT_VIOLATION:
 			if (qp_type(qp) == IB_QPT_RC) {
 				/* Class C */
 				do_class_ac_error(qp, AETH_NAK_REM_ACC_ERR,
diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h
index 4b7093f58259..efa06f53c7c6 100644
--- a/include/uapi/rdma/ib_user_verbs.h
+++ b/include/uapi/rdma/ib_user_verbs.h
@@ -105,6 +105,16 @@  enum {
 	IB_USER_VERBS_EX_CMD_MODIFY_CQ
 };
 
+enum ib_ext_placement_type {
+	IB_EXT_PLT_GLB_VIS = 1 << 0,
+	IB_EXT_PLT_PERSIST = 1 << 1,
+};
+
+enum ib_ext_selectivity_level {
+	IB_EXT_SEL_MR_RANGE = 0,
+	IB_EXT_SEL_MR_WHOLE,
+};
+
 /*
  * Make sure that all structs defined in this file remain laid out so
  * that they pack the same way on 32-bit and 64-bit architectures (to