
[rdma-core,1/3] verbs: Add mmio_wc_spinlock barrier

Message ID 20170313170003.GC25664@obsidianresearch.com (mailing list archive)
State Accepted

Commit Message

Jason Gunthorpe March 13, 2017, 5 p.m. UTC
On Mon, Mar 13, 2017 at 04:53:47PM +0200, Yishai Hadas wrote:
> From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> 
> For x86 the serialization within the spin lock is enough to
> strongly order WC and other memory types.
> 
> Add a new barrier named 'mmio_wc_spinlock' to optimize
> that.

Please use this patch with the commentary instead:


Comments

Yishai Hadas March 14, 2017, 12:06 p.m. UTC | #1
On 3/13/2017 7:00 PM, Jason Gunthorpe wrote:
> On Mon, Mar 13, 2017 at 04:53:47PM +0200, Yishai Hadas wrote:
>> From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
>>
>> For x86 the serialization within the spin lock is enough to
>> strongly order WC and other memory types.
>>
>> Add a new barrier named 'mmio_wc_spinlock' to optimize
>> that.
>
> Please use this patch with the commentary instead:

OK, the pull request was updated with the patch below.
https://github.com/linux-rdma/rdma-core/pull/95

> diff --git a/util/udma_barrier.h b/util/udma_barrier.h
> index 9e73148af8d5b6..cfe0459d7f6fff 100644
> --- a/util/udma_barrier.h
> +++ b/util/udma_barrier.h
> @@ -33,6 +33,8 @@
>  #ifndef __UTIL_UDMA_BARRIER_H
>  #define __UTIL_UDMA_BARRIER_H
>
> +#include <pthread.h>
> +
>  /* Barriers for DMA.
>
>     These barriers are explicitly only for use with user DMA operations. If you
> @@ -222,4 +224,37 @@
>  */
>  #define mmio_ordered_writes_hack() mmio_flush_writes()
>
> +/* Write Combining Spinlock primitive
> +
> +   Any access to a multi-value WC region must ensure that multiple CPUs do
> +   not write to the same values concurrently; these macros make that
> +   straightforward and efficient if the chosen exclusion is a spinlock.
> +
> +   The spinlock guarantees that the WC writes issued within the critical
> +   section are made visible as TLPs to the device. The TLPs must be seen by
> +   the device strictly in the order the spinlock is acquired, and combining
> +   WC writes between different critical sections is not permitted.
> +
> +   Use of these macros allows the fencing inside the spinlock to be combined
> +   with the fencing required for DMA.
> + */
> +static inline void mmio_wc_spinlock(pthread_spinlock_t *lock)
> +{
> +	pthread_spin_lock(lock);
> +#if !defined(__i386__) && !defined(__x86_64__)
> +	/* For x86 the serialization within the spin lock is enough to
> +	 * strongly order WC and other memory types. */
> +	mmio_wc_start();
> +#endif
> +}
> +
> +static inline void mmio_wc_spinunlock(pthread_spinlock_t *lock)
> +{
> +	/* It is possible that on x86 the atomic in the lock is strong enough
> +	 * to force-flush the WC buffers quickly, and this SFENCE can be
> +	 * omitted too. */
> +	mmio_flush_writes();
> +	pthread_spin_unlock(lock);
> +}
> +
>  #endif
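
The point of the pair is to fold the WC fencing into the lock operations at
provider call sites. A minimal before/after sketch of such a call site (the
context, lock, and doorbell names here are hypothetical, not from this
series):

	/* Before: explicit WC fences around the doorbell write */
	pthread_spin_lock(&ctx->db_lock);
	mmio_wc_start();
	*(volatile uint64_t *)ctx->db_reg = doorbell;  /* WC doorbell write */
	mmio_flush_writes();
	pthread_spin_unlock(&ctx->db_lock);

	/* After: the fences ride along with the lock. On x86 the leading
	 * fence is dropped entirely, since per the commit message the
	 * serialization inside the spin lock already strongly orders WC
	 * and other memory types. */
	mmio_wc_spinlock(&ctx->db_lock);
	*(volatile uint64_t *)ctx->db_reg = doorbell;  /* WC doorbell write */
	mmio_wc_spinunlock(&ctx->db_lock);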

Doug Ledford March 14, 2017, 3:12 p.m. UTC | #2
On Tue, 2017-03-14 at 14:06 +0200, Yishai Hadas wrote:
> On 3/13/2017 7:00 PM, Jason Gunthorpe wrote:
> > 
> > On Mon, Mar 13, 2017 at 04:53:47PM +0200, Yishai Hadas wrote:
> > > 
> > > From: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
> > > 
> > > For x86 the serialization within the spin lock is enough to
> > > strongly order WC and other memory types.
> > > 
> > > Add a new barrier named 'mmio_wc_spinlock' to optimize
> > > that.
> > 
> > Please use this patch with the commentary instead:
> 
> OK, the pull request was updated with the patch below.
> https://github.com/linux-rdma/rdma-core/pull/95

Thanks, I've merged this pull request.


Patch

diff --git a/util/udma_barrier.h b/util/udma_barrier.h
index 9e73148af8d5b6..cfe0459d7f6fff 100644
--- a/util/udma_barrier.h
+++ b/util/udma_barrier.h
@@ -33,6 +33,8 @@ 
 #ifndef __UTIL_UDMA_BARRIER_H
 #define __UTIL_UDMA_BARRIER_H
 
+#include <pthread.h>
+
 /* Barriers for DMA.
 
    These barriers are explicitly only for use with user DMA operations. If you
@@ -222,4 +224,37 @@ 
 */
 #define mmio_ordered_writes_hack() mmio_flush_writes()
 
+/* Write Combining Spinlock primitive
+
+   Any access to a multi-value WC region must ensure that multiple CPUs do
+   not write to the same values concurrently; these macros make that
+   straightforward and efficient if the chosen exclusion is a spinlock.
+
+   The spinlock guarantees that the WC writes issued within the critical
+   section are made visible as TLPs to the device. The TLPs must be seen by
+   the device strictly in the order the spinlock is acquired, and combining
+   WC writes between different critical sections is not permitted.
+
+   Use of these macros allows the fencing inside the spinlock to be combined
+   with the fencing required for DMA.
+ */
+static inline void mmio_wc_spinlock(pthread_spinlock_t *lock)
+{
+	pthread_spin_lock(lock);
+#if !defined(__i386__) && !defined(__x86_64__)
+	/* For x86 the serialization within the spin lock is enough to
+	 * strongly order WC and other memory types. */
+	mmio_wc_start();
+#endif
+}
+
+static inline void mmio_wc_spinunlock(pthread_spinlock_t *lock)
+{
+	/* It is possible that on x86 the atomic in the lock is strong enough
+	 * to force-flush the WC buffers quickly, and this SFENCE can be
+	 * omitted too. */
+	mmio_flush_writes();
+	pthread_spin_unlock(lock);
+}
+
 #endif
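
As merged, a provider would wrap its WC doorbell path with the new pair. A
self-contained usage sketch (the struct, field, and function names below are
hypothetical, for illustration only; real providers use their own context
structs and a dedicated aligned 64-bit copy loop rather than memcpy):

	#include <stdint.h>
	#include <string.h>
	#include <pthread.h>

	#include "udma_barrier.h"

	struct wc_doorbell {
		pthread_spinlock_t lock; /* set up with pthread_spin_init();
					  * serializes the WC region */
		void *buf;               /* mapped write-combining BAR page */
	};

	/* Push one 64-byte request through the WC region. The lock pair
	 * guarantees the writes reach the device as TLPs in lock-acquisition
	 * order and are never combined with another thread's writes. */
	static void wc_post(struct wc_doorbell *db, const void *wqe)
	{
		mmio_wc_spinlock(&db->lock);   /* lock + WC start fence */
		memcpy(db->buf, wqe, 64);      /* WC writes in the section */
		mmio_wc_spinunlock(&db->lock); /* flush WC, then unlock */
	}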