
[2/2] dm-writecache

Message ID alpine.LRH.2.02.1711222135371.2330@file01.intranet.prod.int.rdu2.redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Mike Snitzer

Commit Message

Mikulas Patocka Nov. 23, 2017, 2:36 a.m. UTC
Hi

Here I'm sending a slightly updated dm-writecache target.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/Kconfig         |   11 
 drivers/md/Makefile        |    1 
 drivers/md/dm-writecache.c | 2312 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 2324 insertions(+)



Comments

Christoph Hellwig Nov. 23, 2017, 8:29 a.m. UTC | #1
> +/*
> + * The clflushopt instruction is very slow on Broadwell, so we use non-temporal
> + * stores instead.
> + */
> +#ifdef CONFIG_X86_64
> +#define NT_STORE(dest, src)	asm ("movnti %1, %0" : "=m"(dest) : "r"(src))
> +#define FLUSH_RANGE(dax, ptr, size)  do { } while (0)
> +#define COMMIT_FLUSHED()	wmb()
> +#else
> +#define NT_STORE(dest, src)	ACCESS_ONCE(dest) = (src)
> +#define FLUSH_RANGE		dax_flush
> +#define COMMIT_FLUSHED()	do { } while (0)
> +#endif

Please go through the proper pmem APIs.

Dan Williams Nov. 23, 2017, 4:29 p.m. UTC | #2
On Thu, Nov 23, 2017 at 12:29 AM, Christoph Hellwig <hch@infradead.org> wrote:
>> +/*
>> + * The clflushopt instruction is very slow on Broadwell, so we use non-temporal
>> + * stores instead.
>> + */
>> +#ifdef CONFIG_X86_64
>> +#define NT_STORE(dest, src)  asm ("movnti %1, %0" : "=m"(dest) : "r"(src))
>> +#define FLUSH_RANGE(dax, ptr, size)  do { } while (0)
>> +#define COMMIT_FLUSHED()     wmb()
>> +#else
>> +#define NT_STORE(dest, src)  ACCESS_ONCE(dest) = (src)
>> +#define FLUSH_RANGE          dax_flush
>> +#define COMMIT_FLUSHED()     do { } while (0)
>> +#endif
>
> Please go through the proper pmem APIs.

Yes, see _copy_from_iter_flushcache() and all its related apis.
Drivers shouldn't be doing anything different than what fs/dax.c is
using.
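
For a single metadata field that would look something like this - only a
sketch, the helper name is made up, and it assumes an aligned 8-byte slot
(on x86-64 memcpy_flushcache() ends up as a non-temporal store and wmb()
drains the write-combining buffers):

#include <linux/string.h>	/* memcpy_flushcache() */

/* hypothetical helper: persist one 64-bit metadata field via the pmem API */
static void wc_store_u64(__le64 *slot, u64 val)
{
	__le64 v = cpu_to_le64(val);

	/* for an aligned 8-byte field this is a single movnti on x86-64 */
	memcpy_flushcache(slot, &v, sizeof(v));
	/* sfence on x86-64: drain the write-combining buffers */
	wmb();
}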

Mikulas Patocka Nov. 23, 2017, 9:22 p.m. UTC | #3
On Thu, 23 Nov 2017, Dan Williams wrote:

> On Thu, Nov 23, 2017 at 12:29 AM, Christoph Hellwig <hch@infradead.org> wrote:
> >> +/*
> >> + * The clflushopt instruction is very slow on Broadwell, so we use non-temporal
> >> + * stores instead.
> >> + */
> >> +#ifdef CONFIG_X86_64
> >> +#define NT_STORE(dest, src)  asm ("movnti %1, %0" : "=m"(dest) : "r"(src))
> >> +#define FLUSH_RANGE(dax, ptr, size)  do { } while (0)
> >> +#define COMMIT_FLUSHED()     wmb()
> >> +#else
> >> +#define NT_STORE(dest, src)  ACCESS_ONCE(dest) = (src)
> >> +#define FLUSH_RANGE          dax_flush
> >> +#define COMMIT_FLUSHED()     do { } while (0)
> >> +#endif
> >
> > Please go through the proper pmem APIs.
> 
> Yes, see _copy_from_iter_flushcache() and all its related apis.
> Drivers shouldn't be doing anything different than what fs/dax.c is
> using.

There is not any arch-independent API that abstracts over a single movnti 
instruction.

Do you want to add it?

memcpy_flushcache is overkill for one or two 8-byte stores. And on non-x86
architectures, memcpy_flushcache doesn't flush the cache at all.

Mikulas

Dan Williams Nov. 25, 2017, 5:43 a.m. UTC | #4
On Wed, Nov 22, 2017 at 6:36 PM, Mikulas Patocka <mpatocka@redhat.com> wrote:
> Hi
>
> Here I'm sending slighly update dm-writecache target.
>

I would expect some theory of operation documentation in the changelog
or in Documentation/ for a new driver.

> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>
[..]
> +/*
> + * The clflushopt instruction is very slow on Broadwell, so we use non-temporal
> + * stores instead.
> + */

The clflushopt instruction's first instantiation in a cpu was after
Broadwell. Also, the performance of clflushopt does not much matter
when running on a system without ADR (asynchronous DRAM refresh) where
flushing the cpu cache can just leave the writes stranded on their way
to the DIMM. ADR is not available on general purpose servers until
ACPI 6.x NFIT-defined persistent memory platforms.

> +#ifdef CONFIG_X86_64

In general something is broken if we end up with per-arch ifdefs like
this in drivers to handle persistent memory. This should be using the
pmem api or extensions of it, and we need to settle on a mechanism for
upper-level drivers to ask if pmem is driving platform protected
persistent memory.

> +#define NT_STORE(dest, src)    asm ("movnti %1, %0" : "=m"(dest) : "r"(src))
> +#define FLUSH_RANGE(dax, ptr, size)  do { } while (0)
> +#define COMMIT_FLUSHED()       wmb()
> +#else
> +#define NT_STORE(dest, src)    ACCESS_ONCE(dest) = (src)
> +#define FLUSH_RANGE            dax_flush
> +#define COMMIT_FLUSHED()       do { } while (0)

Is this just for test purposes? How does the user discover that they
are running in a degraded mode as far as persistence guarantees? I
think we should be falling back to DM_WRITECACHE_ONLY_SSD mode if we're
not on a pmem platform.

Mikulas Patocka Nov. 28, 2017, 4:01 a.m. UTC | #5
On Fri, 24 Nov 2017, Dan Williams wrote:

> On Wed, Nov 22, 2017 at 6:36 PM, Mikulas Patocka <mpatocka@redhat.com> wrote:
> > Hi
> >
> > Here I'm sending slighly update dm-writecache target.
> >
> 
> I would expect some theory of operation documentation in the changelog
> or in Documentation/ for a new driver.

It caches writes for a block device in persistent memory. It doesn't cache 
reads because reads are supposed to be cached in normal ram.

> > Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> >
> [..]
> > +/*
> > + * The clflushopt instruction is very slow on Broadwell, so we use non-temporal
> > + * stores instead.
> > + */
> 
> The clflushopt instruction's first instantiation in a cpu was after
> Broadwell. Also, the performance of clflushopt does not much matter
> when running on a system without ADR (asynchronous DRAM refresh) where
> flushing the cpu cache can just leave the writes stranded on their way
> to the DIMM. ADR is not available on general purpose servers until
> ACPI 6.x NFIT-defined persistent memory platforms.

There used to be a pcommit instruction, but Intel abandoned it - see 
https://software.intel.com/en-us/blogs/2016/09/12/deprecate-pcommit-instruction

So, according to the document, flushing the cache should be enough for 
writes to reach persistent memory.

> > +#ifdef CONFIG_X86_64
> 
> In general something is broken if we end up with per-arch ifdefs like
> this in drivers to handle persistent memory. This should be using the
> pmem api or extensions of it, and we need to settle on a mechanism for
> upper-level drivers to ask if pmem is driving platform protected
> persistent memory.
> 
> > +#define NT_STORE(dest, src)    asm ("movnti %1, %0" : "=m"(dest) : "r"(src))
> > +#define FLUSH_RANGE(dax, ptr, size)  do { } while (0)
> > +#define COMMIT_FLUSHED()       wmb()
> > +#else
> > +#define NT_STORE(dest, src)    ACCESS_ONCE(dest) = (src)
> > +#define FLUSH_RANGE            dax_flush
> > +#define COMMIT_FLUSHED()       do { } while (0)
> 
> Is this just for test purposes? How does the user discover that they
> are running in a degraded mode as far as persistence guarantees? I
> think we should be falling back DM_WRITECACHE_ONLY_SSD mode if we're
> not on a pmem platform.

What degraded mode do you mean? According to that document, flushing the 
cache should be enough. If it is not enough, what else do I need to do to 
flush the cache?


The above code is not for test purposes. It is for performance purposes.

On a dual-socket Broadwell server with persistent memory, when we write to 
persistent memory using cached write instructions and use dax_flush 
afterwards to flush the cache for the affected range, the performance is 
about 350MB/s. It is practically unusable - worse than low-end SSDs.

On the other hand, the movnti instruction can sustain a rate of one 
8-byte write per clock cycle. We don't have to flush the cache afterwards; 
the only thing that must be done is to flush the write-combining buffer 
with the sfence instruction. Movnti has much better throughput than dax_flush.

You argue that I should use the pmem API, but there is no pmem API 
providing the movnti instruction. I have no choice but to code the 
instruction directly in assembler.

If you want to create an API, take that "NT_STORE" definition and move it 
to some x86-specific include file.
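
Roughly like this (just a sketch; the helper name and file placement are
made up):

/* x86-64, e.g. in an arch-specific header: */
static inline void arch_pmem_store_8(u64 *dst, u64 val)
{
	/* single non-temporal 8-byte store, bypassing the CPU cache */
	asm volatile("movnti %1, %0" : "=m" (*dst) : "r" (val));
}
#define arch_pmem_store_8 arch_pmem_store_8

/* generic fallback, e.g. in a generic pmem header: */
#ifndef arch_pmem_store_8
static inline void arch_pmem_store_8(u64 *dst, u64 val)
{
	WRITE_ONCE(*dst, val);	/* caller must flush the range, e.g. dax_flush */
}
#endif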

Mikulas

Dan Williams Nov. 28, 2017, 4:56 a.m. UTC | #6
On Mon, Nov 27, 2017 at 8:01 PM, Mikulas Patocka <mpatocka@redhat.com> wrote:
>
>
>
> On Fri, 24 Nov 2017, Dan Williams wrote:
>
> > On Wed, Nov 22, 2017 at 6:36 PM, Mikulas Patocka <mpatocka@redhat.com> wrote:
> > > Hi
> > >
> > > Here I'm sending slighly update dm-writecache target.
> > >
> >
> > I would expect some theory of operation documentation in the changelog
> > or in Documentation/ for a new driver.
>
> It caches writes for a block device in persistent memory. It doesn't cache
> reads because reads are supposed to be cached in normal ram.

Is it a cache or more of a write-buffer? How aggressively does it
write back data? Are there any recommendations about cache size vs
backing device size? Does it also buffer sequential writes? For
example bcache tries to avoid caching sequential traffic.

>
> > > Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
> > >
> > [..]
> > > +/*
> > > + * The clflushopt instruction is very slow on Broadwell, so we use non-temporal
> > > + * stores instead.
> > > + */
> >
> > The clflushopt instruction's first instantiation in a cpu was after
> > Broadwell. Also, the performance of clflushopt does not much matter
> > when running on a system without ADR (asynchronous DRAM refresh) where
> > flushing the cpu cache can just leave the writes stranded on their way
> > to the DIMM. ADR is not available on general purpose servers until
> > ACPI 6.x NFIT-defined persistent memory platforms.
>
> There used to be pcommit instruction, but intel abandoned it - see
> https://software.intel.com/en-us/blogs/2016/09/12/deprecate-pcommit-instruction

Yes, I am familiar with that document, I wrote the pcommit removal
patch and referenced ADR in the commit message:

    fd1d961dd681 x86/insn: remove pcommit

> So, according to the document, flushing the cache should be enough for
> writes to reach persistent memory.

The document assumes ADR is present.

> > > +#ifdef CONFIG_X86_64
> >
> > In general something is broken if we end up with per-arch ifdefs like
> > this in drivers to handle persistent memory. This should be using the
> > pmem api or extensions of it, and we need to settle on a mechanism for
> > upper-level drivers to ask if pmem is driving platform protected
> > persistent memory.
> >
> > > +#define NT_STORE(dest, src)    asm ("movnti %1, %0" : "=m"(dest) : "r"(src))
> > > +#define FLUSH_RANGE(dax, ptr, size)  do { } while (0)
> > > +#define COMMIT_FLUSHED()       wmb()
> > > +#else
> > > +#define NT_STORE(dest, src)    ACCESS_ONCE(dest) = (src)
> > > +#define FLUSH_RANGE            dax_flush
> > > +#define COMMIT_FLUSHED()       do { } while (0)
> >
> > Is this just for test purposes? How does the user discover that they
> > are running in a degraded mode as far as persistence guarantees? I
> > think we should be falling back DM_WRITECACHE_ONLY_SSD mode if we're
> > not on a pmem platform.
>
> What degraded mode do you mean?

Fall back to treating pmem like an SSD / block-device.

> According to that document, flushing cache
> should be enough. If it is not enough, what else do I need to do to flush
> cache?

...ensure that you are on a platform where flushing the cpu cache is enough.

> The above code is not for test purposes. It is for performance purposes.
>
> On dual-socket Broadwell server with persistent memory

What platform is this... does it have ADR, does the BIOS produce an NFIT?

> when we write to
> persistent memory using cached write instructions and use dax_flush
> afterwards to flush cache for the affected range, the performance is about
> 350MB/s. It is practically unusable - worse than low-end SSDs.
>
> On the other hand, the movnti instruction can sustain performance of one
> 8-byte write per clock cycle. We don't have to flush cache afterwards, the
> only thing that must be done is to flush the write-combining buffer with
> the sfence instruction. Movnti has much better throughput than dax_flush.

What about memcpy_flushcache?

> You argue that I should use the pmem api, but there is no pmem api
> providing the movnti intruction. I have no choice, but coding the
> instruction directly in assembler.

You have not convinced me that memcpy_flushcache is insufficient, or
that the driver is mistakenly assuming guarantees about persistence
when it is not running on a persistent memory enabled platform.

> If you want to create API, take that "NT_STORE" definition and move it to
> some x86-specific include file.

I'm asking you to not solve global problems in your local driver. If
the pmem api is deficient for your use case then let's work on
improving it, but as of now I'm still trying to baseline your
assumptions.

Mikulas Patocka Dec. 5, 2017, 5:40 a.m. UTC | #7
On Mon, 27 Nov 2017, Dan Williams wrote:

> On Mon, Nov 27, 2017 at 8:01 PM, Mikulas Patocka <mpatocka@redhat.com> wrote:
> >
> >
> >
> > On Fri, 24 Nov 2017, Dan Williams wrote:
> >
> > > On Wed, Nov 22, 2017 at 6:36 PM, Mikulas Patocka <mpatocka@redhat.com> wrote:
> > > > Hi
> > > >
> > > > Here I'm sending slighly update dm-writecache target.
> > > >
> > >
> > > I would expect some theory of operation documentation in the changelog
> > > or in Documentation/ for a new driver.
> >
> > It caches writes for a block device in persistent memory. It doesn't cache
> > reads because reads are supposed to be cached in normal ram.
> 
> Is it a cache or more of a write-buffer? How aggressively does it
> write back data? Are there any recommendations about cache size vs
> backing device size? Does it also buffer sequential writes? For
> example bcache tries to avoid caching sequential traffic.

It caches all writes. When the size of the cache reaches a predefined 
threshold (50% by default), it starts writing cache content to the backing 
device.

The recommended cache size depends on the workload - if the user 
repeatedly overwrites only a small part of the disk, then a small cache is 
sufficient.

> > > > +/*
> > > > + * The clflushopt instruction is very slow on Broadwell, so we use non-temporal
> > > > + * stores instead.
> > > > + */
> > >
> > > The clflushopt instruction's first instantiation in a cpu was after
> > > Broadwell. Also, the performance of clflushopt does not much matter
> > > when running on a system without ADR (asynchronous DRAM refresh) where
> > > flushing the cpu cache can just leave the writes stranded on their way
> > > to the DIMM. ADR is not available on general purpose servers until
> > > ACPI 6.x NFIT-defined persistent memory platforms.
> >
> > There used to be pcommit instruction, but intel abandoned it - see
> > https://software.intel.com/en-us/blogs/2016/09/12/deprecate-pcommit-instruction
> 
> Yes, I am familiar with that document, I wrote the pcommit removal
> patch and referenced ADR in the commit message:
> 
>     fd1d961dd681 x86/insn: remove pcommit
> 
> > So, according to the document, flushing the cache should be enough for
> > writes to reach persistent memory.
> 
> The document assumes ADR is present.

Could there be a case where persistent memory is present, but ADR is not? 

> > > > +#ifdef CONFIG_X86_64
> > >
> > > In general something is broken if we end up with per-arch ifdefs like
> > > this in drivers to handle persistent memory. This should be using the
> > > pmem api or extensions of it, and we need to settle on a mechanism for
> > > upper-level drivers to ask if pmem is driving platform protected
> > > persistent memory.
> > >
> > > > +#define NT_STORE(dest, src)    asm ("movnti %1, %0" : "=m"(dest) : "r"(src))
> > > > +#define FLUSH_RANGE(dax, ptr, size)  do { } while (0)
> > > > +#define COMMIT_FLUSHED()       wmb()
> > > > +#else
> > > > +#define NT_STORE(dest, src)    ACCESS_ONCE(dest) = (src)
> > > > +#define FLUSH_RANGE            dax_flush
> > > > +#define COMMIT_FLUSHED()       do { } while (0)
> > >
> > > Is this just for test purposes? How does the user discover that they
> > > are running in a degraded mode as far as persistence guarantees? I
> > > think we should be falling back DM_WRITECACHE_ONLY_SSD mode if we're
> > > not on a pmem platform.
> >
> > What degraded mode do you mean?
> 
> Fall back to treating pmem like an SSD / block-device.

If the dm-writecache driver cannot reliably flush the cache, then the 
/dev/pmem block device driver couldn't reliably flush the cache either.

> > According to that document, flushing cache
> > should be enough. If it is not enough, what else do I need to do to flush
> > cache?
> 
> ...ensure that you are on a platform where flushing the cpu cache is enough.
> 
> > The above code is not for test purposes. It is for performance purposes.
> >
> > On dual-socket Broadwell server with persistent memory
> 
> What platform is this... does it have ADR, does the BIOS produce an NFIT?

I don't know. I already released the machine to someone else. If you 
want, I can try to re-acquire access to it.

> > when we write to
> > persistent memory using cached write instructions and use dax_flush
> > afterwards to flush cache for the affected range, the performance is about
> > 350MB/s. It is practically unusable - worse than low-end SSDs.
> >
> > On the other hand, the movnti instruction can sustain performance of one
> > 8-byte write per clock cycle. We don't have to flush cache afterwards, the
> > only thing that must be done is to flush the write-combining buffer with
> > the sfence instruction. Movnti has much better throughput than dax_flush.
> 
> What about memcpy_flushcache?

but

- using memcpy_flushcache is overkill if we need just one or two 8-byte 
writes to the metadata area. Why not use movnti directly?

- on some architectures, memcpy_flushcache is just an alias for memcpy, so 
there will still be some arch-specific ifdefs

- there is no architecture-neutral way to guarantee ordering between 
multiple memcpy_flushcache calls. On x86 we need wmb(); on Power we don't; 
on ARM64 I don't know (arch_wb_cache_pmem calls dmb(osh), 
memcpy_flushcache doesn't - I don't know what the implications of this 
difference are); on other architectures, wmb() is insufficient to 
guarantee ordering between multiple memcpy_flushcache calls.

- on Power and ARM64, memcpy_flushcache just does memcpy and flushes the 
cache afterwards. What is better for performance? Is it better to do 
multiple memcpy calls and later multiple dax_flush calls? Or is it better 
to do multiple memcpy_flushcache calls and no dax_flush? On x86, the 
latter is clearly beneficial, but I don't know if it would also be 
beneficial on ARM64 and Power.
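
I.e. the two variants I'm comparing are roughly (sketch only; dst,
blocks[] and BLOCK_SIZE are placeholders, and it needs <linux/dax.h> and
<linux/string.h>):

/* variant 1: cached copies, then one dax_flush over the whole range */
static void copy_then_dax_flush(struct dax_device *dax_dev, char *dst,
				void **blocks, unsigned n)
{
	unsigned i;

	for (i = 0; i < n; i++)
		memcpy(dst + (size_t)i * BLOCK_SIZE, blocks[i], BLOCK_SIZE);
	dax_flush(dax_dev, dst, (size_t)n * BLOCK_SIZE);
}

/* variant 2: memcpy_flushcache per block, no dax_flush afterwards */
static void copy_with_flushcache(char *dst, void **blocks, unsigned n)
{
	unsigned i;

	for (i = 0; i < n; i++)
		memcpy_flushcache(dst + (size_t)i * BLOCK_SIZE,
				  blocks[i], BLOCK_SIZE);
}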

> > You argue that I should use the pmem api, but there is no pmem api
> > providing the movnti intruction. I have no choice, but coding the
> > instruction directly in assembler.
> 
> You have not convinced me that memcpy_flushcache is insufficient, or
> that the driver is mistakenly assuming guarantees about persistence
> when it is not running on a persistent memory enabled platform.

See above.

> > If you want to create API, take that "NT_STORE" definition and move it to
> > some x86-specific include file.
> 
> I'm asking you to not solve global problems in your local driver. If
> the pmem api is deficient for your use case then let's work on
> improving it, but as of now I'm still trying to baseline your
> assumptions.

I'm thinking that perhaps a proper solution would be to map the persistent 
memory always as write combining - and then we wouldn't have to ponder how 
to do write-combining writes into the write-back memory type.

At least for Broadwell, the write-back memory type on persistent memory 
has such horrible performance that it is not really usable.

Mikulas

Dan Williams Dec. 9, 2017, 2:10 a.m. UTC | #8
On Mon, Dec 4, 2017 at 9:40 PM, Mikulas Patocka <mpatocka@redhat.com> wrote:
> On Mon, 27 Nov 2017, Dan Williams wrote:
[..]
> > > So, according to the document, flushing the cache should be enough for
> > > writes to reach persistent memory.
> >
> > The document assumes ADR is present.
>
> Could there be a case where persistent memory is present, but ADR is not?

No, that's why it was safe to deprecate the pcommit instruction: software
can assume that once writes reach the memory controller they are
persistent, and there is no requirement to flush them through the memory
controller to media, which is what the pcommit instruction performed.

> > > > > +#ifdef CONFIG_X86_64
> > > >
> > > > In general something is broken if we end up with per-arch ifdefs like
> > > > this in drivers to handle persistent memory. This should be using the
> > > > pmem api or extensions of it, and we need to settle on a mechanism for
> > > > upper-level drivers to ask if pmem is driving platform protected
> > > > persistent memory.
> > > >
> > > > > +#define NT_STORE(dest, src)    asm ("movnti %1, %0" : "=m"(dest) : "r"(src))
> > > > > +#define FLUSH_RANGE(dax, ptr, size)  do { } while (0)
> > > > > +#define COMMIT_FLUSHED()       wmb()
> > > > > +#else
> > > > > +#define NT_STORE(dest, src)    ACCESS_ONCE(dest) = (src)
> > > > > +#define FLUSH_RANGE            dax_flush
> > > > > +#define COMMIT_FLUSHED()       do { } while (0)
> > > >
> > > > Is this just for test purposes? How does the user discover that they
> > > > are running in a degraded mode as far as persistence guarantees? I
> > > > think we should be falling back DM_WRITECACHE_ONLY_SSD mode if we're
> > > > not on a pmem platform.
> > >
> > > What degraded mode do you mean?
> >
> > Fall back to treating pmem like an SSD / block-device.
>
> If the dm-writecache driver cannot reliably flush cache - then the
> /dev/pmem block device driver couldn't reliably flush cache neither.

Right, but the pmem driver warns when it can't detect that it is
running on a persistent memory enabled platform. I'm saying we should
turn that warning into an API so that in-kernel consumers of pmem can
issue the same warning, or make the same determination to disable
functionality, on non-pmem capable platforms.
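
Something along these lines, purely hypothetical - neither the helper nor
the dm-writecache function below exists, it only illustrates how a
consumer could fall back to SSD-only behavior:

/* hypothetical export from the dax/pmem core */
bool dax_flush_is_safe(struct dax_device *dax_dev);

/* hypothetical consumer in dm-writecache, at table construction time */
static void writecache_pick_mode(struct dm_writecache *wc)
{
	if (!dax_flush_is_safe(wc->ssd_dev->dax_dev)) {
		DMWARN("platform cannot guarantee persistence, using SSD-only mode");
		wc->pmem_mode = false;
	} else {
		wc->pmem_mode = true;
	}
}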

> > > According to that document, flushing cache
> > > should be enough. If it is not enough, what else do I need to do to flush
> > > cache?
> >
> > ...ensure that you are on a platform where flushing the cpu cache is enough.
> >
> > > The above code is not for test purposes. It is for performance purposes.
> > >
> > > On dual-socket Broadwell server with persistent memory
> >
> > What platform is this... does it have ADR, does the BIOS produce an NFIT?
>
> I don't know. I already released the machine for someone else. If you
> want, I can try to re-acquire the access to it.

As long as your driver consumes the new/TBD in-kernel API to determine
whether the platform is pmem-enabled or not, then I think we're good. My
concern is that we're optimizing the kernel interfaces for a platform
that does not support pmem, or that requires a pmem support approach we
would abandon when the clwb instruction can replace clflush usage.

>
> > > when we write to
> > > persistent memory using cached write instructions and use dax_flush
> > > afterwards to flush cache for the affected range, the performance is about
> > > 350MB/s. It is practically unusable - worse than low-end SSDs.
> > >
> > > On the other hand, the movnti instruction can sustain performance of one
> > > 8-byte write per clock cycle. We don't have to flush cache afterwards, the
> > > only thing that must be done is to flush the write-combining buffer with
> > > the sfence instruction. Movnti has much better throughput than dax_flush.
> >
> > What about memcpy_flushcache?
>
> but
>
> - using memcpy_flushcache is overkill if we need just one or two 8-byte
> writes to the metadata area. Why not use movnti directly?
>

The driver performs so many 8-byte moves that the cost of the
memcpy_flushcache() function call significantly eats into your
performance?

> - on some architctures, memcpy_flushcache is just an alias for memcpy, so
> there will still be some arch-specific ifdefs

...but those should all be hidden by arch code, not in drivers.

> - there is no architecture-neutral way how to guarantee ordering between
> multiple memcpy_flushcache calls. On x86, we need wmb(), on Power we
> don't, on ARM64 I don't know (arch_wb_cache_pmem calls dmb(osh),
> memcpy_flushcache doesn't - I don't know what are the implications of this
> difference) on other architectures, wmb() is insufficient to guarantee
> ordering between multiple memcpy_flushcache calls.

wmb() should always be sufficient to order stores on all architectures.

>
> - on Power and ARM64, memcpy_flushcache just does memcpy and flushes the
> cache afterwards. What is better for performance? Is is better to do
> multiple memcpy calls and later multiple dax_flush calls? Or is it better
> to do multiple memcpy_flushcache calls and no dax_flush? On x86, the
> latter is clearly benefical, but I don't know if it would also be
> benefical on ARM64 and Power.
>
> > > You argue that I should use the pmem api, but there is no pmem api
> > > providing the movnti intruction. I have no choice, but coding the
> > > instruction directly in assembler.
> >
> > You have not convinced me that memcpy_flushcache is insufficient, or
> > that the driver is mistakenly assuming guarantees about persistence
> > when it is not running on a persistent memory enabled platform.
>
> See above.
>
> > > If you want to create API, take that "NT_STORE" definition and move it to
> > > some x86-specific include file.
> >
> > I'm asking you to not solve global problems in your local driver. If
> > the pmem api is deficient for your use case then let's work on
> > improving it, but as of now I'm still trying to baseline your
> > assumptions.
>
> I'm thinking that perhaps a proper solution would be to map the persistent
> memory always as write combining - and then we wouldn't have ponder how to
> do write-combining writes into write-back memory type.

Yes, if memcpy_flushcache() really is overkill then it might be better
to just mark the memory write-through for that use. I need to go look
at whether there are any concerns with using PAT to mark a range WT when
the MTRRs have it marked WB.

Another alternative is to convert this driver to use the same
interfaces that the BTT driver uses. See nvdimm_write_bytes().

> At least for Broadwell, the write-back memory type on persistent memory
> has so horrible performance that it not really usable.

...and my concern is that you're designing a pmem driver mechanism for
Broadwell that predates the clwb instruction for efficient pmem
access.

Mikulas Patocka Dec. 22, 2017, 8:56 p.m. UTC | #9
On Fri, 8 Dec 2017, Dan Williams wrote:

> > > What about memcpy_flushcache?
> >
> > but
> >
> > - using memcpy_flushcache is overkill if we need just one or two 8-byte
> > writes to the metadata area. Why not use movnti directly?
> >
> 
> The driver performs so many 8-byte moves that the cost of the
> memcpy_flushcache() function call significantly eats into your
> performance?
>
> > - on some architctures, memcpy_flushcache is just an alias for memcpy, so
> > there will still be some arch-specific ifdefs
> 
> ...but those should all be hidden by arch code, not in drivers.
> 
> > - there is no architecture-neutral way how to guarantee ordering between
> > multiple memcpy_flushcache calls. On x86, we need wmb(), on Power we
> > don't, on ARM64 I don't know (arch_wb_cache_pmem calls dmb(osh),
> > memcpy_flushcache doesn't - I don't know what are the implications of this
> > difference) on other architectures, wmb() is insufficient to guarantee
> > ordering between multiple memcpy_flushcache calls.
> 
> wmb() should always be sufficient to order stores on all architectures.

No, it isn't. See this example:

uint64_t var_a, var_b;

void fn(void)
{
	uint64_t val = 3;
	memcpy_flushcache(&var_a, &val, 8);
	wmb();
	val = 5;
	memcpy_flushcache(&var_b, &val, 8);
}

On x86-64, memcpy_flushcache is implemented using the movnti instruction 
(that writes the value to the write-combining buffer) and wmb() is 
compiled into the sfence instruction (that flushes the write-combining 
buffer) - so it's OK - it is guaranteed that the variable var_a is written 
to persistent memory before the variable var_b.

However, on i686 (and most other architectures), memcpy_flushcache is just 
an alias for memcpy - see this code in include/linux/string.h:
#ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE
static inline void memcpy_flushcache(void *dst, const void *src, size_t cnt)
{
        memcpy(dst, src, cnt);
}
#endif

So, memcpy_flushcache writes the variable var_a to the cache, wmb() 
flushes the pipeline, but otherwise does nothing (because on i686 all 
writes to the cache are already ordered) and then memcpy_flushcache writes 
the variable var_b to the cache - however, both writes end up in cache and 
wmb() doesn't flush the cache. So wmb() doesn't provide any sort of 
guarantee that var_a is written to persistent memory before var_b. If the 
cache sector containing the variable var_b is more contended than the 
cache sector containing the variable var_a, the CPU will flush the cache 
line containing var_b before var_a.

We have the dax_flush() function that (unlike wmb) guarantees that the 
specified range is flushed from the cache and written to persistent 
memory. But dax_flush is very slow on x86-64.

So, we have a situation where wmb() is fast but only guarantees ordering 
on x86-64, while dax_flush() guarantees ordering on all architectures but 
is slow on x86-64.

So, my driver uses wmb() on x86-64 and dax_flush() on all the others.

You argue that the driver shouldn't use any per-architecture #ifdefs, but 
there is no other way to do it - using dax_flush() on x86-64 kills 
performance, and using wmb() on all architectures is unreliable because 
wmb() doesn't guarantee cache flushing at all.

> > At least for Broadwell, the write-back memory type on persistent memory
> > has so horrible performance that it not really usable.
> 
> ...and my concern is that you're designing a pmem driver mechanism for
> Broadwell that predates the clwb instruction for efficient pmem
> access.

That's what we have for testing. If I get access to some Skylake server, I 
can test if using write combining is faster than cache flushing.

Mikulas

Dan Williams Dec. 22, 2017, 10:24 p.m. UTC | #10
On Fri, Dec 22, 2017 at 12:56 PM, Mikulas Patocka <mpatocka@redhat.com> wrote:
>
>
> On Fri, 8 Dec 2017, Dan Williams wrote:
>
>> > > What about memcpy_flushcache?
>> >
>> > but
>> >
>> > - using memcpy_flushcache is overkill if we need just one or two 8-byte
>> > writes to the metadata area. Why not use movnti directly?
>> >
>>
>> The driver performs so many 8-byte moves that the cost of the
>> memcpy_flushcache() function call significantly eats into your
>> performance?
>>
>> > - on some architctures, memcpy_flushcache is just an alias for memcpy, so
>> > there will still be some arch-specific ifdefs
>>
>> ...but those should all be hidden by arch code, not in drivers.
>>
>> > - there is no architecture-neutral way how to guarantee ordering between
>> > multiple memcpy_flushcache calls. On x86, we need wmb(), on Power we
>> > don't, on ARM64 I don't know (arch_wb_cache_pmem calls dmb(osh),
>> > memcpy_flushcache doesn't - I don't know what are the implications of this
>> > difference) on other architectures, wmb() is insufficient to guarantee
>> > ordering between multiple memcpy_flushcache calls.
>>
>> wmb() should always be sufficient to order stores on all architectures.
>
> No, it isn't. See this example:
>
> uint64_t var_a, var_b;
>
> void fn(void)
> {
>         uint64_t val = 3;
>         memcpy_flushcache(&var_a, &val, 8);
>         wmb();
>         val = 5;
>         memcpy_flushcache(&var_b, &val, 8);;
> }
>
> On x86-64, memcpy_flushcache is implemented using the movnti instruction
> (that writes the value to the write-combining buffer) and wmb() is
> compiled into the sfence instruction (that flushes the write-combining
> buffer) - so it's OK - it is guaranteed that the variable var_a is written
> to persistent memory before the variable var_b;
>
> However, on i686 (and most other architectures), memcpy_flushcache is just
> an alias for memcpy - see this code in include/linux/string.h:
> #ifndef __HAVE_ARCH_MEMCPY_FLUSHCACHE
> static inline void memcpy_flushcache(void *dst, const void *src, size_t
> cnt)
> {
>         memcpy(dst, src, cnt);
> }
> #endif
>
> So, memcpy_flushcache writes the variable var_a to the cache, wmb()
> flushes the pipeline, but otherwise does nothing (because on i686 all
> writes to the cache are already ordered) and then memcpy_flushcache writes
> the variable var_b to the cache - however, both writes end up in cache and
> wmb() doesn't flush the cache. So wmb() doesn't provide any sort of
> guarantee that var_a is written to persistent memory before var_b. If the
> cache sector containing the variable var_b is more contended than the
> cache sector containg the variable var_a, the CPU will flush cache line
> containing var_b before var_a.

You're overlooking the fact that there is no persistent memory support
for i686. The pmem driver will warn you of this fact:

    dev_warn(dev, "unable to guarantee persistence of writes\n");

The reason persistent memory is not supported on i686 is due to the
fact that older cpus do not guarantee that non-temporal stores always
bypass the cache.

> We have the dax_flush() function that (unlike wmb) guarantees that the
> specified range is flushed from the cache and written to persistent
> memory. But dax_flush is very slow on x86-64.

dax_flush() does not make that guarantee on i686, and it's only slow
on architectures that were not built for persistent memory support,
i.e. architectures prior to the arrival of the clwb and clflushopt
instructions.

> So, we have a situation where wmb() is fast, but it only guarantees
> ordering on x86-64 and dax_flush() that guarantees ordering on all
> architectures, but it is slow on x86-64.
>
> So, my driver uses wmb() on x86-64 and dax_flush() on all the others.

...and I'm saying you're hard coding for a cpu that was not built to
support persistent memory efficiently in the first place.

> You argue that the driver shouldn't use any per-architecture #ifdefs, but
> there is no other way how to do it - using dax_flush() on x86-64 kills
> performance and using wmb() on all architectures is unreliable because
> wmb() doesn't guarantee cache flushing at all.
>
>> > At least for Broadwell, the write-back memory type on persistent memory
>> > has so horrible performance that it not really usable.
>>
>> ...and my concern is that you're designing a pmem driver mechanism for
>> Broadwell that predates the clwb instruction for efficient pmem
>> access.
>
> That's what we have for testing. If I get access to some Skylake server, I
> can test if using write combining is faster than cache flushing.

I'm still missing where memcpy_flushcache is unsuitable for your use
case, given that i686 does not support reliable persistent memory
access.


Patch

Index: linux-2.6/drivers/md/Kconfig
===================================================================
--- linux-2.6.orig/drivers/md/Kconfig
+++ linux-2.6/drivers/md/Kconfig
@@ -326,6 +326,17 @@  config DM_CACHE_SMQ
          of less memory utilization, improved performance and increased
          adaptability in the face of changing workloads.
 
+config DM_WRITECACHE
+	tristate "Writecache target"
+	depends on BLK_DEV_DM
+	---help---
+	   The writecache target caches writes on persistent memory or SSD.
+	   It is intended for databases or other programs that need extremely
+	   low commit latency.
+
+	   The writecache target doesn't cache reads because reads are supposed
+	   to be cached in standard RAM.
+
 config DM_ERA
        tristate "Era target (EXPERIMENTAL)"
        depends on BLK_DEV_DM
Index: linux-2.6/drivers/md/Makefile
===================================================================
--- linux-2.6.orig/drivers/md/Makefile
+++ linux-2.6/drivers/md/Makefile
@@ -63,6 +63,7 @@  obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
 obj-$(CONFIG_DM_INTEGRITY)	+= dm-integrity.o
 obj-$(CONFIG_DM_ZONED)		+= dm-zoned.o
+obj-$(CONFIG_DM_WRITECACHE)	+= dm-writecache.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
Index: linux-2.6/drivers/md/dm-writecache.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/md/dm-writecache.c
@@ -0,0 +1,2312 @@ 
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/vmalloc.h>
+#include <linux/kthread.h>
+#include <linux/swait.h>
+#include <linux/delay.h>
+#include <linux/dm-io.h>
+#include <linux/dm-kcopyd.h>
+#include <linux/dax.h>
+#include <linux/pfn_t.h>
+
+#define DM_MSG_PREFIX	"writecache"
+
+#define HIGH_WATERMARK			50
+#define LOW_WATERMARK			45
+#define MAX_WRITEBACK_JOBS		1024
+#define ENDIO_LATENCY			16
+#define WRITEBACK_LATENCY		64
+#define AUTOCOMMIT_BLOCKS_SSD		65536
+#define AUTOCOMMIT_BLOCKS_PMEM		64
+#define AUTOCOMMIT_MSEC			1000
+
+/*
+ * If the architecture doesn't support persistent memory, we can use this driver
+ * in SSD-only mode.
+ */
+#ifndef CONFIG_ARCH_HAS_PMEM_API
+#define DM_WRITECACHE_ONLY_SSD
+#endif
+
+//#define WC_MEASURE_LATENCY
+
+#define BITMAP_GRANULARITY	65536
+#if BITMAP_GRANULARITY < PAGE_SIZE
+#undef BITMAP_GRANULARITY
+#define BITMAP_GRANULARITY	PAGE_SIZE
+#endif
+
+#ifndef bio_set_dev
+#define	bio_set_dev(bio, dev)	((bio)->bi_bdev = (dev))
+#endif
+
+/*
+ * The clflushopt instruction is very slow on Broadwell, so we use non-temporal
+ * stores instead.
+ */
+#ifdef CONFIG_X86_64
+#define NT_STORE(dest, src)	asm ("movnti %1, %0" : "=m"(dest) : "r"(src))
+#define FLUSH_RANGE(dax, ptr, size)  do { } while (0)
+#define COMMIT_FLUSHED()	wmb()
+#else
+#define NT_STORE(dest, src)	ACCESS_ONCE(dest) = (src)
+#define FLUSH_RANGE		dax_flush
+#define COMMIT_FLUSHED()	do { } while (0)
+#endif
+
+#if defined(__HAVE_ARCH_MEMCPY_MCSAFE) && !defined(DM_WRITECACHE_ONLY_SSD)
+#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+#endif
+
+#define MEMORY_SUPERBLOCK_MAGIC		0x23489321
+#define MEMORY_SUPERBLOCK_VERSION	1
+
+struct wc_memory_entry {
+	uint64_t original_sector;
+	uint64_t seq_count;
+};
+
+struct wc_memory_superblock {
+	union {
+		struct {
+			uint32_t magic;
+			uint32_t version;
+			uint32_t block_size;
+			uint32_t pad;
+			uint64_t n_blocks;
+			uint64_t seq_count;
+		};
+		uint64_t padding[8];
+	};
+	struct wc_memory_entry entries[0];
+};
+
+struct wc_entry {
+	struct rb_node rb_node;
+	struct list_head lru;
+	unsigned short wc_list_contiguous;
+	bool write_in_progress
+#if BITS_PER_LONG == 64
+		:1
+#endif
+	;
+	unsigned long index
+#if BITS_PER_LONG == 64
+		:47
+#endif
+	;
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	uint64_t original_sector;
+	uint64_t seq_count;
+#endif
+};
+
+#ifndef DM_WRITECACHE_ONLY_SSD
+#define WC_MODE_PMEM(wc)			((wc)->pmem_mode)
+#else
+#define WC_MODE_PMEM(wc)			false
+#endif
+
+struct dm_writecache {
+#ifndef DM_WRITECACHE_ONLY_SSD
+	bool pmem_mode;
+#endif
+	struct mutex lock;
+	struct rb_root tree;
+	struct list_head lru;
+	struct list_head freelist;
+	size_t freelist_size;
+	size_t writeback_size;
+	unsigned uncommitted_blocks;
+	unsigned autocommit_blocks;
+	unsigned max_writeback_jobs;
+	size_t freelist_high_watermark;
+	size_t freelist_low_watermark;
+	struct timer_list autocommit_timer;
+	unsigned long autocommit_jiffies;
+	struct swait_queue_head freelist_wait;
+
+	struct dm_target *ti;
+	struct dm_dev *dev;
+	struct dm_dev *ssd_dev;
+	void *memory_map;
+	uint64_t memory_map_size;
+	size_t metadata_sectors;
+	void *block_start;
+	struct wc_entry *entries;
+	unsigned block_size;
+	unsigned char block_size_bits;
+	size_t n_blocks;
+	uint64_t seq_count;
+	int error;
+
+	bool overwrote_committed;
+	bool memory_vmapped;
+
+	atomic_t bio_in_progress[2];
+	struct swait_queue_head bio_in_progress_wait[2];
+
+	struct dm_io_client *dm_io;
+
+	unsigned writeback_all;
+	struct workqueue_struct *writeback_wq;
+	struct work_struct writeback_work;
+	struct work_struct flush_work;
+
+	struct swait_queue_head endio_thread_wait;
+	struct list_head endio_list;
+	struct task_struct *endio_thread;
+
+	struct task_struct *flush_thread;
+	struct bio *flush_bio;
+	struct completion flush_completion;
+
+	struct bio_set *bio_set;
+	mempool_t *copy_pool;
+	mempool_t *page_pool;
+
+	struct dm_kcopyd_client *dm_kcopyd;
+	unsigned long *dirty_bitmap;
+	unsigned dirty_bitmap_size;
+
+	bool high_wm_percent_set;
+	bool low_wm_percent_set;
+	bool max_writeback_jobs_set;
+	bool autocommit_blocks_set;
+	bool autocommit_time_set;
+
+#ifdef WC_MEASURE_LATENCY
+	ktime_t lock_acquired_time;
+	ktime_t max_lock_held;
+	ktime_t max_lock_wait;
+	ktime_t max_freelist_wait;
+	ktime_t measure_latency_time;
+	ktime_t max_measure_latency;
+#endif
+};
+
+#define WB_LIST_INLINE		16
+
+struct writeback_struct {
+	struct list_head endio_entry;
+	struct dm_writecache *wc;
+	struct wc_entry **wc_list;
+	unsigned wc_list_n;
+	unsigned page_offset;
+	struct page *page;
+	struct wc_entry *wc_list_inline[WB_LIST_INLINE];
+	struct bio bio;
+};
+
+struct copy_struct {
+	struct list_head endio_entry;
+	struct dm_writecache *wc;
+	struct wc_entry *e;
+	unsigned n_entries;
+	int error;
+};
+
+DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
+					    "A percentage of time allocated for data copying");
+
+static inline void measure_latency_start(struct dm_writecache *wc)
+{
+#ifdef WC_MEASURE_LATENCY
+	wc->measure_latency_time = ktime_get();
+#endif
+}
+
+static inline void measure_latency_end(struct dm_writecache *wc, unsigned long n)
+{
+#ifdef WC_MEASURE_LATENCY
+	ktime_t now = ktime_get();
+	if (now - wc->measure_latency_time > wc->max_measure_latency) {
+		wc->max_measure_latency = now - wc->measure_latency_time;
+		printk(KERN_DEBUG "dm-writecache: measured latency %lld.%03lldus, %lu steps\n", wc->max_measure_latency / 1000, wc->max_measure_latency % 1000, n);
+	}
+#endif
+}
+
+static void __wc_lock(struct dm_writecache *wc, int line)
+{
+#ifdef WC_MEASURE_LATENCY
+	ktime_t before, after;
+	before = ktime_get();
+#endif
+	mutex_lock(&wc->lock);
+#ifdef WC_MEASURE_LATENCY
+	after = ktime_get();
+	if (unlikely(after - before > wc->max_lock_wait)) {
+		wc->max_lock_wait = after - before;
+		printk(KERN_DEBUG "dm-writecache: waiting for lock for %lld.%03lldus at %d\n", wc->max_lock_wait / 1000, wc->max_lock_wait % 1000, line);
+		after = ktime_get();
+	}
+	wc->lock_acquired_time = after;
+#endif
+}
+#define wc_lock(wc)	__wc_lock(wc, __LINE__)
+
+static void __wc_unlock(struct dm_writecache *wc, int line)
+{
+#ifdef WC_MEASURE_LATENCY
+	ktime_t now = ktime_get();
+	if (now - wc->lock_acquired_time > wc->max_lock_held) {
+		wc->max_lock_held = now - wc->lock_acquired_time;
+		printk(KERN_DEBUG "dm-writecache: lock held for %lld.%03lldus at %d\n", wc->max_lock_held / 1000, wc->max_lock_held % 1000, line);
+	}
+#endif
+	mutex_unlock(&wc->lock);
+}
+#define wc_unlock(wc)	__wc_unlock(wc, __LINE__)
+
+#define wc_unlock_long(wc)	mutex_unlock(&wc->lock)
+
+static int persistent_memory_claim(struct dm_writecache *wc)
+{
+	int r;
+	loff_t s;
+	long p, da;
+	pfn_t pfn;
+	int id;
+	struct page **pages;
+
+	wc->memory_vmapped = false;
+
+	if (!wc->ssd_dev->dax_dev) {
+		r = -EOPNOTSUPP;
+		goto err1;
+	}
+	s = wc->memory_map_size;
+	p = s >> PAGE_SHIFT;
+	if (!p) {
+		r = -EINVAL;
+		goto err1;
+	}
+	if (p != s >> PAGE_SHIFT) {
+		r = -EOVERFLOW;
+		goto err1;
+	}
+
+	id = dax_read_lock();
+
+	da = dax_direct_access(wc->ssd_dev->dax_dev, 0, p, &wc->memory_map, &pfn);
+	if (da < 0) {
+		wc->memory_map = NULL;
+		r = da;
+		goto err2;
+	}
+	if (!pfn_t_has_page(pfn)) {
+		wc->memory_map = NULL;
+		r = -EOPNOTSUPP;
+		goto err2;
+	}
+#ifdef WC_MEASURE_LATENCY
+	printk(KERN_DEBUG "dm-writecache: device %s, pfn %016llx\n", wc->ssd_dev->name, pfn.val);
+#endif
+	if (da != p) {
+		long i;
+		wc->memory_map = NULL;
+		pages = kvmalloc(p * sizeof(struct page *), GFP_KERNEL);
+		if (!pages) {
+			r = -ENOMEM;
+			goto err2;
+		}
+		i = 0;
+		do {
+			long daa;
+			void *dummy_addr;
+			daa = dax_direct_access(wc->ssd_dev->dax_dev, i, p - i, &dummy_addr, &pfn);
+			if (daa <= 0) {
+				r = daa ? daa : -EINVAL;
+				goto err3;
+			}
+			if (!pfn_t_has_page(pfn)) {
+				r = -EOPNOTSUPP;
+				goto err3;
+			}
+			while (daa-- && i < p) {
+				pages[i++] = pfn_t_to_page(pfn);
+				pfn.val++;
+			}
+		} while (i < p);
+		wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
+		if (!wc->memory_map) {
+			r = -ENOMEM;
+			goto err3;
+		}
+		kvfree(pages);
+		wc->memory_vmapped = true;
+	}
+
+	dax_read_unlock(id);
+
+	return 0;
+
+err3:
+	kvfree(pages);
+err2:
+	dax_read_unlock(id);
+err1:
+	return r;
+}
+
+static void persistent_memory_release(struct dm_writecache *wc)
+{
+	if (wc->memory_vmapped)
+		vunmap(wc->memory_map);
+}
+
+static struct page *persistent_memory_page(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	else
+		return virt_to_page(addr);
+}
+
+static unsigned persistent_memory_page_offset(void *addr)
+{
+	return (unsigned long)addr & (PAGE_SIZE - 1);
+}
+
+static void persistent_memory_flush_cache(void *ptr, size_t size)
+{
+	if (is_vmalloc_addr(ptr))
+		flush_kernel_vmap_range(ptr, size);
+}
+
+static void persistent_memory_invalidate_cache(void *ptr, size_t size)
+{
+	if (is_vmalloc_addr(ptr))
+		invalidate_kernel_vmap_range(ptr, size);
+}
+
+static void persistent_memory_flush(struct dm_writecache *wc, void *ptr, size_t size)
+{
+	FLUSH_RANGE(wc->ssd_dev->dax_dev, ptr, size);
+}
+
+static void persistent_memory_commit_flushed(void)
+{
+	COMMIT_FLUSHED();
+}
+
+static struct wc_memory_superblock *sb(struct dm_writecache *wc)
+{
+	return wc->memory_map;
+}
+
+static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
+{
+	if (is_power_of_2(sizeof(struct wc_entry)) && 0)
+		return &sb(wc)->entries[e - wc->entries];
+	else
+		return &sb(wc)->entries[e->index];
+}
+
+static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
+{
+	return (char *)wc->block_start + (e->index << wc->block_size_bits);
+}
+
+static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
+{
+	return wc->metadata_sectors +
+		((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
+}
+
+static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
+{
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	return e->original_sector;
+#else
+	return le64_to_cpu(memory_entry(wc, e)->original_sector);
+#endif
+}
+
+static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
+{
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	return e->seq_count;
+#else
+	return le64_to_cpu(memory_entry(wc, e)->seq_count);
+#endif
+}
+
+static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
+{
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	e->seq_count = -1;
+#endif
+	NT_STORE(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
+}
+
+static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
+					    uint64_t original_sector, uint64_t seq_count)
+{
+	struct wc_memory_entry *wme;
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	e->original_sector = original_sector;
+	e->seq_count = seq_count;
+#endif
+	wme = memory_entry(wc, e);
+	NT_STORE(wme->original_sector, cpu_to_le64(original_sector));
+	NT_STORE(wme->seq_count, cpu_to_le64(seq_count));
+}
+
+#define writecache_error(wc, err, msg, arg...)				\
+do {									\
+	if (!cmpxchg(&(wc)->error, 0, err))				\
+		DMERR(msg, ##arg);					\
+	swake_up(&(wc)->freelist_wait);					\
+} while (0)
+
+#define writecache_has_error(wc)	(unlikely(ACCESS_ONCE((wc)->error)))
+
+static void writecache_flush_all_metadata(struct dm_writecache *wc)
+{
+	if (WC_MODE_PMEM(wc)) {
+		persistent_memory_flush(wc,
+			sb(wc), offsetof(struct wc_memory_superblock, entries[wc->n_blocks]));
+	} else {
+		memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
+	}
+}
+
+static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
+{
+	if (WC_MODE_PMEM(wc))
+		persistent_memory_flush(wc, ptr, size);
+	else
+		__set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
+			  wc->dirty_bitmap);
+}
+
+static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
+
+struct io_notify {
+	struct dm_writecache *wc;
+	struct completion c;
+	atomic_t count;
+};
+
+void writecache_notify_io(unsigned long error, void *context)
+{
+	struct io_notify *endio = context;
+
+	if (unlikely(error != 0))
+		writecache_error(endio->wc, -EIO, "error writing metadata");
+	BUG_ON(atomic_read(&endio->count) <= 0);
+	if (atomic_dec_and_test(&endio->count))
+		complete(&endio->c);
+}
+
+static void ssd_commit_flushed(struct dm_writecache *wc)
+{
+	int r;
+	struct dm_io_region region;
+	struct dm_io_request req;
+	struct io_notify endio = {
+		wc,
+		COMPLETION_INITIALIZER_ONSTACK(endio.c),
+		ATOMIC_INIT(1),
+	};
+	unsigned bitmap_bits = wc->dirty_bitmap_size * BITS_PER_LONG;
+	unsigned i = 0;
+
+	while (1) {
+		unsigned j;
+		i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
+		if (unlikely(i == bitmap_bits))
+			break;
+		j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
+
+		region.bdev = wc->ssd_dev->bdev;
+		region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
+		region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
+
+		if (unlikely(region.sector >= wc->metadata_sectors))
+			break;
+		if (unlikely(region.sector + region.count > wc->metadata_sectors))
+			region.count = wc->metadata_sectors - region.sector;
+
+		atomic_inc(&endio.count);
+		req.bi_op = REQ_OP_WRITE;
+		req.bi_op_flags = REQ_SYNC;
+		req.mem.type = DM_IO_VMA;
+		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
+		req.client = wc->dm_io;
+		req.notify.fn = writecache_notify_io;
+		req.notify.context = &endio;
+
+		r = dm_io(&req, 1, &region, NULL);
+		if (unlikely(r)) {
+			/*
+			 * Async dm-io (implied by notify.fn above) won't return an error, but
+			 * if that changes in the future we must catch it: so panic in defense.
+			 */
+			panic(DM_NAME ": " DM_MSG_PREFIX ": dm io error %d", r);
+		}
+		i = j;
+	}
+
+	writecache_notify_io(0, &endio);
+	wait_for_completion_io(&endio.c);
+
+	writecache_disk_flush(wc, wc->ssd_dev);
+
+	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
+}
+
+static void writecache_commit_flushed(struct dm_writecache *wc)
+{
+	if (WC_MODE_PMEM(wc))
+		persistent_memory_commit_flushed();
+	else
+		ssd_commit_flushed(wc);
+}
+
+static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
+{
+	int r;
+	struct dm_io_region region;
+	struct dm_io_request req;
+
+	region.bdev = dev->bdev;
+	region.sector = 0;
+	region.count = 0;
+	req.bi_op = REQ_OP_WRITE;
+	req.bi_op_flags = REQ_PREFLUSH | REQ_SYNC;
+	req.mem.type = DM_IO_KMEM;
+	req.mem.ptr.addr = NULL;
+	req.client = wc->dm_io;
+	req.notify.fn = NULL;
+
+	r = dm_io(&req, 1, &region, NULL);
+	if (unlikely(r))
+		writecache_error(wc, r, "error flushing metadata: %d", r);
+}
+
+static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
+{
+	swait_event(wc->bio_in_progress_wait[direction],
+		   !atomic_read(&wc->bio_in_progress[direction]));
+}
+
+#define WFE_RETURN_FOLLOWING	1
+#define WFE_LOWEST_SEQ		2
+
+static struct wc_entry *writecache_find_entry(struct dm_writecache *wc, uint64_t block, int flags)
+{
+	struct wc_entry *e;
+	struct rb_node *node = wc->tree.rb_node;
+
+	if (unlikely(!node))
+		return NULL;
+
+	while (1) {
+		e = container_of(node, struct wc_entry, rb_node);
+		if (read_original_sector(wc, e) == block)
+			break;
+		node = (read_original_sector(wc, e) >= block ?
+			e->rb_node.rb_left : e->rb_node.rb_right);
+		//prefetch(node);
+		if (unlikely(!node)) {
+			if (!(flags & WFE_RETURN_FOLLOWING)) {
+				return NULL;
+			}
+			if (read_original_sector(wc, e) >= block) {
+				break;
+			} else {
+				node = rb_next(&e->rb_node);
+				if (unlikely(!node)) {
+					return NULL;
+				}
+				e = container_of(node, struct wc_entry, rb_node);
+				break;
+			}
+		}
+	}
+
+	while (1) {
+		struct wc_entry *e2;
+		if (flags & WFE_LOWEST_SEQ)
+			node = rb_prev(&e->rb_node);
+		else
+			node = rb_next(&e->rb_node);
+		if (!node)
+			return e;
+		e2 = container_of(node, struct wc_entry, rb_node);
+		if (read_original_sector(wc, e2) != block)
+			return e;
+		e = e2;
+	}
+}
+
+static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
+{
+	struct wc_entry *e;
+	struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
+
+	while (*node) {
+		e = container_of(*node, struct wc_entry, rb_node);
+		parent = &e->rb_node;
+		node = read_original_sector(wc, e) > read_original_sector(wc, ins) ?
+			&parent->rb_left : &parent->rb_right;
+	}
+	rb_link_node(&ins->rb_node, parent, node);
+	rb_insert_color(&ins->rb_node, &wc->tree);
+	list_add(&ins->lru, &wc->lru);
+}
+
+static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
+{
+	list_del(&e->lru);
+	rb_erase(&e->rb_node, &wc->tree);
+}
+
+static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
+{
+	list_add_tail(&e->lru, &wc->freelist);
+	wc->freelist_size++;
+}
+
+struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+{
+	struct wc_entry *e;
+
+	if (unlikely(list_empty(&wc->freelist)))
+		return NULL;
+	e = container_of(wc->freelist.next, struct wc_entry, lru);
+	list_del(&e->lru);
+	wc->freelist_size--;
+	if (unlikely(wc->freelist_size <= wc->freelist_high_watermark))
+		queue_work(wc->writeback_wq, &wc->writeback_work);
+
+	return e;
+}
+
+static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
+{
+	writecache_unlink(wc, e);
+	writecache_add_to_freelist(wc, e);
+	clear_seq_count(wc, e);
+	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+	if (unlikely(swait_active(&wc->freelist_wait)))
+		swake_up(&wc->freelist_wait);
+}
+
+static void __writecache_wait_on_freelist(struct dm_writecache *wc, bool measure, int line)
+{
+	DECLARE_SWAITQUEUE(wait);
+#ifdef WC_MEASURE_LATENCY
+	ktime_t before, after;
+#endif
+
+	prepare_to_swait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
+	wc_unlock(wc);
+#ifdef WC_MEASURE_LATENCY
+	if (measure)
+		before = ktime_get();
+#endif
+	io_schedule();
+	finish_swait(&wc->freelist_wait, &wait);
+#ifdef WC_MEASURE_LATENCY
+	if (measure) {
+		after = ktime_get();
+		if (unlikely(after - before > wc->max_freelist_wait)) {
+			wc->max_freelist_wait = after - before;
+			printk(KERN_DEBUG "dm-writecache: waiting on freelist for %lld.%03lldus at %d\n", wc->max_freelist_wait / 1000, wc->max_freelist_wait % 1000, line);
+		}
+	}
+#endif
+	wc_lock(wc);
+}
+#define writecache_wait_on_freelist(wc)		__writecache_wait_on_freelist(wc, true, __LINE__)
+#define writecache_wait_on_freelist_long(wc)	__writecache_wait_on_freelist(wc, false, __LINE__)
+
+static void writecache_poison_lists(struct dm_writecache *wc)
+{
+	/*
+	 * Catch incorrect access to these values while the device is suspended.
+	 */
+	memset(&wc->tree, -1, sizeof wc->tree);
+	wc->lru.next = LIST_POISON1;
+	wc->lru.prev = LIST_POISON2;
+	wc->freelist.next = LIST_POISON1;
+	wc->freelist.prev = LIST_POISON2;
+}
+
+static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
+{
+	writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+	if (WC_MODE_PMEM(wc))
+		writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
+}
+
+static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
+{
+	return read_seq_count(wc, e) < wc->seq_count;
+}
+
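+/*
+ * Commit the cache: flush the metadata (and data in pmem mode) of all
+ * uncommitted entries, wait for in-flight writes, persist the incremented
+ * sequence count and then free committed entries that are superseded by a
+ * newer copy of the same block.
+ */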
+static void writecache_flush(struct dm_writecache *wc)
+{
+	struct wc_entry *e, *e2;
+	bool need_flush_after_free;
+
+	wc->uncommitted_blocks = 0;
+	del_timer(&wc->autocommit_timer);
+
+	if (list_empty(&wc->lru))
+		return;
+
+	e = container_of(wc->lru.next, struct wc_entry, lru);
+	if (writecache_entry_is_committed(wc, e)) {
+		if (wc->overwrote_committed) {
+			writecache_wait_for_ios(wc, WRITE);
+			writecache_disk_flush(wc, wc->ssd_dev);
+			wc->overwrote_committed = false;
+		}
+		return;
+	}
+	while (1) {
+		writecache_flush_entry(wc, e);
+		if (unlikely(e->lru.next == &wc->lru))
+			break;
+		e2 = container_of(e->lru.next, struct wc_entry, lru);
+		if (writecache_entry_is_committed(wc, e2))
+			break;
+		e = e2;
+		cond_resched();
+	}
+	writecache_commit_flushed(wc);
+
+	writecache_wait_for_ios(wc, WRITE);
+
+	wc->seq_count++;
+	NT_STORE(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
+	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
+	writecache_commit_flushed(wc);
+
+	wc->overwrote_committed = false;
+
+	need_flush_after_free = false;
+	while (1) {
+		/* Free another committed entry with lower seq-count */
+		struct rb_node *rb_node = rb_prev(&e->rb_node);
+
+		if (rb_node) {
+			e2 = container_of(rb_node, struct wc_entry, rb_node);
+			if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
+			    likely(!e2->write_in_progress)) {
+				writecache_free_entry(wc, e2);
+				need_flush_after_free = true;
+			}
+		}
+		if (unlikely(e->lru.prev == &wc->lru))
+			break;
+		e = container_of(e->lru.prev, struct wc_entry, lru);
+		cond_resched();
+	}
+
+	if (need_flush_after_free)
+		writecache_commit_flushed(wc);
+}
+
+static void writecache_flush_work(struct work_struct *work)
+{
+	struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
+	wc_lock(wc);
+	writecache_flush(wc);
+	wc_unlock(wc);
+}
+
+static void writecache_autocommit_timer(unsigned long data)
+{
+	struct dm_writecache *wc = (struct dm_writecache *)data;
+	if (!writecache_has_error(wc))
+		queue_work(wc->writeback_wq, &wc->flush_work);
+}
+
+static void writecache_schedule_autocommit(struct dm_writecache *wc)
+{
+	if (!timer_pending(&wc->autocommit_timer))
+		mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
+}
+
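+/*
+ * Free all cache entries whose original sector falls into [start, end);
+ * entries with writeback in progress are skipped.
+ */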
+static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
+{
+	struct wc_entry *e;
+	bool discarded_something = false;
+
+	e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
+	if (unlikely(!e))
+		return;
+
+	while (read_original_sector(wc, e) < end) {
+		struct rb_node *node = rb_next(&e->rb_node);
+
+		if (likely(!e->write_in_progress)) {
+			if (!discarded_something) {
+				writecache_wait_for_ios(wc, READ);
+				writecache_wait_for_ios(wc, WRITE);
+				discarded_something = true;
+			}
+			writecache_free_entry(wc, e);
+		}
+
+		if (!node)
+			break;
+
+		e = container_of(node, struct wc_entry, rb_node);
+	}
+
+	if (discarded_something)
+		writecache_commit_flushed(wc);
+}
+
+static bool writecache_wait_for_writeback(struct dm_writecache *wc)
+{
+	if (wc->writeback_size) {
+		writecache_wait_on_freelist(wc);
+		return true;
+	}
+	return false;
+}
+
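+/*
+ * Postsuspend: commit the cache, wait until all writeback has finished and
+ * poison the in-memory lists so that stray accesses while suspended are
+ * caught.
+ */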
+static void writecache_suspend(struct dm_target *ti)
+{
+	struct dm_writecache *wc = ti->private;
+
+	del_timer_sync(&wc->autocommit_timer);
+
+	flush_workqueue(wc->writeback_wq);
+
+	wc_lock(wc);
+	writecache_flush(wc);
+	while (writecache_wait_for_writeback(wc));
+	while (wc->writeback_all) {
+		wc_unlock_long(wc);
+		msleep(1);
+		wc_lock(wc);
+	}
+
+	if (WC_MODE_PMEM(wc))
+		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
+
+	writecache_poison_lists(wc);
+
+	wc_unlock_long(wc);
+}
+
+static int writecache_alloc_entries(struct dm_writecache *wc)
+{
+	size_t b;
+	if (wc->entries)
+		return 0;
+	wc->entries = vmalloc(sizeof(struct wc_entry) * wc->n_blocks);
+	if (!wc->entries)
+		return -ENOMEM;
+	for (b = 0; b < wc->n_blocks; b++) {
+		struct wc_entry *e = &wc->entries[b];
+		e->index = b;
+		e->write_in_progress = false;
+	}
+	return 0;
+}
+
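+/*
+ * Rebuild the in-memory state (rb-tree, LRU and freelist) from the on-media
+ * metadata. Uncommitted entries go to the freelist; when two committed
+ * entries refer to the same original sector, only the one with the higher
+ * sequence count is kept.
+ */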
+static void writecache_resume(struct dm_target *ti)
+{
+	struct dm_writecache *wc = ti->private;
+	size_t b;
+	bool need_flush = false;
+	uint64_t sb_seq_count;
+	int r;
+
+	wc_lock(wc);
+
+	if (WC_MODE_PMEM(wc))
+		persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
+
+	wc->tree = RB_ROOT;
+	INIT_LIST_HEAD(&wc->lru);
+	INIT_LIST_HEAD(&wc->freelist);
+	wc->freelist_size = 0;
+
+	r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
+	if (r) {
+		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
+		sb_seq_count = cpu_to_le64(0);
+	}
+	wc->seq_count = le64_to_cpu(sb_seq_count);
+
+#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
+	for (b = 0; b < wc->n_blocks; b++) {
+		struct wc_entry *e = &wc->entries[b];
+		struct wc_memory_entry wme;
+		if (writecache_has_error(wc)) {
+			e->original_sector = -1;
+			e->seq_count = -1;
+			continue;
+		}
+		r = memcpy_mcsafe(&wme, memory_entry(wc, e), sizeof(struct wc_memory_entry));
+		if (r) {
+			writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d", (unsigned long)b, r);
+			e->original_sector = -1;
+			e->seq_count = -1;
+		} else {
+			e->original_sector = le64_to_cpu(wme.original_sector);
+			e->seq_count = le64_to_cpu(wme.seq_count);
+		}
+	}
+#endif
+	for (b = 0; b < wc->n_blocks; b++) {
+		struct wc_entry *e = &wc->entries[b];
+		if (!writecache_entry_is_committed(wc, e)) {
+			if (read_seq_count(wc, e) != -1) {
+erase_this:
+				clear_seq_count(wc, e);
+				need_flush = true;
+			}
+			writecache_add_to_freelist(wc, e);
+		} else {
+			struct wc_entry *old;
+
+			old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
+			if (!old) {
+				writecache_insert_entry(wc, e);
+			} else {
+				if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
+					writecache_error(wc, -EINVAL, "two identical entries, position %llu, sector %llu, sequence %llu",
+						 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
+						 (unsigned long long)read_seq_count(wc, e));
+				}
+				if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
+					goto erase_this;
+				} else {
+					writecache_free_entry(wc, old);
+					writecache_insert_entry(wc, e);
+					need_flush = true;
+				}
+			}
+		}
+		cond_resched();
+	}
+
+	if (need_flush) {
+		writecache_flush_all_metadata(wc);
+		writecache_commit_flushed(wc);
+	}
+
+	wc_unlock_long(wc);
+}
+
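+/*
+ * The "flush" message commits the cache and triggers writeback of all
+ * entries, returning only after the writeback workqueue has been flushed.
+ */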
+static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
+{
+	uint64_t seq;
+
+	if (argc != 1)
+		return -EINVAL;
+
+	wc_lock(wc);
+	if (dm_suspended(wc->ti)) {
+		wc_unlock(wc);
+		return -EBUSY;
+	}
+	if (writecache_has_error(wc)) {
+		wc_unlock(wc);
+		return -EIO;
+	}
+
+	writecache_flush(wc);
+	seq = wc->seq_count;
+	wc->writeback_all++;
+	wc_unlock(wc);
+
+	queue_work(wc->writeback_wq, &wc->writeback_work);
+	flush_workqueue(wc->writeback_wq);
+
+	wc_lock(wc);
+	wc->writeback_all--;
+	if (dm_suspended(wc->ti)) {
+		wc_unlock(wc);
+		return -EBUSY;
+	}
+	if (writecache_has_error(wc)) {
+		wc_unlock(wc);
+		return -EIO;
+	}
+	wc_unlock(wc);
+
+	return 0;
+}
+
+static int writecache_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r = -EINVAL;
+	struct dm_writecache *wc = ti->private;
+
+	if (!strcasecmp(argv[0], "flush"))
+		r = process_flush_mesg(argc, argv, wc);
+	else
+		DMWARN("unrecognised message received: %s", argv[0]);
+
+	return r;
+}
+
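+/*
+ * Copy one cache block between the bio and persistent memory. Reads use
+ * memcpy_mcsafe so that hardware memory errors are reported as I/O errors;
+ * writes use memcpy_flushcache.
+ */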
+static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
+{
+	void *buf;
+	unsigned long flags;
+	unsigned size;
+	int rw = bio_data_dir(bio);
+	unsigned remaining_size = wc->block_size;
+
+	do {
+		struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
+		buf = bvec_kmap_irq(&bv, &flags);
+		size = bv.bv_len;
+		if (unlikely(size > remaining_size))
+			size = remaining_size;
+
+		if (rw == READ) {
+			int r;
+			r = memcpy_mcsafe(buf, data, size);
+			flush_dcache_page(bio_page(bio));
+			if (unlikely(r)) {
+				writecache_error(wc, r, "hardware memory error when reading data: %d", r);
+				bio->bi_status = BLK_STS_IOERR;
+			}
+		} else {
+			flush_dcache_page(bio_page(bio));
+			memcpy_flushcache(data, buf, size);
+		}
+
+		bvec_kunmap_irq(buf, &flags);
+
+		data = (char *)data + size;
+		remaining_size -= size;
+		bio_advance(bio, size);
+	} while (unlikely(remaining_size));
+}
+
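+/*
+ * In SSD mode, flush and discard bios are handed over to this thread by
+ * writecache_offload_bio, which then waits for flush_completion.
+ */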
+static int writecache_flush_thread(void *data)
+{
+	struct dm_writecache *wc = data;
+
+	while (!kthread_should_stop()) {
+		struct bio *bio;
+
+		bio = wc->flush_bio;
+		if (unlikely(!bio)) {
+		} else if (bio_op(bio) == REQ_OP_DISCARD) {
+			writecache_discard(wc, bio->bi_iter.bi_sector,
+					   bio->bi_iter.bi_sector + (bio->bi_iter.bi_size >> SECTOR_SHIFT));
+		} else {
+			writecache_flush(wc);
+		}
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		wc->flush_bio = (void *)0x600 + POISON_POINTER_DELTA;	/* for debugging - catch uninitialized use */
+		complete(&wc->flush_completion);
+
+		schedule();
+	}
+
+	set_current_state(TASK_RUNNING);
+
+	return 0;
+}
+
+static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
+{
+	wc->flush_bio = bio;
+	reinit_completion(&wc->flush_completion);
+	wake_up_process(wc->flush_thread);
+	wait_for_completion_io(&wc->flush_completion);
+}
+
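+/*
+ * Main map function. Flush and discard bios are handled directly (pmem) or
+ * offloaded to the flush thread (ssd). Reads are served from the cache when
+ * the block is present, otherwise remapped to the origin device; writes
+ * always go to the cache, allocating entries from the freelist as needed.
+ */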
+static int writecache_map(struct dm_target *ti, struct bio *bio)
+{
+	struct wc_entry *e;
+	struct dm_writecache *wc = ti->private;
+
+	bio->bi_private = NULL;
+
+	wc_lock(wc);
+
+	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+		if (writecache_has_error(wc))
+			goto unlock_error;
+		if (WC_MODE_PMEM(wc))
+			writecache_flush(wc);
+		else
+			writecache_offload_bio(wc, bio);
+		goto unlock_ok_flush;
+	}
+
+	bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
+
+	if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
+				(wc->block_size / 512 - 1)) != 0)) {
+		DMWARN("I/O is not aligned, sector %llu, size %u, block size %u",
+			(unsigned long long)bio->bi_iter.bi_sector,
+			bio->bi_iter.bi_size, wc->block_size);
+		goto unlock_error;
+	}
+
+	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
+		if (writecache_has_error(wc))
+			goto unlock_error;
+		if (WC_MODE_PMEM(wc))
+			writecache_discard(wc, bio->bi_iter.bi_sector,
+					   bio->bi_iter.bi_sector + (bio->bi_iter.bi_size >> SECTOR_SHIFT));
+		else
+			writecache_offload_bio(wc, bio);
+		goto unlock_remap_origin;
+	}
+
+	if (bio_data_dir(bio) == READ) {
+next_block:
+		e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
+		if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
+			if (WC_MODE_PMEM(wc)) {
+				bio_copy_block(wc, bio, memory_data(wc, e));
+				if (bio->bi_iter.bi_size)
+					goto next_block;
+				goto unlock_ok_read;
+			} else {
+				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+				bio_set_dev(bio, wc->ssd_dev->bdev);
+				bio->bi_iter.bi_sector = cache_sector(wc, e);
+				if (!writecache_entry_is_committed(wc, e))
+					writecache_wait_for_ios(wc, WRITE);
+				goto unlock_remap;
+			}
+		} else {
+			if (e) {
+				sector_t next_boundary =
+					read_original_sector(wc, e) - bio->bi_iter.bi_sector;
+				if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
+					dm_accept_partial_bio(bio, next_boundary);
+				}
+			}
+			goto unlock_remap_origin;
+		}
+	} else {
+		do {
+			if (writecache_has_error(wc))
+				goto unlock_error;
+			e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
+			if (e) {
+				if (!writecache_entry_is_committed(wc, e))
+					goto bio_copy;
+				if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
+					wc->overwrote_committed = true;
+					goto bio_copy;
+				}
+			}
+			e = writecache_pop_from_freelist(wc);
+			if (unlikely(!e)) {
+				writecache_wait_on_freelist(wc);
+				continue;
+			}
+			write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
+			writecache_insert_entry(wc, e);
+			wc->uncommitted_blocks++;
+bio_copy:
+			if (WC_MODE_PMEM(wc)) {
+				bio_copy_block(wc, bio, memory_data(wc, e));
+			} else {
+				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+				bio_set_dev(bio, wc->ssd_dev->bdev);
+				bio->bi_iter.bi_sector = cache_sector(wc, e);
+				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
+					wc->uncommitted_blocks = 0;
+					queue_work(wc->writeback_wq, &wc->flush_work);
+				} else {
+					writecache_schedule_autocommit(wc);
+				}
+				goto unlock_remap;
+			}
+		} while (bio->bi_iter.bi_size);
+
+		if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
+			writecache_flush(wc);
+		} else {
+			writecache_schedule_autocommit(wc);
+		}
+
+		goto unlock_ok_write;
+	}
+
+unlock_remap_origin:
+	bio_set_dev(bio, wc->dev->bdev);
+	wc_unlock(wc);
+	return DM_MAPIO_REMAPPED;
+
+unlock_remap:
+	bio->bi_private = (void *)1;	/* make sure that writecache_end_io decrements bio_in_progress */
+	atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
+	wc_unlock(wc);
+	return DM_MAPIO_REMAPPED;
+
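+	/*
+	 * When WC_MEASURE_LATENCY is not defined, unlock_ok_flush and
+	 * unlock_ok_read fall through to the common code under
+	 * unlock_ok_write.
+	 */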
+unlock_ok_flush:
+#ifdef WC_MEASURE_LATENCY
+	wc_unlock(wc);
+	bio_endio(bio);
+	return DM_MAPIO_SUBMITTED;
+#endif
+
+unlock_ok_read:
+#ifdef WC_MEASURE_LATENCY
+	wc_unlock(wc);
+	bio_endio(bio);
+	return DM_MAPIO_SUBMITTED;
+#endif
+
+unlock_ok_write:
+	wc_unlock(wc);
+	bio_endio(bio);
+	return DM_MAPIO_SUBMITTED;
+
+unlock_error:
+	wc_unlock(wc);
+	bio_io_error(bio);
+	return DM_MAPIO_SUBMITTED;
+}
+
+static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
+{
+	struct dm_writecache *wc = ti->private;
+
+	if (bio->bi_private != NULL) {
+		int dir = bio_data_dir(bio);
+		if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
+			if (unlikely(swait_active(&wc->bio_in_progress_wait[dir])))
+				swake_up(&wc->bio_in_progress_wait[dir]);
+	}
+	return 0;
+}
+
+static int writecache_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
+{
+	struct dm_writecache *wc = ti->private;
+
+	return fn(ti, wc->dev, 0, ti->len, data);
+}
+
+static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct dm_writecache *wc = ti->private;
+
+	if (limits->logical_block_size < wc->block_size)
+		limits->logical_block_size = wc->block_size;
+
+	if (limits->physical_block_size < wc->block_size)
+		limits->physical_block_size = wc->block_size;
+
+	if (limits->io_min < wc->block_size)
+		limits->io_min = wc->block_size;
+}
+
+static void writecache_writeback_endio(struct bio *bio)
+{
+	struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
+	struct dm_writecache *wc = wb->wc;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&wc->endio_thread_wait.lock, flags);
+	list_add_tail(&wb->endio_entry, &wc->endio_list);
+	swake_up_locked(&wc->endio_thread_wait);
+	raw_spin_unlock_irqrestore(&wc->endio_thread_wait.lock, flags);
+}
+
+static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
+{
+	struct copy_struct *c = ptr;
+	struct dm_writecache *wc = c->wc;
+
+	c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
+
+	raw_spin_lock_irq(&wc->endio_thread_wait.lock);
+	list_add_tail(&c->endio_entry, &wc->endio_list);
+	swake_up_locked(&wc->endio_thread_wait);
+	raw_spin_unlock_irq(&wc->endio_thread_wait.lock);
+}
+
+static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
+{
+	unsigned i;
+	struct writeback_struct *wb;
+	struct wc_entry *e;
+	unsigned long n_walked = 0;
+
+	do {
+		wb = list_entry(list->next, struct writeback_struct, endio_entry);
+		list_del(&wb->endio_entry);
+
+		if (unlikely(wb->bio.bi_status != BLK_STS_OK))
+			writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
+					"write error %d", wb->bio.bi_status);
+		i = 0;
+		do {
+			e = wb->wc_list[i];
+			BUG_ON(!e->write_in_progress);
+			e->write_in_progress = false;
+			INIT_LIST_HEAD(&e->lru);
+			if (!writecache_has_error(wc))
+				writecache_free_entry(wc, e);
+			BUG_ON(!wc->writeback_size);
+			wc->writeback_size--;
+			n_walked++;
+			if (unlikely(n_walked >= ENDIO_LATENCY)) {
+				writecache_commit_flushed(wc);
+				wc_unlock(wc);
+				wc_lock(wc);
+				n_walked = 0;
+			}
+		} while (++i < wb->wc_list_n);
+
+		if (wb->wc_list != wb->wc_list_inline)
+			kfree(wb->wc_list);
+		bio_put(&wb->bio);
+	} while (!list_empty(list));
+}
+
+static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
+{
+	struct copy_struct *c;
+	struct wc_entry *e;
+
+	do {
+		c = list_entry(list->next, struct copy_struct, endio_entry);
+		list_del(&c->endio_entry);
+
+		if (unlikely(c->error))
+			writecache_error(wc, c->error, "copy error");
+
+		e = c->e;
+		do {
+			BUG_ON(!e->write_in_progress);
+			e->write_in_progress = false;
+			INIT_LIST_HEAD(&e->lru);
+			if (!writecache_has_error(wc))
+				writecache_free_entry(wc, e);
+
+			BUG_ON(!wc->writeback_size);
+			wc->writeback_size--;
+			e++;
+		} while (--c->n_entries);
+		mempool_free(c, wc->copy_pool);
+	} while (!list_empty(list));
+}
+
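+/*
+ * Collect completed writeback requests from endio_list, flush the origin
+ * device and then free the corresponding cache entries under wc_lock.
+ */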
+static int writecache_endio_thread(void *data)
+{
+	struct dm_writecache *wc = data;
+
+	while (1) {
+		DECLARE_SWAITQUEUE(wait);
+		struct list_head list;
+
+		raw_spin_lock_irq(&wc->endio_thread_wait.lock);
+continue_locked:
+		if (!list_empty(&wc->endio_list))
+			goto pop_from_list;
+		set_current_state(TASK_INTERRUPTIBLE);
+		__prepare_to_swait(&wc->endio_thread_wait, &wait);
+		raw_spin_unlock_irq(&wc->endio_thread_wait.lock);
+
+		if (unlikely(kthread_should_stop())) {
+			finish_swait(&wc->endio_thread_wait, &wait);
+			break;
+		}
+
+		schedule();
+
+		raw_spin_lock_irq(&wc->endio_thread_wait.lock);
+		__finish_swait(&wc->endio_thread_wait, &wait);
+		goto continue_locked;
+
+pop_from_list:
+		list = wc->endio_list;
+		list.next->prev = list.prev->next = &list;
+		INIT_LIST_HEAD(&wc->endio_list);
+		raw_spin_unlock_irq(&wc->endio_thread_wait.lock);
+
+		writecache_disk_flush(wc, wc->dev);
+
+		wc_lock(wc);
+
+		if (WC_MODE_PMEM(wc)) {
+			__writecache_endio_pmem(wc, &list);
+		} else {
+			__writecache_endio_ssd(wc, &list);
+			writecache_wait_for_ios(wc, READ);
+		}
+
+		writecache_commit_flushed(wc);
+
+		wc_unlock(wc);
+	}
+
+	return 0;
+}
+
+static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e, gfp_t gfp)
+{
+	struct dm_writecache *wc = wb->wc;
+	unsigned block_size = wc->block_size;
+	void *address = memory_data(wc, e);
+
+	persistent_memory_flush_cache(address, block_size);
+	return bio_add_page(&wb->bio, persistent_memory_page(address),
+			    block_size, persistent_memory_page_offset(address)) != 0;
+}
+
+struct writeback_list {
+	struct list_head list;
+	size_t size;
+};
+
+static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
+{
+	if (ACCESS_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
+		wc_lock(wc);
+		while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs) {
+			writecache_wait_on_freelist_long(wc);
+		}
+		wc_unlock(wc);
+	}
+	cond_resched();
+}
+
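+/*
+ * Pmem writeback: build a bio for each run of cache blocks that are
+ * contiguous on the origin device and submit it.
+ */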
+static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
+{
+	struct wc_entry *e, *f;
+	struct bio *bio;
+	struct writeback_struct *wb;
+	unsigned max_pages;
+
+	while (wbl->size) {
+		wbl->size--;
+		e = container_of(wbl->list.prev, struct wc_entry, lru);
+		list_del(&e->lru);
+
+		max_pages = e->wc_list_contiguous;
+
+		bio = bio_alloc_bioset(GFP_NOIO, max_pages, wc->bio_set);
+		wb = container_of(bio, struct writeback_struct, bio);
+		wb->wc = wc;
+		wb->bio.bi_end_io = writecache_writeback_endio;
+		bio_set_dev(&wb->bio, wc->dev->bdev);
+		wb->bio.bi_iter.bi_sector = read_original_sector(wc, e);
+		wb->page_offset = PAGE_SIZE;
+		if (max_pages > WB_LIST_INLINE) {
+			wb->wc_list = kmalloc(max_pages * sizeof(struct wc_entry *),
+					      GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
+			if (unlikely(!wb->wc_list))
+				goto use_inline_list;
+		} else {
+use_inline_list:
+			wb->wc_list = wb->wc_list_inline;
+			max_pages = WB_LIST_INLINE;
+		}
+
+		BUG_ON(!wc_add_block(wb, e, GFP_NOIO));
+
+		wb->wc_list[0] = e;
+		wb->wc_list_n = 1;
+
+		while (wbl->size && wb->wc_list_n < max_pages) {
+			f = container_of(wbl->list.prev, struct wc_entry, lru);
+			if (read_original_sector(wc, f) !=
+			    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
+				break;
+			if (!wc_add_block(wb, f, GFP_NOWAIT | __GFP_NOWARN))
+				break;
+			wbl->size--;
+			list_del(&f->lru);
+			wb->wc_list[wb->wc_list_n++] = f;
+			e = f;
+		}
+		bio_set_op_attrs(&wb->bio, REQ_OP_WRITE, 0);
+		if (writecache_has_error(wc)) {
+			bio->bi_status = BLK_STS_IOERR;
+			bio_endio(&wb->bio);
+		} else {
+			submit_bio(&wb->bio);
+		}
+
+		__writeback_throttle(wc, wbl);
+	}
+}
+
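+/*
+ * SSD writeback: use dm-kcopyd to copy each contiguous run of blocks from
+ * the cache device to the origin device.
+ */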
+static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
+{
+	struct wc_entry *e, *f;
+	struct dm_io_region from, to;
+	struct copy_struct *c;
+
+	while (wbl->size) {
+		unsigned n_sectors;
+
+		wbl->size--;
+		e = container_of(wbl->list.prev, struct wc_entry, lru);
+		list_del(&e->lru);
+
+		n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
+
+		from.bdev = wc->ssd_dev->bdev;
+		from.sector = cache_sector(wc, e);
+		from.count = n_sectors;
+		to.bdev = wc->dev->bdev;
+		to.sector = read_original_sector(wc, e);
+		to.count = n_sectors;
+
+		c = mempool_alloc(wc->copy_pool, GFP_NOIO);
+		c->wc = wc;
+		c->e = e;
+		c->n_entries = e->wc_list_contiguous;
+
+		while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
+			wbl->size--;
+			f = container_of(wbl->list.prev, struct wc_entry, lru);
+			BUG_ON(f != e + 1);
+			list_del(&f->lru);
+			e = f;
+		}
+
+		dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
+
+		__writeback_throttle(wc, wbl);
+	}
+}
+
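+/*
+ * Writeback worker: walk the LRU from the oldest end while the freelist is
+ * below the high watermark, coalesce entries with adjacent original sectors
+ * and pass them to the pmem or ssd writeback path.
+ */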
+static void writecache_writeback(struct work_struct *work)
+{
+	struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
+	struct blk_plug plug;
+	struct wc_entry *e, *f, *g;
+	struct rb_node *node, *next_node;
+	struct list_head skipped;
+	struct writeback_list wbl;
+	unsigned long n_walked;
+
+	wc_lock(wc);
+restart:
+	if (writecache_has_error(wc) || unlikely(dm_suspended(wc->ti))) {
+		wc_unlock(wc);
+		return;
+	}
+
+	if (unlikely(wc->writeback_all)) {
+		if (writecache_wait_for_writeback(wc))
+			goto restart;
+	}
+
+	if (wc->overwrote_committed) {
+		writecache_wait_for_ios(wc, WRITE);
+	}
+
+	n_walked = 0;
+	INIT_LIST_HEAD(&skipped);
+	INIT_LIST_HEAD(&wbl.list);
+	wbl.size = 0;
+	while (!list_empty(&wc->lru) &&
+	       (wc->writeback_all ||
+		wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark)) {
+
+		n_walked++;
+		if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all)) {
+			queue_work(wc->writeback_wq, &wc->writeback_work);
+			break;
+		}
+
+		e = container_of(wc->lru.prev, struct wc_entry, lru);
+		BUG_ON(e->write_in_progress);
+		if (unlikely(!writecache_entry_is_committed(wc, e))) {
+			writecache_flush(wc);
+		}
+		node = rb_prev(&e->rb_node);
+		if (node) {
+			f = container_of(node, struct wc_entry, rb_node);
+			if (unlikely(read_original_sector(wc, f) ==
+				     read_original_sector(wc, e))) {
+				BUG_ON(!f->write_in_progress);
+				list_del(&e->lru);
+				list_add(&e->lru, &skipped);
+				cond_resched();
+				continue;
+			}
+		}
+		wc->writeback_size++;
+		list_del(&e->lru);
+		list_add(&e->lru, &wbl.list);
+		wbl.size++;
+		e->write_in_progress = true;
+		e->wc_list_contiguous = 1;
+
+		f = e;
+
+		while (1) {
+			next_node = rb_next(&f->rb_node);
+			if (unlikely(!next_node))
+				break;
+			g = container_of(next_node, struct wc_entry, rb_node);
+			if (read_original_sector(wc, g) ==
+			    read_original_sector(wc, f)) {
+				f = g;
+				continue;
+			}
+			if (read_original_sector(wc, g) !=
+			    read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
+				break;
+			if (unlikely(g->write_in_progress))
+				break;
+			if (unlikely(!writecache_entry_is_committed(wc, g)))
+				break;
+
+			if (!WC_MODE_PMEM(wc)) {
+				if (g != f + 1)
+					break;
+			}
+
+			n_walked++;
+			//if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
+			//	break;
+
+			wc->writeback_size++;
+			list_del(&g->lru);
+			list_add(&g->lru, &wbl.list);
+			wbl.size++;
+			g->write_in_progress = true;
+			g->wc_list_contiguous = BIO_MAX_PAGES;
+			f = g;
+			e->wc_list_contiguous++;
+			if (unlikely(e->wc_list_contiguous == BIO_MAX_PAGES))
+				break;
+		}
+		cond_resched();
+	}
+
+	if (!list_empty(&skipped)) {
+		list_splice_tail(&skipped, &wc->lru);
+		/*
+		 * If we didn't make any progress, wait until some writeback
+		 * finishes, to avoid burning CPU in a loop.
+		 */
+		if (unlikely(!wbl.size))
+			writecache_wait_for_writeback(wc);
+	}
+
+	wc_unlock(wc);
+
+	blk_start_plug(&plug);
+
+	if (WC_MODE_PMEM(wc))
+		__writecache_writeback_pmem(wc, &wbl);
+	else
+		__writecache_writeback_ssd(wc, &wbl);
+
+	blk_finish_plug(&plug);
+
+	if (unlikely(wc->writeback_all)) {
+		wc_lock(wc);
+		while (writecache_wait_for_writeback(wc));
+		wc_unlock(wc);
+	}
+}
+
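+/*
+ * Compute how many cache blocks fit on the device: the superblock and one
+ * wc_memory_entry per block come first, rounded up to the block size, and
+ * the data blocks follow.
+ */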
+static int calculate_memory_size(uint64_t device_size, unsigned block_size,
+				 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
+{
+	uint64_t n_blocks, offset;
+	struct wc_entry e;
+
+	n_blocks = device_size;
+	do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
+
+	while (1) {
+		if (!n_blocks)
+			return -ENOSPC;
+		/* Verify the following entries[n_blocks] won't overflow */
+		if (n_blocks >= (size_t)-sizeof(struct wc_memory_superblock) / sizeof(struct wc_memory_entry))
+			return -EFBIG;
+		offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
+		offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
+		if (offset + n_blocks * block_size <= device_size)
+			break;
+		n_blocks--;
+	}
+
+	/* check if the bit field overflows */
+	e.index = n_blocks;
+	if (e.index != n_blocks)
+		return -EFBIG;
+
+	if (n_blocks_p)
+		*n_blocks_p = n_blocks;
+	if (n_metadata_blocks_p)
+		*n_metadata_blocks_p = offset >> __ffs(block_size);
+	return 0;
+}
+
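+/*
+ * Format a new cache device: write the superblock fields and empty metadata
+ * entries first and commit them; the superblock magic is stored only after
+ * everything else has been flushed.
+ */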
+static int init_memory(struct dm_writecache *wc)
+{
+	size_t b;
+	int r;
+
+	r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
+	if (r)
+		return r;
+
+	r = writecache_alloc_entries(wc);
+	if (r)
+		return r;
+
+	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
+		NT_STORE(sb(wc)->padding[b], cpu_to_le64(0));
+	NT_STORE(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
+	NT_STORE(sb(wc)->block_size, cpu_to_le32(wc->block_size));
+	NT_STORE(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
+	NT_STORE(sb(wc)->seq_count, cpu_to_le64(0));
+
+	for (b = 0; b < wc->n_blocks; b++)
+		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
+
+	writecache_flush_all_metadata(wc);
+	writecache_commit_flushed(wc);
+	NT_STORE(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
+	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
+	writecache_commit_flushed(wc);
+
+	return 0;
+}
+
+static void writecache_dtr(struct dm_target *ti)
+{
+	struct dm_writecache *wc = ti->private;
+
+	if (!wc)
+		return;
+
+	if (wc->endio_thread)
+		kthread_stop(wc->endio_thread);
+
+	if (wc->flush_thread)
+		kthread_stop(wc->flush_thread);
+
+	mempool_destroy(wc->page_pool);
+
+	if (wc->bio_set)
+		bioset_free(wc->bio_set);
+
+	mempool_destroy(wc->copy_pool);
+
+	if (wc->writeback_wq)
+		destroy_workqueue(wc->writeback_wq);
+
+	if (wc->dev)
+		dm_put_device(ti, wc->dev);
+
+	if (wc->ssd_dev)
+		dm_put_device(ti, wc->ssd_dev);
+
+	if (wc->entries)
+		vfree(wc->entries);
+
+	if (wc->memory_map) {
+		if (WC_MODE_PMEM(wc))
+			persistent_memory_release(wc);
+		else
+			vfree(wc->memory_map);
+	}
+
+	if (wc->dm_kcopyd)
+		dm_kcopyd_client_destroy(wc->dm_kcopyd);
+
+	if (wc->dm_io)
+		dm_io_client_destroy(wc->dm_io);
+
+	if (wc->dirty_bitmap)
+		vfree(wc->dirty_bitmap);
+
+	kfree(wc);
+}
+
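+/*
+ * Constructor. Table line:
+ *	writecache <p|s> <origin dev> <cache dev> <block size> <#opt args>
+ *		[high_watermark <percent>%] [low_watermark <percent>%]
+ *		[writeback_jobs <count>] [autocommit_blocks <count>]
+ *		[autocommit_time <milliseconds>]
+ */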
+static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+	struct dm_writecache *wc;
+	struct dm_arg_set as;
+	const char *string;
+	unsigned opt_params;
+	size_t offset, data_size;
+	int i, r;
+	char dummy;
+	int high_wm_percent = HIGH_WATERMARK;
+	int low_wm_percent = LOW_WATERMARK;
+	uint64_t x;
+	struct wc_memory_superblock s;
+
+	static struct dm_arg _args[] = {
+		{0, 10, "Invalid number of feature args"},
+	};
+
+	as.argc = argc;
+	as.argv = argv;
+
+	wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
+	if (!wc) {
+		ti->error = "Cannot allocate writecache structure";
+		r = -ENOMEM;
+		goto bad;
+	}
+	ti->private = wc;
+	wc->ti = ti;
+
+	mutex_init(&wc->lock);
+	writecache_poison_lists(wc);
+	init_swait_queue_head(&wc->freelist_wait);
+	setup_timer(&wc->autocommit_timer, writecache_autocommit_timer, (unsigned long)wc);
+
+	for (i = 0; i < 2; i++) {
+		atomic_set(&wc->bio_in_progress[i], 0);
+		init_swait_queue_head(&wc->bio_in_progress_wait[i]);
+	}
+
+	wc->dm_io = dm_io_client_create();
+	if (!wc->dm_io) {
+		r = -ENOMEM;
+		ti->error = "Unable to allocate dm-io client";
+		goto bad;
+	}
+
+	wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
+	if (!wc->writeback_wq) {
+		r = -ENOMEM;
+		ti->error = "Could not allocate writeback workqueue";
+		goto bad;
+	}
+	INIT_WORK(&wc->writeback_work, writecache_writeback);
+	INIT_WORK(&wc->flush_work, writecache_flush_work);
+
+	init_swait_queue_head(&wc->endio_thread_wait);
+	INIT_LIST_HEAD(&wc->endio_list);
+	wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
+	if (IS_ERR(wc->endio_thread)) {
+		r = PTR_ERR(wc->endio_thread);
+		wc->endio_thread = NULL;
+		ti->error = "Couldn't spawn endio thread";
+		goto bad;
+	}
+	wake_up_process(wc->endio_thread);
+
+	/*
+	 * Parse the mode (pmem or ssd)
+	 */
+	string = dm_shift_arg(&as);
+	if (!string)
+		goto bad_arguments;
+
+	if (!strcasecmp(string, "s")) {
+#ifndef DM_WRITECACHE_ONLY_SSD
+		wc->pmem_mode = false;
+#endif
+	} else if (!strcasecmp(string, "p")) {
+#ifndef DM_WRITECACHE_ONLY_SSD
+		wc->pmem_mode = true;
+#else
+		r = -EOPNOTSUPP;
+		ti->error = "Persistent memory not supported on this architecture";
+		goto bad;
+#endif
+	} else {
+		goto bad_arguments;
+	}
+
+	if (WC_MODE_PMEM(wc)) {
+		wc->bio_set = bioset_create(BIO_POOL_SIZE,
+					    offsetof(struct writeback_struct, bio),
+					    BIOSET_NEED_BVECS);
+		if (!wc->bio_set) {
+			r = -ENOMEM;
+			ti->error = "Could not allocate bio set";
+			goto bad;
+		}
+	} else {
+		wc->copy_pool = mempool_create_kmalloc_pool(1, sizeof(struct copy_struct));
+		if (!wc->copy_pool) {
+			r = -ENOMEM;
+			ti->error = "Could not allocate mempool";
+			goto bad;
+		}
+	}
+
+	/*
+	 * Parse the origin data device
+	 */
+	string = dm_shift_arg(&as);
+	if (!string)
+		goto bad_arguments;
+	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
+	if (r) {
+		ti->error = "Origin data device lookup failed";
+		goto bad;
+	}
+
+	/*
+	 * Parse cache data device (be it pmem or ssd)
+	 */
+	string = dm_shift_arg(&as);
+	if (!string)
+		goto bad_arguments;
+
+	r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
+	if (r) {
+		ti->error = "Cache data device lookup failed";
+		goto bad;
+	}
+	wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
+
+	if (WC_MODE_PMEM(wc)) {
+		r = persistent_memory_claim(wc);
+		if (r) {
+			ti->error = "Unable to map persistent memory for cache";
+			goto bad;
+		}
+	}
+
+	/*
+	 * Parse the cache block size
+	 */
+	string = dm_shift_arg(&as);
+	if (!string)
+		goto bad_arguments;
+	if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
+	    wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
+	    (wc->block_size & (wc->block_size - 1))) {
+		r = -EINVAL;
+		ti->error = "Invalid block size";
+		goto bad;
+	}
+	wc->block_size_bits = __ffs(wc->block_size);
+
+	wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
+	wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
+	wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
+
+	/*
+	 * Parse optional arguments
+	 */
+	r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+	if (r)
+		goto bad;
+
+	while (opt_params) {
+		string = dm_shift_arg(&as), opt_params--;
+		if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%d%%%c", &high_wm_percent, &dummy) != 1)
+				goto invalid_optional;
+			if (high_wm_percent < 0 || high_wm_percent > 100)
+				goto invalid_optional;
+			wc->high_wm_percent_set = true;
+		} else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%d%%%c", &low_wm_percent, &dummy) != 1)
+				goto invalid_optional;
+			if (low_wm_percent < 0 || low_wm_percent > 100)
+				goto invalid_optional;
+			wc->low_wm_percent_set = true;
+		} else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
+				goto invalid_optional;
+			if (!wc->max_writeback_jobs)
+				goto invalid_optional;
+			wc->max_writeback_jobs_set = true;
+		} else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
+				goto invalid_optional;
+			wc->autocommit_blocks_set = true;
+		} else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
+			unsigned autocommit_msecs;
+			string = dm_shift_arg(&as), opt_params--;
+			if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
+				goto invalid_optional;
+			if (autocommit_msecs > 3600000)
+				goto invalid_optional;
+			wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
+			wc->autocommit_time_set = true;
+		} else {
+invalid_optional:
+			r = -EINVAL;
+			ti->error = "Invalid optional argument";
+			goto bad;
+		}
+	}
+
+	if (!WC_MODE_PMEM(wc)) {
+		struct dm_io_region region;
+		struct dm_io_request req;
+		size_t n_blocks, n_metadata_blocks;
+		uint64_t n_bitmap_bits;
+
+		init_completion(&wc->flush_completion);
+		wc->flush_thread = kthread_create(writecache_flush_thread, wc, "writecache_flush");
+		if (IS_ERR(wc->flush_thread)) {
+			r = PTR_ERR(wc->flush_thread);
+			wc->flush_thread = NULL;
+			ti->error = "Couldn't spawn flush thread";
+			goto bad;
+		}
+		writecache_offload_bio(wc, NULL);
+
+		r = calculate_memory_size(wc->memory_map_size, wc->block_size,
+					  &n_blocks, &n_metadata_blocks);
+		if (r) {
+			ti->error = "Invalid device size";
+			goto bad;
+		}
+
+		n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) + BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
+		/* this is a limitation of the test_bit functions */
+		if (n_bitmap_bits > 1U << 31) {
+			r = -EFBIG;
+			ti->error = "Invalid device size";
+			goto bad;
+		}
+
+		wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
+		if (!wc->memory_map) {
+			r = -ENOMEM;
+			ti->error = "Unable to allocate memory for metadata";
+			goto bad;
+		}
+
+		wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
+		if (!wc->dm_kcopyd) {
+			r = -ENOMEM;
+			ti->error = "Unable to allocate dm-kcopyd client";
+			goto bad;
+		}
+
+		wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
+		wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) / BITS_PER_LONG * sizeof(unsigned long);
+		wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
+		if (!wc->dirty_bitmap) {
+			r = -ENOMEM;
+			ti->error = "Unable to allocate dirty bitmap";
+			goto bad;
+		}
+
+		region.bdev = wc->ssd_dev->bdev;
+		region.sector = 0;
+		region.count = wc->metadata_sectors;
+		req.bi_op = REQ_OP_READ;
+		req.bi_op_flags = REQ_SYNC;
+		req.mem.type = DM_IO_VMA;
+		req.mem.ptr.vma = (char *)wc->memory_map;
+		req.client = wc->dm_io;
+		req.notify.fn = NULL;
+
+		r = dm_io(&req, 1, &region, NULL);
+		if (r) {
+			ti->error = "Unable to read metadata";
+			goto bad;
+		}
+	}
+
+	r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+	if (r) {
+		ti->error = "Hardware memory error when reading superblock";
+		goto bad;
+	}
+	if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
+		r = init_memory(wc);
+		if (r) {
+			ti->error = "Unable to initialize device";
+			goto bad;
+		}
+		r = memcpy_mcsafe(&s, sb(wc), sizeof(struct wc_memory_superblock));
+		if (r) {
+			ti->error = "Hardware memory error when reading superblock";
+			goto bad;
+		}
+	}
+
+	if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
+		ti->error = "Invalid magic in the superblock";
+		r = -EINVAL;
+		goto bad;
+	}
+
+	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
+		ti->error = "Invalid version in the superblock";
+		r = -EINVAL;
+		goto bad;
+	}
+
+	if (le32_to_cpu(s.block_size) != wc->block_size) {
+		ti->error = "Block size does not match superblock";
+		r = -EINVAL;
+		goto bad;
+	}
+
+	wc->n_blocks = le64_to_cpu(s.n_blocks);
+
+	offset = wc->n_blocks * sizeof(struct wc_memory_entry);
+	if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
+overflow:
+		ti->error = "Overflow in size calculation";
+		r = -EINVAL;
+		goto bad;
+	}
+	offset += sizeof(struct wc_memory_superblock);
+	if (offset < sizeof(struct wc_memory_superblock))
+		goto overflow;
+	offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
+	data_size = wc->n_blocks * (size_t)wc->block_size;
+	if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
+	    (offset + data_size < offset))
+		goto overflow;
+	if (offset + data_size > wc->memory_map_size) {
+		ti->error = "Memory area is too small";
+		r = -EINVAL;
+		goto bad;
+	}
+
+	wc->metadata_sectors = offset >> SECTOR_SHIFT;
+	wc->block_start = (char *)sb(wc) + offset;
+
+	x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
+	x += 50;
+	do_div(x, 100);
+	wc->freelist_high_watermark = x;
+	x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
+	x += 50;
+	do_div(x, 100);
+	wc->freelist_low_watermark = x;
+
+	r = writecache_alloc_entries(wc);
+	if (r) {
+		ti->error = "Cannot allocate memory";
+		goto bad;
+	}
+
+	ti->num_flush_bios = 1;
+	ti->flush_supported = true;
+	ti->num_discard_bios = 1;
+
+	if (WC_MODE_PMEM(wc))
+		persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
+
+	return 0;
+
+bad_arguments:
+	r = -EINVAL;
+	ti->error = "Bad arguments";
+bad:
+	writecache_dtr(ti);
+	return r;
+}
+
+static void writecache_status(struct dm_target *ti, status_type_t type,
+			      unsigned status_flags, char *result, unsigned maxlen)
+{
+	struct dm_writecache *wc = ti->private;
+	unsigned extra_args;
+	unsigned sz = 0;
+	uint64_t x;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc), (unsigned long long)wc->n_blocks,
+		       (unsigned long long)wc->freelist_size, (unsigned long long)wc->writeback_size);
+		break;
+	case STATUSTYPE_TABLE:
+		DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
+				wc->dev->name, wc->ssd_dev->name, wc->block_size);
+		extra_args = 0;
+		if (wc->high_wm_percent_set)
+			extra_args += 2;
+		if (wc->low_wm_percent_set)
+			extra_args += 2;
+		if (wc->max_writeback_jobs_set)
+			extra_args += 2;
+		if (wc->autocommit_blocks_set)
+			extra_args += 2;
+		if (wc->autocommit_time_set)
+			extra_args += 2;
+		DMEMIT("%u", extra_args);
+		if (wc->high_wm_percent_set) {
+			x = (uint64_t)wc->freelist_high_watermark * 100;
+			x += wc->n_blocks / 2;
+			do_div(x, (size_t)wc->n_blocks);
+			DMEMIT(" high_watermark %u%%", 100 - (unsigned)x);
+		}
+		if (wc->low_wm_percent_set) {
+			x = (uint64_t)wc->freelist_low_watermark * 100;
+			x += wc->n_blocks / 2;
+			do_div(x, (size_t)wc->n_blocks);
+			DMEMIT(" low_watermark %u%%", 100 - (unsigned)x);
+		}
+		if (wc->max_writeback_jobs_set) {
+			DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
+		}
+		if (wc->autocommit_blocks_set) {
+			DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
+		}
+		if (wc->autocommit_time_set) {
+			DMEMIT(" autocommit_time %u", jiffies_to_msecs(wc->autocommit_jiffies));
+		}
+		break;
+	}
+}
+
+static struct target_type writecache_target = {
+	.name			= "writecache",
+	.version		= {1, 0, 0},
+	.module			= THIS_MODULE,
+	.ctr			= writecache_ctr,
+	.dtr			= writecache_dtr,
+	.status			= writecache_status,
+	.postsuspend		= writecache_suspend,
+	.resume			= writecache_resume,
+	.message		= writecache_message,
+	.map			= writecache_map,
+	.end_io			= writecache_end_io,
+	.iterate_devices	= writecache_iterate_devices,
+	.io_hints		= writecache_io_hints,
+};
+
+static int __init dm_writecache_init(void)
+{
+	int r;
+
+	r = dm_register_target(&writecache_target);
+	if (r < 0) {
+		DMERR("register failed %d", r);
+		return r;
+	}
+
+	return 0;
+}
+
+static void __exit dm_writecache_exit(void)
+{
+	dm_unregister_target(&writecache_target);
+}
+
+module_init(dm_writecache_init);
+module_exit(dm_writecache_exit);
+
+MODULE_DESCRIPTION(DM_NAME " writecache target");
+MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
+MODULE_LICENSE("GPL");