diff mbox

[v3,27/32] nvdimm: support DSM_CMD_IMPLEMENTED function

Message ID 1444535584-18220-28-git-send-email-guangrong.xiao@linux.intel.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xiao Guangrong Oct. 11, 2015, 3:52 a.m. UTC
__DSM is defined in ACPI 6.0: 9.14.1 _DSM (Device Specific Method)

Function 0 is a query function. We do not support any function on root
device and only 3 functions are support for NVDIMM device,
DSM_CMD_NAMESPACE_LABEL_SIZE, DSM_CMD_GET_NAMESPACE_LABEL_DATA and
DSM_CMD_SET_NAMESPACE_LABEL_DATA, that means we currently only allow to access
device's Label Namespace

Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
---
 hw/mem/nvdimm/acpi.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 177 insertions(+), 1 deletion(-)

Comments

Stefan Hajnoczi Oct. 14, 2015, 9:40 a.m. UTC | #1
On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
>  static void dsm_write(void *opaque, hwaddr addr,
>                        uint64_t val, unsigned size)
>  {
> +    NVDIMMState *state = opaque;
> +    MemoryRegion *dsm_ram_mr;
> +    dsm_in *in;
> +    dsm_out *out;
> +    uint32_t revision, function, handle;
> +
>      if (val != NOTIFY_VALUE) {
>          fprintf(stderr, "BUG: unexepected notify value 0x%" PRIx64, val);
>      }
> +
> +    dsm_ram_mr = memory_region_find(&state->mr, state->page_size,
> +                                    state->page_size).mr;
> +    memory_region_unref(dsm_ram_mr);
> +    in = memory_region_get_ram_ptr(dsm_ram_mr);

This looks suspicious.  Shouldn't the memory_region_unref(dsm_ram_mr)
happen after we're done using it?

> +    out = (dsm_out *)in;
> +
> +    revision = in->arg1;
> +    function = in->arg2;
> +    handle = in->handle;
> +    le32_to_cpus(&revision);
> +    le32_to_cpus(&function);
> +    le32_to_cpus(&handle);
> +
> +    nvdebug("UUID " UUID_FMT ".\n", in->arg0[0], in->arg0[1], in->arg0[2],
> +            in->arg0[3], in->arg0[4], in->arg0[5], in->arg0[6],
> +            in->arg0[7], in->arg0[8], in->arg0[9], in->arg0[10],
> +            in->arg0[11], in->arg0[12], in->arg0[13], in->arg0[14],
> +            in->arg0[15]);
> +    nvdebug("Revision %#x Function %#x Handler %#x.\n", revision, function,
> +            handle);
> +
> +    if (revision != DSM_REVISION) {
> +        nvdebug("Revision %#x is not supported, expect %#x.\n",
> +                revision, DSM_REVISION);
> +        goto exit;
> +    }
> +
> +    if (!handle) {
> +        if (!dsm_is_root_uuid(in->arg0)) {

Please don't dereference 'in' or pass it to other functions.  Avoid race
conditions with guest vcpus by coping in the entire dsm_in struct.

This is like a system call - the kernel cannot trust userspace memory
and must copy in before accessing data.  The same rules apply.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stefan Hajnoczi Oct. 14, 2015, 9:41 a.m. UTC | #2
On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
> +    out->len = sizeof(out->status);

out->len is uint16_t, it needs cpu_to_le16().  There may be other
instances in this patch series.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Oct. 14, 2015, 2:50 p.m. UTC | #3
On 10/14/2015 05:40 PM, Stefan Hajnoczi wrote:
> On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
>>   static void dsm_write(void *opaque, hwaddr addr,
>>                         uint64_t val, unsigned size)
>>   {
>> +    NVDIMMState *state = opaque;
>> +    MemoryRegion *dsm_ram_mr;
>> +    dsm_in *in;
>> +    dsm_out *out;
>> +    uint32_t revision, function, handle;
>> +
>>       if (val != NOTIFY_VALUE) {
>>           fprintf(stderr, "BUG: unexepected notify value 0x%" PRIx64, val);
>>       }
>> +
>> +    dsm_ram_mr = memory_region_find(&state->mr, state->page_size,
>> +                                    state->page_size).mr;
>> +    memory_region_unref(dsm_ram_mr);
>> +    in = memory_region_get_ram_ptr(dsm_ram_mr);
>
> This looks suspicious.  Shouldn't the memory_region_unref(dsm_ram_mr)
> happen after we're done using it?

This region is keep-alive during QEMU's running, it is okay. The
same style is applied to other codes, for example:
line 208 in hw/s390x/sclp.c.

>
>> +    out = (dsm_out *)in;
>> +
>> +    revision = in->arg1;
>> +    function = in->arg2;
>> +    handle = in->handle;
>> +    le32_to_cpus(&revision);
>> +    le32_to_cpus(&function);
>> +    le32_to_cpus(&handle);
>> +
>> +    nvdebug("UUID " UUID_FMT ".\n", in->arg0[0], in->arg0[1], in->arg0[2],
>> +            in->arg0[3], in->arg0[4], in->arg0[5], in->arg0[6],
>> +            in->arg0[7], in->arg0[8], in->arg0[9], in->arg0[10],
>> +            in->arg0[11], in->arg0[12], in->arg0[13], in->arg0[14],
>> +            in->arg0[15]);
>> +    nvdebug("Revision %#x Function %#x Handler %#x.\n", revision, function,
>> +            handle);
>> +
>> +    if (revision != DSM_REVISION) {
>> +        nvdebug("Revision %#x is not supported, expect %#x.\n",
>> +                revision, DSM_REVISION);
>> +        goto exit;
>> +    }
>> +
>> +    if (!handle) {
>> +        if (!dsm_is_root_uuid(in->arg0)) {
>
> Please don't dereference 'in' or pass it to other functions.  Avoid race
> conditions with guest vcpus by coping in the entire dsm_in struct.
>
> This is like a system call - the kernel cannot trust userspace memory
> and must copy in before accessing data.  The same rules apply.
>

It's little different for QEMU:
- the memory address is always valid to QEMU, it's not always true for Kernel
   due to context-switch

- we have checked the header before use it's data, for example, when we get
   data from GET_NAMESPACE_DATA, we have got the @offset and @length from the
   memory, then copy memory based on these values, that means the userspace
   has no chance to cause buffer overflow by increasing these values at runtime.

   The scenario for our case is simple but Kernel is difficult to do
   check_all_before_use as many paths may be involved.

- guest changes some data is okay, the worst case is that the label data is
   corrupted. This is caused by guest itself. Kernel also supports this kind
   of behaviour, e,g. network TX zero copy, the userspace page is being
   transferred while userspace can still access it.

- it's 4K size on x86, full copy wastes CPU time too much.






--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Oct. 14, 2015, 2:52 p.m. UTC | #4
On 10/14/2015 05:41 PM, Stefan Hajnoczi wrote:
> On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
>> +    out->len = sizeof(out->status);
>
> out->len is uint16_t, it needs cpu_to_le16().  There may be other
> instances in this patch series.
>

out->len is internally used only which is invisible to guest OS, i,e,
we write this value and read this value by ourself. I think it is
okay.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Eduardo Habkost Oct. 14, 2015, 5:06 p.m. UTC | #5
On Wed, Oct 14, 2015 at 10:50:40PM +0800, Xiao Guangrong wrote:
> On 10/14/2015 05:40 PM, Stefan Hajnoczi wrote:
> >On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
> >>  static void dsm_write(void *opaque, hwaddr addr,
> >>                        uint64_t val, unsigned size)
> >>  {
> >>+    NVDIMMState *state = opaque;
> >>+    MemoryRegion *dsm_ram_mr;
> >>+    dsm_in *in;
> >>+    dsm_out *out;
> >>+    uint32_t revision, function, handle;
> >>+
> >>      if (val != NOTIFY_VALUE) {
> >>          fprintf(stderr, "BUG: unexepected notify value 0x%" PRIx64, val);
> >>      }
> >>+
> >>+    dsm_ram_mr = memory_region_find(&state->mr, state->page_size,
> >>+                                    state->page_size).mr;
> >>+    memory_region_unref(dsm_ram_mr);
> >>+    in = memory_region_get_ram_ptr(dsm_ram_mr);
> >
> >This looks suspicious.  Shouldn't the memory_region_unref(dsm_ram_mr)
> >happen after we're done using it?
> 
> This region is keep-alive during QEMU's running, it is okay.  The same
> style is applied to other codes, for example: line 208 in
> hw/s390x/sclp.c.

In sclp.c (assign_storage()), the memory region is never used after
memory_region_unref() is called. In unassign_storage(), sclp.c owns an
additional reference, grabbed by assign_storage().
Xiao Guangrong Oct. 15, 2015, 1:43 a.m. UTC | #6
On 10/15/2015 01:06 AM, Eduardo Habkost wrote:
> On Wed, Oct 14, 2015 at 10:50:40PM +0800, Xiao Guangrong wrote:
>> On 10/14/2015 05:40 PM, Stefan Hajnoczi wrote:
>>> On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
>>>>   static void dsm_write(void *opaque, hwaddr addr,
>>>>                         uint64_t val, unsigned size)
>>>>   {
>>>> +    NVDIMMState *state = opaque;
>>>> +    MemoryRegion *dsm_ram_mr;
>>>> +    dsm_in *in;
>>>> +    dsm_out *out;
>>>> +    uint32_t revision, function, handle;
>>>> +
>>>>       if (val != NOTIFY_VALUE) {
>>>>           fprintf(stderr, "BUG: unexepected notify value 0x%" PRIx64, val);
>>>>       }
>>>> +
>>>> +    dsm_ram_mr = memory_region_find(&state->mr, state->page_size,
>>>> +                                    state->page_size).mr;
>>>> +    memory_region_unref(dsm_ram_mr);
>>>> +    in = memory_region_get_ram_ptr(dsm_ram_mr);
>>>
>>> This looks suspicious.  Shouldn't the memory_region_unref(dsm_ram_mr)
>>> happen after we're done using it?
>>
>> This region is keep-alive during QEMU's running, it is okay.  The same
>> style is applied to other codes, for example: line 208 in
>> hw/s390x/sclp.c.
>
> In sclp.c (assign_storage()), the memory region is never used after
> memory_region_unref() is called. In unassign_storage(), sclp.c owns an
> additional reference, grabbed by assign_storage().
>

Ah... I got it, thank you for pointing it out.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stefan Hajnoczi Oct. 15, 2015, 3:01 p.m. UTC | #7
On Wed, Oct 14, 2015 at 10:52:15PM +0800, Xiao Guangrong wrote:
> On 10/14/2015 05:41 PM, Stefan Hajnoczi wrote:
> >On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
> >>+    out->len = sizeof(out->status);
> >
> >out->len is uint16_t, it needs cpu_to_le16().  There may be other
> >instances in this patch series.
> >
> 
> out->len is internally used only which is invisible to guest OS, i,e,
> we write this value and read this value by ourself. I think it is
> okay.

'out' points to guest memory.  Guest memory is untrusted so QEMU cannot
stash values there - an evil guest could modify them.

Please put the len variable on the QEMU stack or heap where the guest
cannot access it.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stefan Hajnoczi Oct. 15, 2015, 3:07 p.m. UTC | #8
On Wed, Oct 14, 2015 at 10:50:40PM +0800, Xiao Guangrong wrote:
> On 10/14/2015 05:40 PM, Stefan Hajnoczi wrote:
> >On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
> >>+    out = (dsm_out *)in;
> >>+
> >>+    revision = in->arg1;
> >>+    function = in->arg2;
> >>+    handle = in->handle;
> >>+    le32_to_cpus(&revision);
> >>+    le32_to_cpus(&function);
> >>+    le32_to_cpus(&handle);
> >>+
> >>+    nvdebug("UUID " UUID_FMT ".\n", in->arg0[0], in->arg0[1], in->arg0[2],
> >>+            in->arg0[3], in->arg0[4], in->arg0[5], in->arg0[6],
> >>+            in->arg0[7], in->arg0[8], in->arg0[9], in->arg0[10],
> >>+            in->arg0[11], in->arg0[12], in->arg0[13], in->arg0[14],
> >>+            in->arg0[15]);
> >>+    nvdebug("Revision %#x Function %#x Handler %#x.\n", revision, function,
> >>+            handle);
> >>+
> >>+    if (revision != DSM_REVISION) {
> >>+        nvdebug("Revision %#x is not supported, expect %#x.\n",
> >>+                revision, DSM_REVISION);
> >>+        goto exit;
> >>+    }
> >>+
> >>+    if (!handle) {
> >>+        if (!dsm_is_root_uuid(in->arg0)) {
> >
> >Please don't dereference 'in' or pass it to other functions.  Avoid race
> >conditions with guest vcpus by coping in the entire dsm_in struct.
> >
> >This is like a system call - the kernel cannot trust userspace memory
> >and must copy in before accessing data.  The same rules apply.
> >
> 
> It's little different for QEMU:
> - the memory address is always valid to QEMU, it's not always true for Kernel
>   due to context-switch
> 
> - we have checked the header before use it's data, for example, when we get
>   data from GET_NAMESPACE_DATA, we have got the @offset and @length from the
>   memory, then copy memory based on these values, that means the userspace
>   has no chance to cause buffer overflow by increasing these values at runtime.
> 
>   The scenario for our case is simple but Kernel is difficult to do
>   check_all_before_use as many paths may be involved.
> 
> - guest changes some data is okay, the worst case is that the label data is
>   corrupted. This is caused by guest itself. Kernel also supports this kind
>   of behaviour, e,g. network TX zero copy, the userspace page is being
>   transferred while userspace can still access it.
> 
> - it's 4K size on x86, full copy wastes CPU time too much.

This isn't performance-critical code and I don't want to review it
keeping the race conditions in mind the whole time.  Also, if the code
is modified in the future, the chance of introducing a race is high.

I see this as premature optimization, please just copy in data.

Stefan
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Oct. 16, 2015, 2:30 a.m. UTC | #9
On 10/15/2015 11:07 PM, Stefan Hajnoczi wrote:
> On Wed, Oct 14, 2015 at 10:50:40PM +0800, Xiao Guangrong wrote:
>> On 10/14/2015 05:40 PM, Stefan Hajnoczi wrote:
>>> On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
>>>> +    out = (dsm_out *)in;
>>>> +
>>>> +    revision = in->arg1;
>>>> +    function = in->arg2;
>>>> +    handle = in->handle;
>>>> +    le32_to_cpus(&revision);
>>>> +    le32_to_cpus(&function);
>>>> +    le32_to_cpus(&handle);
>>>> +
>>>> +    nvdebug("UUID " UUID_FMT ".\n", in->arg0[0], in->arg0[1], in->arg0[2],
>>>> +            in->arg0[3], in->arg0[4], in->arg0[5], in->arg0[6],
>>>> +            in->arg0[7], in->arg0[8], in->arg0[9], in->arg0[10],
>>>> +            in->arg0[11], in->arg0[12], in->arg0[13], in->arg0[14],
>>>> +            in->arg0[15]);
>>>> +    nvdebug("Revision %#x Function %#x Handler %#x.\n", revision, function,
>>>> +            handle);
>>>> +
>>>> +    if (revision != DSM_REVISION) {
>>>> +        nvdebug("Revision %#x is not supported, expect %#x.\n",
>>>> +                revision, DSM_REVISION);
>>>> +        goto exit;
>>>> +    }
>>>> +
>>>> +    if (!handle) {
>>>> +        if (!dsm_is_root_uuid(in->arg0)) {
>>>
>>> Please don't dereference 'in' or pass it to other functions.  Avoid race
>>> conditions with guest vcpus by coping in the entire dsm_in struct.
>>>
>>> This is like a system call - the kernel cannot trust userspace memory
>>> and must copy in before accessing data.  The same rules apply.
>>>
>>
>> It's little different for QEMU:
>> - the memory address is always valid to QEMU, it's not always true for Kernel
>>    due to context-switch
>>
>> - we have checked the header before use it's data, for example, when we get
>>    data from GET_NAMESPACE_DATA, we have got the @offset and @length from the
>>    memory, then copy memory based on these values, that means the userspace
>>    has no chance to cause buffer overflow by increasing these values at runtime.
>>
>>    The scenario for our case is simple but Kernel is difficult to do
>>    check_all_before_use as many paths may be involved.
>>
>> - guest changes some data is okay, the worst case is that the label data is
>>    corrupted. This is caused by guest itself. Kernel also supports this kind
>>    of behaviour, e,g. network TX zero copy, the userspace page is being
>>    transferred while userspace can still access it.
>>
>> - it's 4K size on x86, full copy wastes CPU time too much.
>
> This isn't performance-critical code and I don't want to review it
> keeping the race conditions in mind the whole time.  Also, if the code
> is modified in the future, the chance of introducing a race is high.
>
> I see this as premature optimization, please just copy in data.


No strong opinion on it... will do it following your idea.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Oct. 16, 2015, 2:32 a.m. UTC | #10
On 10/15/2015 11:01 PM, Stefan Hajnoczi wrote:
> On Wed, Oct 14, 2015 at 10:52:15PM +0800, Xiao Guangrong wrote:
>> On 10/14/2015 05:41 PM, Stefan Hajnoczi wrote:
>>> On Sun, Oct 11, 2015 at 11:52:59AM +0800, Xiao Guangrong wrote:
>>>> +    out->len = sizeof(out->status);
>>>
>>> out->len is uint16_t, it needs cpu_to_le16().  There may be other
>>> instances in this patch series.
>>>
>>
>> out->len is internally used only which is invisible to guest OS, i,e,
>> we write this value and read this value by ourself. I think it is
>> okay.
>
> 'out' points to guest memory.  Guest memory is untrusted so QEMU cannot
> stash values there - an evil guest could modify them.
>
> Please put the len variable on the QEMU stack or heap where the guest
> cannot access it.

okay okay.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/hw/mem/nvdimm/acpi.c b/hw/mem/nvdimm/acpi.c
index 3b9399c..cb6a428 100644
--- a/hw/mem/nvdimm/acpi.c
+++ b/hw/mem/nvdimm/acpi.c
@@ -39,6 +39,22 @@  static void nfit_spa_uuid_pm(uuid_le *uuid)
     memcpy(uuid, &uuid_pm, sizeof(uuid_pm));
 }
 
+static bool dsm_is_root_uuid(uint8_t *uuid)
+{
+    uuid_le uuid_root = UUID_LE(0x2f10e7a4, 0x9e91, 0x11e4, 0x89,
+                                0xd3, 0x12, 0x3b, 0x93, 0xf7, 0x5c, 0xba);
+
+    return !memcmp(uuid, &uuid_root, sizeof(uuid_root));
+}
+
+static bool dsm_is_dimm_uuid(uint8_t *uuid)
+{
+    uuid_le uuid_dimm = UUID_LE(0x4309ac30, 0x0d11, 0x11e4, 0x91,
+                                0x91, 0x08, 0x00, 0x20, 0x0c, 0x9a, 0x66);
+
+    return !memcmp(uuid, &uuid_dimm, sizeof(uuid_dimm));
+}
+
 enum {
     NFIT_STRUCTURE_SPA = 0,
     NFIT_STRUCTURE_MEMDEV = 1,
@@ -195,6 +211,22 @@  static uint32_t nvdimm_slot_to_dcr_index(int slot)
     return nvdimm_slot_to_spa_index(slot) + 1;
 }
 
+static NVDIMMDevice
+*get_nvdimm_device_by_handle(GSList *list, uint32_t handle)
+{
+    for (; list; list = list->next) {
+        NVDIMMDevice *nvdimm = list->data;
+        int slot = object_property_get_int(OBJECT(nvdimm), DIMM_SLOT_PROP,
+                                           NULL);
+
+        if (nvdimm_slot_to_handle(slot) == handle) {
+            return nvdimm;
+        }
+    }
+
+    return NULL;
+}
+
 static int build_structure_spa(void *buf, NVDIMMDevice *nvdimm)
 {
     nfit_spa *nfit_spa;
@@ -310,6 +342,43 @@  static void build_nfit(void *fit, GSList *device_list, GArray *table_offsets,
 
 #define NOTIFY_VALUE      0x99
 
+enum {
+    DSM_CMD_IMPLEMENTED = 0,
+
+    /* root device commands */
+    DSM_CMD_ARS_CAP = 1,
+    DSM_CMD_ARS_START = 2,
+    DSM_CMD_ARS_QUERY = 3,
+
+    /* per-nvdimm device commands */
+    DSM_CMD_SMART = 1,
+    DSM_CMD_SMART_THRESHOLD = 2,
+    DSM_CMD_BLOCK_NVDIMM_FLAGS = 3,
+    DSM_CMD_NAMESPACE_LABEL_SIZE = 4,
+    DSM_CMD_GET_NAMESPACE_LABEL_DATA = 5,
+    DSM_CMD_SET_NAMESPACE_LABEL_DATA = 6,
+    DSM_CMD_VENDOR_EFFECT_LOG_SIZE = 7,
+    DSM_CMD_GET_VENDOR_EFFECT_LOG = 8,
+    DSM_CMD_VENDOR_SPECIFIC = 9,
+};
+
+enum {
+    DSM_STATUS_SUCCESS = 0,
+    DSM_STATUS_NOT_SUPPORTED = 1,
+    DSM_STATUS_NON_EXISTING_MEM_DEV = 2,
+    DSM_STATUS_INVALID_PARAS = 3,
+    DSM_STATUS_VENDOR_SPECIFIC_ERROR = 4,
+};
+
+#define DSM_REVISION        (1)
+
+/* do not support any command except NFIT_CMD_IMPLEMENTED on root. */
+#define ROOT_SUPPORT_CMD    (1 << DSM_CMD_IMPLEMENTED)
+#define DIMM_SUPPORT_CMD    ((1 << DSM_CMD_IMPLEMENTED)               \
+                           | (1 << DSM_CMD_NAMESPACE_LABEL_SIZE)      \
+                           | (1 << DSM_CMD_GET_NAMESPACE_LABEL_DATA)  \
+                           | (1 << DSM_CMD_SET_NAMESPACE_LABEL_DATA))
+
 struct dsm_in {
     uint32_t handle;
     uint8_t arg0[16];
@@ -320,10 +389,19 @@  struct dsm_in {
 } QEMU_PACKED;
 typedef struct dsm_in dsm_in;
 
+struct cmd_out_implemented {
+    uint64_t cmd_list;
+};
+typedef struct cmd_out_implemented cmd_out_implemented;
+
 struct dsm_out {
     /* the size of buffer filled by QEMU. */
     uint16_t len;
-    uint8_t data[0];
+    union {
+        uint8_t data[0];
+        uint32_t status;
+        cmd_out_implemented cmd_implemented;
+    };
 } QEMU_PACKED;
 typedef struct dsm_out dsm_out;
 
@@ -334,12 +412,110 @@  static uint64_t dsm_read(void *opaque, hwaddr addr,
     return 0;
 }
 
+static void dsm_write_root(uint32_t function, dsm_in *in, dsm_out *out)
+{
+    if (function == DSM_CMD_IMPLEMENTED) {
+        out->len = sizeof(out->cmd_implemented);
+        out->cmd_implemented.cmd_list = cpu_to_le64(ROOT_SUPPORT_CMD);
+        return;
+    }
+
+    out->len = sizeof(out->status);
+    out->status = cpu_to_le32(DSM_STATUS_NOT_SUPPORTED);
+    nvdebug("Return status %#x.\n", out->status);
+}
+
+static void dsm_write_nvdimm(uint32_t handle, uint32_t function, dsm_in *in,
+                             dsm_out *out)
+{
+    GSList *list = nvdimm_get_built_list();
+    NVDIMMDevice *nvdimm = get_nvdimm_device_by_handle(list, handle);
+    uint32_t status = DSM_STATUS_NON_EXISTING_MEM_DEV;
+    uint64_t cmd_list;
+
+    if (!nvdimm) {
+        out->len = sizeof(out->status);
+        goto set_status_free;
+    }
+
+    switch (function) {
+    case DSM_CMD_IMPLEMENTED:
+        cmd_list = DIMM_SUPPORT_CMD;
+        out->len = sizeof(out->cmd_implemented);
+        out->cmd_implemented.cmd_list = cpu_to_le64(cmd_list);
+        goto free;
+    default:
+        out->len = sizeof(out->status);
+        status = DSM_STATUS_NOT_SUPPORTED;
+    };
+
+    nvdebug("Return status %#x.\n", status);
+
+set_status_free:
+    out->status = cpu_to_le32(status);
+free:
+    g_slist_free(list);
+}
+
 static void dsm_write(void *opaque, hwaddr addr,
                       uint64_t val, unsigned size)
 {
+    NVDIMMState *state = opaque;
+    MemoryRegion *dsm_ram_mr;
+    dsm_in *in;
+    dsm_out *out;
+    uint32_t revision, function, handle;
+
     if (val != NOTIFY_VALUE) {
         fprintf(stderr, "BUG: unexepected notify value 0x%" PRIx64, val);
     }
+
+    dsm_ram_mr = memory_region_find(&state->mr, state->page_size,
+                                    state->page_size).mr;
+    memory_region_unref(dsm_ram_mr);
+    in = memory_region_get_ram_ptr(dsm_ram_mr);
+    out = (dsm_out *)in;
+
+    revision = in->arg1;
+    function = in->arg2;
+    handle = in->handle;
+    le32_to_cpus(&revision);
+    le32_to_cpus(&function);
+    le32_to_cpus(&handle);
+
+    nvdebug("UUID " UUID_FMT ".\n", in->arg0[0], in->arg0[1], in->arg0[2],
+            in->arg0[3], in->arg0[4], in->arg0[5], in->arg0[6],
+            in->arg0[7], in->arg0[8], in->arg0[9], in->arg0[10],
+            in->arg0[11], in->arg0[12], in->arg0[13], in->arg0[14],
+            in->arg0[15]);
+    nvdebug("Revision %#x Function %#x Handler %#x.\n", revision, function,
+            handle);
+
+    if (revision != DSM_REVISION) {
+        nvdebug("Revision %#x is not supported, expect %#x.\n",
+                revision, DSM_REVISION);
+        goto exit;
+    }
+
+    if (!handle) {
+        if (!dsm_is_root_uuid(in->arg0)) {
+            nvdebug("Root UUID does not match.\n");
+            goto exit;
+        }
+
+        return dsm_write_root(function, in, out);
+    }
+
+    if (!dsm_is_dimm_uuid(in->arg0)) {
+        nvdebug("DIMM UUID does not match.\n");
+        goto exit;
+    }
+
+    return dsm_write_nvdimm(handle, function, in, out);
+
+exit:
+    out->len = sizeof(out->status);
+    out->status = cpu_to_le32(DSM_STATUS_NOT_SUPPORTED);
 }
 
 static const MemoryRegionOps dsm_ops = {