diff mbox

[v2,08/18] nvdimm: init backend memory mapping and config data area

Message ID 1439563931-12352-9-git-send-email-guangrong.xiao@linux.intel.com
State New, archived
Headers show

Commit Message

Xiao Guangrong Aug. 14, 2015, 2:52 p.m. UTC
The parameter @file is used as backed memory for NVDIMM which is
divided into two parts if @dataconfig is true:
- first parts is (0, size - 128K], which is used as PMEM (Persistent
  Memory)
- 128K at the end of the file, which is used as Config Data Area, it's
  used to store Label namespace data

The @file supports both regular file and block device, of course we
can assign any these two kinds of files for test and emulation, however,
in the real word for performance reason, we usually used these files as
NVDIMM backed file:
- the regular file in the filesystem with DAX enabled created on NVDIMM
  device on host
- the raw PMEM device on host, e,g /dev/pmem0

Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
---
 hw/mem/nvdimm/pc-nvdimm.c  | 109 ++++++++++++++++++++++++++++++++++++++++++++-
 include/hw/mem/pc-nvdimm.h |   7 +++
 2 files changed, 115 insertions(+), 1 deletion(-)

Comments

Stefan Hajnoczi Aug. 25, 2015, 4:03 p.m. UTC | #1
On Fri, Aug 14, 2015 at 10:52:01PM +0800, Xiao Guangrong wrote:
> The parameter @file is used as backed memory for NVDIMM which is
> divided into two parts if @dataconfig is true:

s/dataconfig/configdata/

> @@ -76,13 +109,87 @@ static void pc_nvdimm_init(Object *obj)
>                               set_configdata, NULL);
>  }
>  
> +static uint64_t get_file_size(int fd)
> +{
> +    struct stat stat_buf;
> +    uint64_t size;
> +
> +    if (fstat(fd, &stat_buf) < 0) {
> +        return 0;
> +    }
> +
> +    if (S_ISREG(stat_buf.st_mode)) {
> +        return stat_buf.st_size;
> +    }
> +
> +    if (S_ISBLK(stat_buf.st_mode) && !ioctl(fd, BLKGETSIZE64, &size)) {
> +        return size;
> +    }

#ifdef __linux__ for ioctl(fd, BLKGETSIZE64, &size)?

There is nothing Linux-specific about emulating NVDIMMs so this code
should compile on all platforms.

> +
> +    return 0;
> +}
> +
>  static void pc_nvdimm_realize(DeviceState *dev, Error **errp)
>  {
>      PCNVDIMMDevice *nvdimm = PC_NVDIMM(dev);
> +    char name[512];
> +    void *buf;
> +    ram_addr_t addr;
> +    uint64_t size, nvdimm_size, config_size = MIN_CONFIG_DATA_SIZE;
> +    int fd;
>  
>      if (!nvdimm->file) {
>          error_setg(errp, "file property is not set");
>      }

Missing return here.

> +
> +    fd = open(nvdimm->file, O_RDWR);

Does it make sense to support read-only NVDIMMs?

It could be handy for sharing a read-only file between unprivileged
guests.  The permissions on the file would only allow read, not write.

> +    if (fd < 0) {
> +        error_setg(errp, "can not open %s", nvdimm->file);

s/can not/cannot/

> +        return;
> +    }
> +
> +    size = get_file_size(fd);
> +    buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

I guess the user will want to choose between MAP_SHARED and MAP_PRIVATE.
This can be added in the future.

> +    if (buf == MAP_FAILED) {
> +        error_setg(errp, "can not do mmap on %s", nvdimm->file);
> +        goto do_close;
> +    }
> +
> +    nvdimm->config_data_size = config_size;
> +    if (nvdimm->configdata) {
> +        /* reserve MIN_CONFIGDATA_AREA_SIZE for configue data. */
> +        nvdimm_size = size - config_size;
> +        nvdimm->config_data_addr = buf + nvdimm_size;
> +    } else {
> +        nvdimm_size = size;
> +        nvdimm->config_data_addr = NULL;
> +    }
> +
> +    if ((int64_t)nvdimm_size <= 0) {

The error cases can be detected before mmap(2).  That avoids the int64_t
cast and also avoids nvdimm_size underflow and the bogus
nvdimm->config_data_addr calculation above.

size = get_file_size(fd);
if (size == 0) {
    error_setg(errp, "empty file or unable to get file size");
    goto do_close;
} else if (nvdimm->configdata && size < config_size) {{
    error_setg(errp, "file size is too small to store NVDIMM"
                     " configure data");
    goto do_close;
}

> +        error_setg(errp, "file size is too small to store NVDIMM"
> +                         " configure data");
> +        goto do_unmap;
> +    }
> +
> +    addr = reserved_range_push(nvdimm_size);
> +    if (!addr) {
> +        error_setg(errp, "do not have enough space for size %#lx.\n", size);

error_setg() messages must not have a newline at the end.

Please use "%#" PRIx64 instead of "%#lx" so compilation works on 32-bit
hosts where sizeof(long) == 4.

> +        goto do_unmap;
> +    }
> +
> +    nvdimm->device_index = new_device_index();
> +    sprintf(name, "NVDIMM-%d", nvdimm->device_index);
> +    memory_region_init_ram_ptr(&nvdimm->mr, OBJECT(dev), name, nvdimm_size,
> +                               buf);

How is the autogenerated name used?

Why not just use "pc-nvdimm.memory"?

> +    vmstate_register_ram(&nvdimm->mr, DEVICE(dev));
> +    memory_region_add_subregion(get_system_memory(), addr, &nvdimm->mr);
> +
> +    return;

fd is leaked.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Aug. 26, 2015, 10:40 a.m. UTC | #2
On 08/26/2015 12:03 AM, Stefan Hajnoczi wrote:
> On Fri, Aug 14, 2015 at 10:52:01PM +0800, Xiao Guangrong wrote:
>> The parameter @file is used as backed memory for NVDIMM which is
>> divided into two parts if @dataconfig is true:
>
> s/dataconfig/configdata/

Stupid typo, sorry.

>
>> @@ -76,13 +109,87 @@ static void pc_nvdimm_init(Object *obj)
>>                                set_configdata, NULL);
>>   }
>>
>> +static uint64_t get_file_size(int fd)
>> +{
>> +    struct stat stat_buf;
>> +    uint64_t size;
>> +
>> +    if (fstat(fd, &stat_buf) < 0) {
>> +        return 0;
>> +    }
>> +
>> +    if (S_ISREG(stat_buf.st_mode)) {
>> +        return stat_buf.st_size;
>> +    }
>> +
>> +    if (S_ISBLK(stat_buf.st_mode) && !ioctl(fd, BLKGETSIZE64, &size)) {
>> +        return size;
>> +    }
>
> #ifdef __linux__ for ioctl(fd, BLKGETSIZE64, &size)?
>
> There is nothing Linux-specific about emulating NVDIMMs so this code
> should compile on all platforms.

Right. I have no idea for how block devices work on other platforms so
I will only allow linux to directly use bock device file in the next
version.

>
>> +
>> +    return 0;
>> +}
>> +
>>   static void pc_nvdimm_realize(DeviceState *dev, Error **errp)
>>   {
>>       PCNVDIMMDevice *nvdimm = PC_NVDIMM(dev);
>> +    char name[512];
>> +    void *buf;
>> +    ram_addr_t addr;
>> +    uint64_t size, nvdimm_size, config_size = MIN_CONFIG_DATA_SIZE;
>> +    int fd;
>>
>>       if (!nvdimm->file) {
>>           error_setg(errp, "file property is not set");
>>       }
>
> Missing return here.

Will fix.

>
>> +
>> +    fd = open(nvdimm->file, O_RDWR);
>
> Does it make sense to support read-only NVDIMMs?
>
> It could be handy for sharing a read-only file between unprivileged
> guests.  The permissions on the file would only allow read, not write.

Make sense. Currently these patchset just implements "shared" mode so that
write permission is required, however, please see below:

>
>> +    if (fd < 0) {
>> +        error_setg(errp, "can not open %s", nvdimm->file);
>
> s/can not/cannot/
>
>> +        return;
>> +    }
>> +
>> +    size = get_file_size(fd);
>> +    buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>
> I guess the user will want to choose between MAP_SHARED and MAP_PRIVATE.
> This can be added in the future.

Good idea, it will allow guest to write data but discards its content after it
exits. Will implement O_RDONLY + MAP_PRIVATE in the near future.

>
>> +    if (buf == MAP_FAILED) {
>> +        error_setg(errp, "can not do mmap on %s", nvdimm->file);
>> +        goto do_close;
>> +    }
>> +
>> +    nvdimm->config_data_size = config_size;
>> +    if (nvdimm->configdata) {
>> +        /* reserve MIN_CONFIGDATA_AREA_SIZE for configue data. */
>> +        nvdimm_size = size - config_size;
>> +        nvdimm->config_data_addr = buf + nvdimm_size;
>> +    } else {
>> +        nvdimm_size = size;
>> +        nvdimm->config_data_addr = NULL;
>> +    }
>> +
>> +    if ((int64_t)nvdimm_size <= 0) {
>
> The error cases can be detected before mmap(2).  That avoids the int64_t
> cast and also avoids nvdimm_size underflow and the bogus
> nvdimm->config_data_addr calculation above.

Okay.

>
> size = get_file_size(fd);
> if (size == 0) {
>      error_setg(errp, "empty file or unable to get file size");
>      goto do_close;
> } else if (nvdimm->configdata && size < config_size) {{
>      error_setg(errp, "file size is too small to store NVDIMM"
>                       " configure data");
>      goto do_close;
> }
>
>> +        error_setg(errp, "file size is too small to store NVDIMM"
>> +                         " configure data");
>> +        goto do_unmap;
>> +    }
>> +
>> +    addr = reserved_range_push(nvdimm_size);
>> +    if (!addr) {
>> +        error_setg(errp, "do not have enough space for size %#lx.\n", size);
>
> error_setg() messages must not have a newline at the end.
>
> Please use "%#" PRIx64 instead of "%#lx" so compilation works on 32-bit
> hosts where sizeof(long) == 4.

Good catch.

>
>> +        goto do_unmap;
>> +    }
>> +
>> +    nvdimm->device_index = new_device_index();
>> +    sprintf(name, "NVDIMM-%d", nvdimm->device_index);
>> +    memory_region_init_ram_ptr(&nvdimm->mr, OBJECT(dev), name, nvdimm_size,
>> +                               buf);
>
> How is the autogenerated name used?
>
> Why not just use "pc-nvdimm.memory"?

Ah. Just for debug proposal :) and i am not sure if a name used for multiple
MRs (MemoryRegion) is a good idea.

>
>> +    vmstate_register_ram(&nvdimm->mr, DEVICE(dev));
>> +    memory_region_add_subregion(get_system_memory(), addr, &nvdimm->mr);
>> +
>> +    return;
>
> fd is leaked.

Will fix.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stefan Hajnoczi Aug. 28, 2015, 11:58 a.m. UTC | #3
On Wed, Aug 26, 2015 at 06:40:26PM +0800, Xiao Guangrong wrote:
> On 08/26/2015 12:03 AM, Stefan Hajnoczi wrote:
> >On Fri, Aug 14, 2015 at 10:52:01PM +0800, Xiao Guangrong wrote:
> >
> >>+    if (fd < 0) {
> >>+        error_setg(errp, "can not open %s", nvdimm->file);
> >
> >s/can not/cannot/
> >
> >>+        return;
> >>+    }
> >>+
> >>+    size = get_file_size(fd);
> >>+    buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> >
> >I guess the user will want to choose between MAP_SHARED and MAP_PRIVATE.
> >This can be added in the future.
> 
> Good idea, it will allow guest to write data but discards its content after it
> exits. Will implement O_RDONLY + MAP_PRIVATE in the near future.

Great.

> >>+        goto do_unmap;
> >>+    }
> >>+
> >>+    nvdimm->device_index = new_device_index();
> >>+    sprintf(name, "NVDIMM-%d", nvdimm->device_index);
> >>+    memory_region_init_ram_ptr(&nvdimm->mr, OBJECT(dev), name, nvdimm_size,
> >>+                               buf);
> >
> >How is the autogenerated name used?
> >
> >Why not just use "pc-nvdimm.memory"?
> 
> Ah. Just for debug proposal :) and i am not sure if a name used for multiple
> MRs (MemoryRegion) is a good idea.

Other devices use a constant name too (git grep
memory_region_init_ram_ptr) so it seems to be okay.  The unique thing is
the OBJECT(dev) which differs for each NVDIMM instance.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Aug. 31, 2015, 6:23 a.m. UTC | #4
Hi Stefan,

On 08/28/2015 07:58 PM, Stefan Hajnoczi wrote:

>
>>>> +        goto do_unmap;
>>>> +    }
>>>> +
>>>> +    nvdimm->device_index = new_device_index();
>>>> +    sprintf(name, "NVDIMM-%d", nvdimm->device_index);
>>>> +    memory_region_init_ram_ptr(&nvdimm->mr, OBJECT(dev), name, nvdimm_size,
>>>> +                               buf);
>>>
>>> How is the autogenerated name used?
>>>
>>> Why not just use "pc-nvdimm.memory"?
>>
>> Ah. Just for debug proposal :) and i am not sure if a name used for multiple
>> MRs (MemoryRegion) is a good idea.
>
> Other devices use a constant name too (git grep
> memory_region_init_ram_ptr) so it seems to be okay.  The unique thing is
> the OBJECT(dev) which differs for each NVDIMM instance.
>

When I was digging into live migration code, i noticed that the same MR name may
cause the name "idstr", please refer to qemu_ram_set_idstr().

Since nvdimm devices do not have parent-bus, it will trigger the abort() in that
function.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Stefan Hajnoczi Sept. 1, 2015, 9:14 a.m. UTC | #5
On Mon, Aug 31, 2015 at 02:23:43PM +0800, Xiao Guangrong wrote:
> 
> Hi Stefan,
> 
> On 08/28/2015 07:58 PM, Stefan Hajnoczi wrote:
> 
> >
> >>>>+        goto do_unmap;
> >>>>+    }
> >>>>+
> >>>>+    nvdimm->device_index = new_device_index();
> >>>>+    sprintf(name, "NVDIMM-%d", nvdimm->device_index);
> >>>>+    memory_region_init_ram_ptr(&nvdimm->mr, OBJECT(dev), name, nvdimm_size,
> >>>>+                               buf);
> >>>
> >>>How is the autogenerated name used?
> >>>
> >>>Why not just use "pc-nvdimm.memory"?
> >>
> >>Ah. Just for debug proposal :) and i am not sure if a name used for multiple
> >>MRs (MemoryRegion) is a good idea.
> >
> >Other devices use a constant name too (git grep
> >memory_region_init_ram_ptr) so it seems to be okay.  The unique thing is
> >the OBJECT(dev) which differs for each NVDIMM instance.
> >
> 
> When I was digging into live migration code, i noticed that the same MR name may
> cause the name "idstr", please refer to qemu_ram_set_idstr().
> 
> Since nvdimm devices do not have parent-bus, it will trigger the abort() in that
> function.

I see.  The other devices that use a constant name are on a bus so the
abort doesn't trigger.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Igor Mammedov Sept. 7, 2015, 2:11 p.m. UTC | #6
On Fri, 14 Aug 2015 22:52:01 +0800
Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:

> The parameter @file is used as backed memory for NVDIMM which is
> divided into two parts if @dataconfig is true:
> - first parts is (0, size - 128K], which is used as PMEM (Persistent
>   Memory)
> - 128K at the end of the file, which is used as Config Data Area, it's
>   used to store Label namespace data
> 
> The @file supports both regular file and block device, of course we
> can assign any these two kinds of files for test and emulation, however,
> in the real word for performance reason, we usually used these files as
> NVDIMM backed file:
> - the regular file in the filesystem with DAX enabled created on NVDIMM
>   device on host
> - the raw PMEM device on host, e,g /dev/pmem0

A lot of code in this series could reuse what QEMU already
uses for implementing pc-dimm devices.

here is common concepts that could be reused.
  - on physical system both DIMM and NVDIMM devices use
    the same slots. We could share QEMU's '-m slots' option between
    both devices. An alternative to not sharing would be to introduce
    '-machine nvdimm_slots' option.
    And yes, we need to know number of NVDIMMs to describe
    them all in ACPI table (taking in amount future hotplug
    include in this possible NVDIMM devices)
    I'd go the same way as on real hardware on make them share the same slots.
  - they share the same physical address space and limits
    on how much memory system can handle. So I'd suggest sharing existing
    '-m maxmem' option and reuse hotplug_memory address space.

Essentially what I'm suggesting is to inherit NVDIMM's implementation
from pc-dimm reusing all of its code/backends and
just override parts that do memory mapping into guest's address space to
accommodate NVDIMM's requirements.

> 
> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> ---
>  hw/mem/nvdimm/pc-nvdimm.c  | 109 ++++++++++++++++++++++++++++++++++++++++++++-
>  include/hw/mem/pc-nvdimm.h |   7 +++
>  2 files changed, 115 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/mem/nvdimm/pc-nvdimm.c b/hw/mem/nvdimm/pc-nvdimm.c
> index 7a270a8..97710d1 100644
> --- a/hw/mem/nvdimm/pc-nvdimm.c
> +++ b/hw/mem/nvdimm/pc-nvdimm.c
> @@ -22,12 +22,20 @@
>   * License along with this library; if not, see <http://www.gnu.org/licenses/>
>   */
>  
> +#include <sys/mman.h>
> +#include <sys/ioctl.h>
> +#include <linux/fs.h>
> +
> +#include "exec/address-spaces.h"
>  #include "hw/mem/pc-nvdimm.h"
>  
> -#define PAGE_SIZE      (1UL << 12)
> +#define PAGE_SIZE               (1UL << 12)
> +
> +#define MIN_CONFIG_DATA_SIZE    (128 << 10)
>  
>  static struct nvdimms_info {
>      ram_addr_t current_addr;
> +    int device_index;
>  } nvdimms_info;
>  
>  /* the address range [offset, ~0ULL) is reserved for NVDIMM. */
> @@ -37,6 +45,26 @@ void pc_nvdimm_reserve_range(ram_addr_t offset)
>      nvdimms_info.current_addr = offset;
>  }
>  
> +static ram_addr_t reserved_range_push(uint64_t size)
> +{
> +    uint64_t current;
> +
> +    current = ROUND_UP(nvdimms_info.current_addr, PAGE_SIZE);
> +
> +    /* do not have enough space? */
> +    if (current + size < current) {
> +        return 0;
> +    }
> +
> +    nvdimms_info.current_addr = current + size;
> +    return current;
> +}
You can't use all memory above hotplug_memory area since
we have to tell guest where 64-bit PCI window starts,
and currently it should start at reserved-memory-end
(but it isn't due to a bug: I've just posted fix to qemu-devel
 "[PATCH 0/2] pc: fix 64-bit PCI window clashing with memory hotplug region"
)

> +
> +static uint32_t new_device_index(void)
> +{
> +    return nvdimms_info.device_index++;
> +}
> +
>  static char *get_file(Object *obj, Error **errp)
>  {
>      PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);
> @@ -48,6 +76,11 @@ static void set_file(Object *obj, const char *str, Error **errp)
>  {
>      PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);
>  
> +    if (memory_region_size(&nvdimm->mr)) {
> +        error_setg(errp, "cannot change property value");
> +        return;
> +    }
> +
>      if (nvdimm->file) {
>          g_free(nvdimm->file);
>      }
> @@ -76,13 +109,87 @@ static void pc_nvdimm_init(Object *obj)
>                               set_configdata, NULL);
>  }
>  
> +static uint64_t get_file_size(int fd)
> +{
> +    struct stat stat_buf;
> +    uint64_t size;
> +
> +    if (fstat(fd, &stat_buf) < 0) {
> +        return 0;
> +    }
> +
> +    if (S_ISREG(stat_buf.st_mode)) {
> +        return stat_buf.st_size;
> +    }
> +
> +    if (S_ISBLK(stat_buf.st_mode) && !ioctl(fd, BLKGETSIZE64, &size)) {
> +        return size;
> +    }
> +
> +    return 0;
> +}
All this file stuff I'd leave to already existing backends like
memory-backend-file or even memory-backend-ram which already do
above and more allowing to configure persistent and volatile
NVDIMMs without changing NVDIMM fronted code.

> +
>  static void pc_nvdimm_realize(DeviceState *dev, Error **errp)
>  {
>      PCNVDIMMDevice *nvdimm = PC_NVDIMM(dev);
> +    char name[512];
> +    void *buf;
> +    ram_addr_t addr;
> +    uint64_t size, nvdimm_size, config_size = MIN_CONFIG_DATA_SIZE;
> +    int fd;
>  
>      if (!nvdimm->file) {
>          error_setg(errp, "file property is not set");
>      }
> +
> +    fd = open(nvdimm->file, O_RDWR);
> +    if (fd < 0) {
> +        error_setg(errp, "can not open %s", nvdimm->file);
> +        return;
> +    }
> +
> +    size = get_file_size(fd);
> +    buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> +    if (buf == MAP_FAILED) {
> +        error_setg(errp, "can not do mmap on %s", nvdimm->file);
> +        goto do_close;
> +    }
> +
> +    nvdimm->config_data_size = config_size;
> +    if (nvdimm->configdata) {
> +        /* reserve MIN_CONFIGDATA_AREA_SIZE for configue data. */
> +        nvdimm_size = size - config_size;
> +        nvdimm->config_data_addr = buf + nvdimm_size;
> +    } else {
> +        nvdimm_size = size;
> +        nvdimm->config_data_addr = NULL;
> +    }
> +
> +    if ((int64_t)nvdimm_size <= 0) {
> +        error_setg(errp, "file size is too small to store NVDIMM"
> +                         " configure data");
> +        goto do_unmap;
> +    }
> +
> +    addr = reserved_range_push(nvdimm_size);
> +    if (!addr) {
> +        error_setg(errp, "do not have enough space for size %#lx.\n", size);
> +        goto do_unmap;
> +    }
> +
> +    nvdimm->device_index = new_device_index();
> +    sprintf(name, "NVDIMM-%d", nvdimm->device_index);
> +    memory_region_init_ram_ptr(&nvdimm->mr, OBJECT(dev), name, nvdimm_size,
> +                               buf);
> +    vmstate_register_ram(&nvdimm->mr, DEVICE(dev));
> +    memory_region_add_subregion(get_system_memory(), addr, &nvdimm->mr);
> +
> +    return;
> +
> +do_unmap:
> +    munmap(buf, size);
> +do_close:
> +    close(fd);
>  }
>  
>  static void pc_nvdimm_class_init(ObjectClass *oc, void *data)
> diff --git a/include/hw/mem/pc-nvdimm.h b/include/hw/mem/pc-nvdimm.h
> index 8601e9b..f617fd2 100644
> --- a/include/hw/mem/pc-nvdimm.h
> +++ b/include/hw/mem/pc-nvdimm.h
> @@ -21,6 +21,13 @@ typedef struct PCNVDIMMDevice {
>  
>      char *file;
>      bool configdata;
> +
> +    int device_index;
> +
> +    uint64_t config_data_size;
> +    void *config_data_addr;
> +
> +    MemoryRegion mr;
>  } PCNVDIMMDevice;
>  
>  #define TYPE_PC_NVDIMM "pc-nvdimm"

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Sept. 8, 2015, 1:38 p.m. UTC | #7
On 09/07/2015 10:11 PM, Igor Mammedov wrote:
> On Fri, 14 Aug 2015 22:52:01 +0800
> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>
>> The parameter @file is used as backed memory for NVDIMM which is
>> divided into two parts if @dataconfig is true:
>> - first parts is (0, size - 128K], which is used as PMEM (Persistent
>>    Memory)
>> - 128K at the end of the file, which is used as Config Data Area, it's
>>    used to store Label namespace data
>>
>> The @file supports both regular file and block device, of course we
>> can assign any these two kinds of files for test and emulation, however,
>> in the real word for performance reason, we usually used these files as
>> NVDIMM backed file:
>> - the regular file in the filesystem with DAX enabled created on NVDIMM
>>    device on host
>> - the raw PMEM device on host, e,g /dev/pmem0
>
> A lot of code in this series could reuse what QEMU already
> uses for implementing pc-dimm devices.
>
> here is common concepts that could be reused.
>    - on physical system both DIMM and NVDIMM devices use
>      the same slots. We could share QEMU's '-m slots' option between
>      both devices. An alternative to not sharing would be to introduce
>      '-machine nvdimm_slots' option.
>      And yes, we need to know number of NVDIMMs to describe
>      them all in ACPI table (taking in amount future hotplug
>      include in this possible NVDIMM devices)
>      I'd go the same way as on real hardware on make them share the same slots.

I'd prefer sharing slots for pc-dimm and nvdimm, it's easier to reuse the
logic of slot-assignment and plug/unplug.

>    - they share the same physical address space and limits
>      on how much memory system can handle. So I'd suggest sharing existing
>      '-m maxmem' option and reuse hotplug_memory address space.

Sounds good to me.

>
> Essentially what I'm suggesting is to inherit NVDIMM's implementation
> from pc-dimm reusing all of its code/backends and
> just override parts that do memory mapping into guest's address space to
> accommodate NVDIMM's requirements.

Good idea!

We have to differentiate pc-dimm and nvdimm in the common code and nvdimm
has different points with pc-dimm (for example, its has reserved-region, and
need support live migration of label data). How about rename 'pc-nvdimm' to
'memory-device' and make it as a common device type, then build pc-dimm and
nvdimm on top of it?

Something like:
static TypeInfo memory_device_info = {
     .name          = TYPE_MEM_DEV,
     .parent        = TYPE_DEVICE,
};

static TypeInfo memory_device_info = {
.name = TYPE_PC_DIMM,
.parent = TYPE_MEM_DEV,
};

static TypeInfo memory_device_info = {
.name = TYPE_NVDIMM,
.parent = TYPE_MEM_DEV,
};

It also make CONIFG_NVDIMM and CONFIG_HOT_PLUG be independent.

>
>>
>> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
>> ---
>>   hw/mem/nvdimm/pc-nvdimm.c  | 109 ++++++++++++++++++++++++++++++++++++++++++++-
>>   include/hw/mem/pc-nvdimm.h |   7 +++
>>   2 files changed, 115 insertions(+), 1 deletion(-)
>>
>> diff --git a/hw/mem/nvdimm/pc-nvdimm.c b/hw/mem/nvdimm/pc-nvdimm.c
>> index 7a270a8..97710d1 100644
>> --- a/hw/mem/nvdimm/pc-nvdimm.c
>> +++ b/hw/mem/nvdimm/pc-nvdimm.c
>> @@ -22,12 +22,20 @@
>>    * License along with this library; if not, see <http://www.gnu.org/licenses/>
>>    */
>>
>> +#include <sys/mman.h>
>> +#include <sys/ioctl.h>
>> +#include <linux/fs.h>
>> +
>> +#include "exec/address-spaces.h"
>>   #include "hw/mem/pc-nvdimm.h"
>>
>> -#define PAGE_SIZE      (1UL << 12)
>> +#define PAGE_SIZE               (1UL << 12)
>> +
>> +#define MIN_CONFIG_DATA_SIZE    (128 << 10)
>>
>>   static struct nvdimms_info {
>>       ram_addr_t current_addr;
>> +    int device_index;
>>   } nvdimms_info;
>>
>>   /* the address range [offset, ~0ULL) is reserved for NVDIMM. */
>> @@ -37,6 +45,26 @@ void pc_nvdimm_reserve_range(ram_addr_t offset)
>>       nvdimms_info.current_addr = offset;
>>   }
>>
>> +static ram_addr_t reserved_range_push(uint64_t size)
>> +{
>> +    uint64_t current;
>> +
>> +    current = ROUND_UP(nvdimms_info.current_addr, PAGE_SIZE);
>> +
>> +    /* do not have enough space? */
>> +    if (current + size < current) {
>> +        return 0;
>> +    }
>> +
>> +    nvdimms_info.current_addr = current + size;
>> +    return current;
>> +}
> You can't use all memory above hotplug_memory area since
> we have to tell guest where 64-bit PCI window starts,
> and currently it should start at reserved-memory-end
> (but it isn't due to a bug: I've just posted fix to qemu-devel
>   "[PATCH 0/2] pc: fix 64-bit PCI window clashing with memory hotplug region"
> )

Ah, got it, thanks for you pointing it out.

>
>> +
>> +static uint32_t new_device_index(void)
>> +{
>> +    return nvdimms_info.device_index++;
>> +}
>> +
>>   static char *get_file(Object *obj, Error **errp)
>>   {
>>       PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);
>> @@ -48,6 +76,11 @@ static void set_file(Object *obj, const char *str, Error **errp)
>>   {
>>       PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);
>>
>> +    if (memory_region_size(&nvdimm->mr)) {
>> +        error_setg(errp, "cannot change property value");
>> +        return;
>> +    }
>> +
>>       if (nvdimm->file) {
>>           g_free(nvdimm->file);
>>       }
>> @@ -76,13 +109,87 @@ static void pc_nvdimm_init(Object *obj)
>>                                set_configdata, NULL);
>>   }
>>
>> +static uint64_t get_file_size(int fd)
>> +{
>> +    struct stat stat_buf;
>> +    uint64_t size;
>> +
>> +    if (fstat(fd, &stat_buf) < 0) {
>> +        return 0;
>> +    }
>> +
>> +    if (S_ISREG(stat_buf.st_mode)) {
>> +        return stat_buf.st_size;
>> +    }
>> +
>> +    if (S_ISBLK(stat_buf.st_mode) && !ioctl(fd, BLKGETSIZE64, &size)) {
>> +        return size;
>> +    }
>> +
>> +    return 0;
>> +}
> All this file stuff I'd leave to already existing backends like
> memory-backend-file or even memory-backend-ram which already do
> above and more allowing to configure persistent and volatile
> NVDIMMs without changing NVDIMM fronted code.
>

The current memory backends use all memory size and map it to guest's
address space. However, nvdimm needs a reserved region for its label
data which is only accessed in Qemu.

How about introduce two parameters, "reserved_size" and "reserved_addr"
to TYPE_MEMORY_BACKEND, then only the memory region
[0, size - reserved_size) is mapped to guest and the remain part is
pointed by "reserved_addr"?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Igor Mammedov Sept. 10, 2015, 10:35 a.m. UTC | #8
On Tue, 8 Sep 2015 21:38:17 +0800
Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:

> 
> 
> On 09/07/2015 10:11 PM, Igor Mammedov wrote:
> > On Fri, 14 Aug 2015 22:52:01 +0800
> > Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
> >
> >> The parameter @file is used as backed memory for NVDIMM which is
> >> divided into two parts if @dataconfig is true:
> >> - first parts is (0, size - 128K], which is used as PMEM (Persistent
> >>    Memory)
> >> - 128K at the end of the file, which is used as Config Data Area, it's
> >>    used to store Label namespace data
> >>
> >> The @file supports both regular file and block device, of course we
> >> can assign any these two kinds of files for test and emulation, however,
> >> in the real word for performance reason, we usually used these files as
> >> NVDIMM backed file:
> >> - the regular file in the filesystem with DAX enabled created on NVDIMM
> >>    device on host
> >> - the raw PMEM device on host, e,g /dev/pmem0
> >
> > A lot of code in this series could reuse what QEMU already
> > uses for implementing pc-dimm devices.
> >
> > here is common concepts that could be reused.
> >    - on physical system both DIMM and NVDIMM devices use
> >      the same slots. We could share QEMU's '-m slots' option between
> >      both devices. An alternative to not sharing would be to introduce
> >      '-machine nvdimm_slots' option.
> >      And yes, we need to know number of NVDIMMs to describe
> >      them all in ACPI table (taking in amount future hotplug
> >      include in this possible NVDIMM devices)
> >      I'd go the same way as on real hardware on make them share the same slots.
> 
> I'd prefer sharing slots for pc-dimm and nvdimm, it's easier to reuse the
> logic of slot-assignment and plug/unplug.
> 
> >    - they share the same physical address space and limits
> >      on how much memory system can handle. So I'd suggest sharing existing
> >      '-m maxmem' option and reuse hotplug_memory address space.
> 
> Sounds good to me.
> 
> >
> > Essentially what I'm suggesting is to inherit NVDIMM's implementation
> > from pc-dimm reusing all of its code/backends and
> > just override parts that do memory mapping into guest's address space to
> > accommodate NVDIMM's requirements.
> 
> Good idea!
> 
> We have to differentiate pc-dimm and nvdimm in the common code and nvdimm
> has different points with pc-dimm (for example, its has reserved-region, and
> need support live migration of label data). How about rename 'pc-nvdimm' to
> 'memory-device' and make it as a common device type, then build pc-dimm and
> nvdimm on top of it?
sound good, only I'd call it just 'dimm' as 'memory-device' is too broad.
Also I'd make base class abstract.

> 
> Something like:
> static TypeInfo memory_device_info = {
>      .name          = TYPE_MEM_DEV,
>      .parent        = TYPE_DEVICE,
> };
> 
> static TypeInfo memory_device_info = {
> .name = TYPE_PC_DIMM,
> .parent = TYPE_MEM_DEV,
> };
> 
> static TypeInfo memory_device_info = {
> .name = TYPE_NVDIMM,
> .parent = TYPE_MEM_DEV,
> };
> 
> It also make CONIFG_NVDIMM and CONFIG_HOT_PLUG be independent.
> 
> >
> >>
> >> Signed-off-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
> >> ---
> >>   hw/mem/nvdimm/pc-nvdimm.c  | 109 ++++++++++++++++++++++++++++++++++++++++++++-
> >>   include/hw/mem/pc-nvdimm.h |   7 +++
> >>   2 files changed, 115 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/hw/mem/nvdimm/pc-nvdimm.c b/hw/mem/nvdimm/pc-nvdimm.c
> >> index 7a270a8..97710d1 100644
> >> --- a/hw/mem/nvdimm/pc-nvdimm.c
> >> +++ b/hw/mem/nvdimm/pc-nvdimm.c
> >> @@ -22,12 +22,20 @@
> >>    * License along with this library; if not, see <http://www.gnu.org/licenses/>
> >>    */
> >>
> >> +#include <sys/mman.h>
> >> +#include <sys/ioctl.h>
> >> +#include <linux/fs.h>
> >> +
> >> +#include "exec/address-spaces.h"
> >>   #include "hw/mem/pc-nvdimm.h"
> >>
> >> -#define PAGE_SIZE      (1UL << 12)
> >> +#define PAGE_SIZE               (1UL << 12)
> >> +
> >> +#define MIN_CONFIG_DATA_SIZE    (128 << 10)
> >>
> >>   static struct nvdimms_info {
> >>       ram_addr_t current_addr;
> >> +    int device_index;
> >>   } nvdimms_info;
> >>
> >>   /* the address range [offset, ~0ULL) is reserved for NVDIMM. */
> >> @@ -37,6 +45,26 @@ void pc_nvdimm_reserve_range(ram_addr_t offset)
> >>       nvdimms_info.current_addr = offset;
> >>   }
> >>
> >> +static ram_addr_t reserved_range_push(uint64_t size)
> >> +{
> >> +    uint64_t current;
> >> +
> >> +    current = ROUND_UP(nvdimms_info.current_addr, PAGE_SIZE);
> >> +
> >> +    /* do not have enough space? */
> >> +    if (current + size < current) {
> >> +        return 0;
> >> +    }
> >> +
> >> +    nvdimms_info.current_addr = current + size;
> >> +    return current;
> >> +}
> > You can't use all memory above hotplug_memory area since
> > we have to tell guest where 64-bit PCI window starts,
> > and currently it should start at reserved-memory-end
> > (but it isn't due to a bug: I've just posted fix to qemu-devel
> >   "[PATCH 0/2] pc: fix 64-bit PCI window clashing with memory hotplug region"
> > )
> 
> Ah, got it, thanks for you pointing it out.
> 
> >
> >> +
> >> +static uint32_t new_device_index(void)
> >> +{
> >> +    return nvdimms_info.device_index++;
> >> +}
> >> +
> >>   static char *get_file(Object *obj, Error **errp)
> >>   {
> >>       PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);
> >> @@ -48,6 +76,11 @@ static void set_file(Object *obj, const char *str, Error **errp)
> >>   {
> >>       PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);
> >>
> >> +    if (memory_region_size(&nvdimm->mr)) {
> >> +        error_setg(errp, "cannot change property value");
> >> +        return;
> >> +    }
> >> +
> >>       if (nvdimm->file) {
> >>           g_free(nvdimm->file);
> >>       }
> >> @@ -76,13 +109,87 @@ static void pc_nvdimm_init(Object *obj)
> >>                                set_configdata, NULL);
> >>   }
> >>
> >> +static uint64_t get_file_size(int fd)
> >> +{
> >> +    struct stat stat_buf;
> >> +    uint64_t size;
> >> +
> >> +    if (fstat(fd, &stat_buf) < 0) {
> >> +        return 0;
> >> +    }
> >> +
> >> +    if (S_ISREG(stat_buf.st_mode)) {
> >> +        return stat_buf.st_size;
> >> +    }
> >> +
> >> +    if (S_ISBLK(stat_buf.st_mode) && !ioctl(fd, BLKGETSIZE64, &size)) {
> >> +        return size;
> >> +    }
> >> +
> >> +    return 0;
> >> +}
> > All this file stuff I'd leave to already existing backends like
> > memory-backend-file or even memory-backend-ram which already do
> > above and more allowing to configure persistent and volatile
> > NVDIMMs without changing NVDIMM fronted code.
> >
> 
> The current memory backends use all memory size and map it to guest's
> address space. However, nvdimm needs a reserved region for its label
> data which is only accessed in Qemu.
> 
> How about introduce two parameters, "reserved_size" and "reserved_addr"
> to TYPE_MEMORY_BACKEND, then only the memory region
> [0, size - reserved_size) is mapped to guest and the remain part is
> pointed by "reserved_addr"?
Looks like only "reserved_size" is sufficient if there aren't any plans
/reasons to keep labels area not at the end of NVDIMM.

Also keeping reserved area in the same backend => MemoryRegion should
automatically migrate it during live migration.
To separate and map guest visible part data part of backend's MemoryRegion
you could use memory_region_init_alias().
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 15, 2015, 4:06 p.m. UTC | #9
On 25/08/2015 18:03, Stefan Hajnoczi wrote:
>> >  
>> > +static uint64_t get_file_size(int fd)
>> > +{
>> > +    struct stat stat_buf;
>> > +    uint64_t size;
>> > +
>> > +    if (fstat(fd, &stat_buf) < 0) {
>> > +        return 0;
>> > +    }
>> > +
>> > +    if (S_ISREG(stat_buf.st_mode)) {
>> > +        return stat_buf.st_size;
>> > +    }
>> > +
>> > +    if (S_ISBLK(stat_buf.st_mode) && !ioctl(fd, BLKGETSIZE64, &size)) {
>> > +        return size;
>> > +    }
> #ifdef __linux__ for ioctl(fd, BLKGETSIZE64, &size)?
> 
> There is nothing Linux-specific about emulating NVDIMMs so this code
> should compile on all platforms.

The code from block/raw-posix.c and block/raw-win32.c's raw_getlength
should probably be extracted to a new function in utils/, and reused here.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 15, 2015, 4:07 p.m. UTC | #10
On 26/08/2015 12:40, Xiao Guangrong wrote:
>>>
>>> +
>>> +    size = get_file_size(fd);
>>> +    buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>>
>> I guess the user will want to choose between MAP_SHARED and MAP_PRIVATE.
>> This can be added in the future.
> 
> Good idea, it will allow guest to write data but discards its content
> after it exits. Will implement O_RDONLY + MAP_PRIVATE in the near future.

FWIW, if Igor's backend/frontend idea is implemented, the choice between
MAP_SHARED and MAP_PRIVATE should belong in the backend.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 15, 2015, 4:10 p.m. UTC | #11
On 01/09/2015 11:14, Stefan Hajnoczi wrote:
>> > 
>> > When I was digging into live migration code, i noticed that the same MR name may
>> > cause the name "idstr", please refer to qemu_ram_set_idstr().
>> > 
>> > Since nvdimm devices do not have parent-bus, it will trigger the abort() in that
>> > function.
> I see.  The other devices that use a constant name are on a bus so the
> abort doesn't trigger.

However, the MR name must be the same across the two machines.  Indices
are not friendly to hotplug.  Even though hotplug isn't supported now,
we should prepare and try not to change migration format when we support
hotplug in the future.

Is there any other fixed value that we can use, for example the base
address of the NVDIMM?

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 15, 2015, 4:11 p.m. UTC | #12
On 07/09/2015 16:11, Igor Mammedov wrote:
> 
> here is common concepts that could be reused.
>   - on physical system both DIMM and NVDIMM devices use
>     the same slots. We could share QEMU's '-m slots' option between
>     both devices. An alternative to not sharing would be to introduce
>     '-machine nvdimm_slots' option.
>     And yes, we need to know number of NVDIMMs to describe
>     them all in ACPI table (taking in amount future hotplug
>     include in this possible NVDIMM devices)
>     I'd go the same way as on real hardware on make them share the same slots.
>   - they share the same physical address space and limits
>     on how much memory system can handle. So I'd suggest sharing existing
>     '-m maxmem' option and reuse hotplug_memory address space.

I agree, and the slot number also provide a nice way to build a
consistent memory region name across multiple systems.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Sept. 17, 2015, 8:21 a.m. UTC | #13
On 09/16/2015 12:06 AM, Paolo Bonzini wrote:
>
>
> On 25/08/2015 18:03, Stefan Hajnoczi wrote:
>>>>
>>>> +static uint64_t get_file_size(int fd)
>>>> +{
>>>> +    struct stat stat_buf;
>>>> +    uint64_t size;
>>>> +
>>>> +    if (fstat(fd, &stat_buf) < 0) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    if (S_ISREG(stat_buf.st_mode)) {
>>>> +        return stat_buf.st_size;
>>>> +    }
>>>> +
>>>> +    if (S_ISBLK(stat_buf.st_mode) && !ioctl(fd, BLKGETSIZE64, &size)) {
>>>> +        return size;
>>>> +    }
>> #ifdef __linux__ for ioctl(fd, BLKGETSIZE64, &size)?
>>
>> There is nothing Linux-specific about emulating NVDIMMs so this code
>> should compile on all platforms.
>
> The code from block/raw-posix.c and block/raw-win32.c's raw_getlength
> should probably be extracted to a new function in utils/, and reused here.
>

The function you pointed out is really complex - it mixed 9 platforms and each
platform has its own specific implementation. It is hard for us to verify the
change.

I'd prefer to make it for Linux specific first then share it to other platforms
if it's needed in the future.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Sept. 17, 2015, 8:23 a.m. UTC | #14
On 09/16/2015 12:07 AM, Paolo Bonzini wrote:
>
>
> On 26/08/2015 12:40, Xiao Guangrong wrote:
>>>>
>>>> +
>>>> +    size = get_file_size(fd);
>>>> +    buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
>>>
>>> I guess the user will want to choose between MAP_SHARED and MAP_PRIVATE.
>>> This can be added in the future.
>>
>> Good idea, it will allow guest to write data but discards its content
>> after it exits. Will implement O_RDONLY + MAP_PRIVATE in the near future.
>
> FWIW, if Igor's backend/frontend idea is implemented, the choice between
> MAP_SHARED and MAP_PRIVATE should belong in the backend.

Yes. I can not agree with you more! :)
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Sept. 17, 2015, 8:39 a.m. UTC | #15
On 09/16/2015 12:10 AM, Paolo Bonzini wrote:
>
>
> On 01/09/2015 11:14, Stefan Hajnoczi wrote:
>>>>
>>>> When I was digging into live migration code, i noticed that the same MR name may
>>>> cause the name "idstr", please refer to qemu_ram_set_idstr().
>>>>
>>>> Since nvdimm devices do not have parent-bus, it will trigger the abort() in that
>>>> function.
>> I see.  The other devices that use a constant name are on a bus so the
>> abort doesn't trigger.
>
> However, the MR name must be the same across the two machines.  Indices
> are not friendly to hotplug.  Even though hotplug isn't supported now,
> we should prepare and try not to change migration format when we support
> hotplug in the future.
>

Thanks for your reminder.

> Is there any other fixed value that we can use, for example the base
> address of the NVDIMM?

How about use object_get_canonical_path(OBJECT(dev)) (the @dev is NVDIMM
device) ?
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Igor Mammedov Sept. 17, 2015, 9:04 a.m. UTC | #16
On Thu, 17 Sep 2015 16:39:12 +0800
Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:

> 
> 
> On 09/16/2015 12:10 AM, Paolo Bonzini wrote:
> >
> >
> > On 01/09/2015 11:14, Stefan Hajnoczi wrote:
> >>>>
> >>>> When I was digging into live migration code, i noticed that the same MR name may
> >>>> cause the name "idstr", please refer to qemu_ram_set_idstr().
> >>>>
> >>>> Since nvdimm devices do not have parent-bus, it will trigger the abort() in that
> >>>> function.
> >> I see.  The other devices that use a constant name are on a bus so the
> >> abort doesn't trigger.
> >
> > However, the MR name must be the same across the two machines.  Indices
> > are not friendly to hotplug.  Even though hotplug isn't supported now,
> > we should prepare and try not to change migration format when we support
> > hotplug in the future.
> >
> 
> Thanks for your reminder.
> 
> > Is there any other fixed value that we can use, for example the base
> > address of the NVDIMM?
> 
> How about use object_get_canonical_path(OBJECT(dev)) (the @dev is NVDIMM
> device) ?
if you use split backend/frotnend idea then existing backends
already have a stable name derived from backend's ID and you won't need to care
about it.

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Sept. 17, 2015, 9:14 a.m. UTC | #17
On 09/17/2015 05:04 PM, Igor Mammedov wrote:
> On Thu, 17 Sep 2015 16:39:12 +0800
> Xiao Guangrong <guangrong.xiao@linux.intel.com> wrote:
>
>>
>>
>> On 09/16/2015 12:10 AM, Paolo Bonzini wrote:
>>>
>>>
>>> On 01/09/2015 11:14, Stefan Hajnoczi wrote:
>>>>>>
>>>>>> When I was digging into live migration code, i noticed that the same MR name may
>>>>>> cause the name "idstr", please refer to qemu_ram_set_idstr().
>>>>>>
>>>>>> Since nvdimm devices do not have parent-bus, it will trigger the abort() in that
>>>>>> function.
>>>> I see.  The other devices that use a constant name are on a bus so the
>>>> abort doesn't trigger.
>>>
>>> However, the MR name must be the same across the two machines.  Indices
>>> are not friendly to hotplug.  Even though hotplug isn't supported now,
>>> we should prepare and try not to change migration format when we support
>>> hotplug in the future.
>>>
>>
>> Thanks for your reminder.
>>
>>> Is there any other fixed value that we can use, for example the base
>>> address of the NVDIMM?
>>
>> How about use object_get_canonical_path(OBJECT(dev)) (the @dev is NVDIMM
>> device) ?
> if you use split backend/frotnend idea then existing backends
> already have a stable name derived from backend's ID and you won't need to care
> about it.
>

Yes, i am using this idea and addressing your suggestion that use
memory_region_init_alias() to partly map hostmem to guest's address
space.

The code is like this:

/* get the memory region from backend memory. */
mr = host_memory_backend_get_memory(dimm->hostmem, errp);

/* nvdimm_nr will map to guest address space. */
memory_region_init_alias(&nvdimm->nvdimm_mr, OBJECT(dev),
                          object_get_canonical_path(OBJECT(dev)), mr, 0,
                          size - nvdimm->label_size);

/* the label size at the end of the file used as label_data of NVDIMM. */
......

So there are two memory regions, one is the backend-mem and another one
is nvdimm_mr in the example above. The name i am worried about is the name
of nvdimm_mr.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Sept. 17, 2015, 9:34 a.m. UTC | #18
On 17/09/2015 11:14, Xiao Guangrong wrote:
> 
> 
> /* get the memory region from backend memory. */
> mr = host_memory_backend_get_memory(dimm->hostmem, errp);
> 
> /* nvdimm_nr will map to guest address space. */
> memory_region_init_alias(&nvdimm->nvdimm_mr, OBJECT(dev),
>                          object_get_canonical_path(OBJECT(dev)), mr, 0,
>                          size - nvdimm->label_size);

You can just use "memory" here for the name here.  The name only needs
to be unique for RAM memory regions, and dimm->hostmem will take care of it.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Sept. 17, 2015, 12:43 p.m. UTC | #19
On 09/17/2015 05:34 PM, Paolo Bonzini wrote:
>
>
> On 17/09/2015 11:14, Xiao Guangrong wrote:
>>
>>
>> /* get the memory region from backend memory. */
>> mr = host_memory_backend_get_memory(dimm->hostmem, errp);
>>
>> /* nvdimm_nr will map to guest address space. */
>> memory_region_init_alias(&nvdimm->nvdimm_mr, OBJECT(dev),
>>                           object_get_canonical_path(OBJECT(dev)), mr, 0,
>>                           size - nvdimm->label_size);
>
> You can just use "memory" here for the name here.  The name only needs
> to be unique for RAM memory regions, and dimm->hostmem will take care of it.
>

Okay. I will try it, thank you, Paolo.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/hw/mem/nvdimm/pc-nvdimm.c b/hw/mem/nvdimm/pc-nvdimm.c
index 7a270a8..97710d1 100644
--- a/hw/mem/nvdimm/pc-nvdimm.c
+++ b/hw/mem/nvdimm/pc-nvdimm.c
@@ -22,12 +22,20 @@ 
  * License along with this library; if not, see <http://www.gnu.org/licenses/>
  */
 
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <linux/fs.h>
+
+#include "exec/address-spaces.h"
 #include "hw/mem/pc-nvdimm.h"
 
-#define PAGE_SIZE      (1UL << 12)
+#define PAGE_SIZE               (1UL << 12)
+
+#define MIN_CONFIG_DATA_SIZE    (128 << 10)
 
 static struct nvdimms_info {
     ram_addr_t current_addr;
+    int device_index;
 } nvdimms_info;
 
 /* the address range [offset, ~0ULL) is reserved for NVDIMM. */
@@ -37,6 +45,26 @@  void pc_nvdimm_reserve_range(ram_addr_t offset)
     nvdimms_info.current_addr = offset;
 }
 
+static ram_addr_t reserved_range_push(uint64_t size)
+{
+    uint64_t current;
+
+    current = ROUND_UP(nvdimms_info.current_addr, PAGE_SIZE);
+
+    /* do not have enough space? */
+    if (current + size < current) {
+        return 0;
+    }
+
+    nvdimms_info.current_addr = current + size;
+    return current;
+}
+
+static uint32_t new_device_index(void)
+{
+    return nvdimms_info.device_index++;
+}
+
 static char *get_file(Object *obj, Error **errp)
 {
     PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);
@@ -48,6 +76,11 @@  static void set_file(Object *obj, const char *str, Error **errp)
 {
     PCNVDIMMDevice *nvdimm = PC_NVDIMM(obj);
 
+    if (memory_region_size(&nvdimm->mr)) {
+        error_setg(errp, "cannot change property value");
+        return;
+    }
+
     if (nvdimm->file) {
         g_free(nvdimm->file);
     }
@@ -76,13 +109,87 @@  static void pc_nvdimm_init(Object *obj)
                              set_configdata, NULL);
 }
 
+static uint64_t get_file_size(int fd)
+{
+    struct stat stat_buf;
+    uint64_t size;
+
+    if (fstat(fd, &stat_buf) < 0) {
+        return 0;
+    }
+
+    if (S_ISREG(stat_buf.st_mode)) {
+        return stat_buf.st_size;
+    }
+
+    if (S_ISBLK(stat_buf.st_mode) && !ioctl(fd, BLKGETSIZE64, &size)) {
+        return size;
+    }
+
+    return 0;
+}
+
 static void pc_nvdimm_realize(DeviceState *dev, Error **errp)
 {
     PCNVDIMMDevice *nvdimm = PC_NVDIMM(dev);
+    char name[512];
+    void *buf;
+    ram_addr_t addr;
+    uint64_t size, nvdimm_size, config_size = MIN_CONFIG_DATA_SIZE;
+    int fd;
 
     if (!nvdimm->file) {
         error_setg(errp, "file property is not set");
     }
+
+    fd = open(nvdimm->file, O_RDWR);
+    if (fd < 0) {
+        error_setg(errp, "can not open %s", nvdimm->file);
+        return;
+    }
+
+    size = get_file_size(fd);
+    buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+    if (buf == MAP_FAILED) {
+        error_setg(errp, "can not do mmap on %s", nvdimm->file);
+        goto do_close;
+    }
+
+    nvdimm->config_data_size = config_size;
+    if (nvdimm->configdata) {
+        /* reserve MIN_CONFIGDATA_AREA_SIZE for configue data. */
+        nvdimm_size = size - config_size;
+        nvdimm->config_data_addr = buf + nvdimm_size;
+    } else {
+        nvdimm_size = size;
+        nvdimm->config_data_addr = NULL;
+    }
+
+    if ((int64_t)nvdimm_size <= 0) {
+        error_setg(errp, "file size is too small to store NVDIMM"
+                         " configure data");
+        goto do_unmap;
+    }
+
+    addr = reserved_range_push(nvdimm_size);
+    if (!addr) {
+        error_setg(errp, "do not have enough space for size %#lx.\n", size);
+        goto do_unmap;
+    }
+
+    nvdimm->device_index = new_device_index();
+    sprintf(name, "NVDIMM-%d", nvdimm->device_index);
+    memory_region_init_ram_ptr(&nvdimm->mr, OBJECT(dev), name, nvdimm_size,
+                               buf);
+    vmstate_register_ram(&nvdimm->mr, DEVICE(dev));
+    memory_region_add_subregion(get_system_memory(), addr, &nvdimm->mr);
+
+    return;
+
+do_unmap:
+    munmap(buf, size);
+do_close:
+    close(fd);
 }
 
 static void pc_nvdimm_class_init(ObjectClass *oc, void *data)
diff --git a/include/hw/mem/pc-nvdimm.h b/include/hw/mem/pc-nvdimm.h
index 8601e9b..f617fd2 100644
--- a/include/hw/mem/pc-nvdimm.h
+++ b/include/hw/mem/pc-nvdimm.h
@@ -21,6 +21,13 @@  typedef struct PCNVDIMMDevice {
 
     char *file;
     bool configdata;
+
+    int device_index;
+
+    uint64_t config_data_size;
+    void *config_data_addr;
+
+    MemoryRegion mr;
 } PCNVDIMMDevice;
 
 #define TYPE_PC_NVDIMM "pc-nvdimm"