@@ -75,6 +75,11 @@ typedef struct privcmd_mmapbatch_v2 {
int __user *err; /* array of error codes */
} privcmd_mmapbatch_v2_t;
+struct privcmd_hcall_buf {
+ void *start;
+ size_t len;
+};
+
/*
* @cmd: IOCTL_PRIVCMD_HYPERCALL
* @arg: &privcmd_hypercall_t
@@ -89,4 +94,36 @@ typedef struct privcmd_mmapbatch_v2 {
#define IOCTL_PRIVCMD_MMAPBATCH_V2 \
_IOC(_IOC_NONE, 'P', 4, sizeof(privcmd_mmapbatch_v2_t))
+/*
+ * @cmd: IOCTL_PRIVCMD_HCALL_BUF_LOCK
+ * @arg: struct privcmd_hcall_buf *
+ * Return: 0 on success. On an error, -1 is returned and errno is set
+ * to EINVAL, ENOMEM, or EFAULT.
+ *
+ * Locks a memory buffer so it may be used in a hypercall. This is
+ * similar to mlock(2) but also prevents compaction/page migration.
+ *
+ * The buffers may have any alignment and size and may overlap other
+ * buffers.
+ *
+ * Locked buffers are unlocked with IOCTL_PRIVCMD_HCALL_BUF_UNLOCK or
+ * by closing the file handle.
+ */
+#define IOCTL_PRIVCMD_HCALL_BUF_LOCK \
+ _IOC(_IOC_NONE, 'P', 5, sizeof(struct privcmd_hcall_buf))
+
+/*
+ * @cmd: IOCTL_PRIVCMD_HCALL_BUF_UNLOCK
+ * @arg: struct privcmd_hcall_buf *
+ * Return: Always 0.
+ *
+ * Unlocks a memory buffer previously locked with
+ * IOCTL_PRIVCMD_HCALL_BUF_LOCK.
+ *
+ * It is not possible to partially unlock a buffer. i.e., the
+ * LOCK/UNLOCK must be exactly paired.
+ */
+#define IOCTL_PRIVCMD_HCALL_BUF_UNLOCK \
+ _IOC(_IOC_NONE, 'P', 6, sizeof(struct privcmd_hcall_buf))
+
#endif /* __LINUX_PUBLIC_PRIVCMD_H__ */
@@ -68,6 +68,8 @@ int osdep_hypercall(xencall_handle *xcall, privcmd_hypercall_t *hypercall)
return ioctl(xcall->fd, IOCTL_PRIVCMD_HYPERCALL, hypercall);
}
+static int have_hbuf_lock = 1;
+
void *osdep_alloc_pages(xencall_handle *xcall, size_t npages)
{
size_t size = npages * PAGE_SIZE;
@@ -84,7 +86,7 @@ void *osdep_alloc_pages(xencall_handle *xcall, size_t npages)
/* Do not copy the VMA to child process on fork. Avoid the page being COW
on hypercall. */
- rc = madvise(p, npages * PAGE_SIZE, MADV_DONTFORK);
+ rc = madvise(p, size, MADV_DONTFORK);
if ( rc < 0 )
{
PERROR("alloc_pages: madvise failed");
@@ -103,6 +105,33 @@ void *osdep_alloc_pages(xencall_handle *xcall, size_t npages)
*c = 0;
}
+ if ( have_hbuf_lock )
+ {
+ struct privcmd_hcall_buf hbuf;
+
+ hbuf.start = p;
+ hbuf.len = size;
+
+ rc = ioctl(xcall->fd, IOCTL_PRIVCMD_HCALL_BUF_LOCK, &hbuf);
+ if ( rc < 0 )
+ {
+ /*
+ * Older drivers return EINVAL if the ioctl was not
+ * supported.
+ */
+ if ( errno == ENOTTY || errno == EINVAL )
+ {
+ have_hbuf_lock = 0;
+ errno = 0;
+ }
+ else
+ {
+ PERROR("alloc_pages: lock failed");
+ goto out;
+ }
+ }
+ }
+
return p;
out:
@@ -114,11 +143,23 @@ out:
void osdep_free_pages(xencall_handle *xcall, void *ptr, size_t npages)
{
+ size_t size = npages * PAGE_SIZE;
int saved_errno = errno;
+
+ if ( have_hbuf_lock )
+ {
+ struct privcmd_hcall_buf hbuf;
+
+ hbuf.start = ptr;
+ hbuf.len = size;
+
+ ioctl(xcall->fd, IOCTL_PRIVCMD_HCALL_BUF_UNLOCK, &hbuf);
+ }
+
/* Recover the VMA flags. Maybe it's not necessary */
- madvise(ptr, npages * PAGE_SIZE, MADV_DOFORK);
+ madvise(ptr, size, MADV_DOFORK);
- munmap(ptr, npages * PAGE_SIZE);
+ munmap(ptr, size);
/* We MUST propagate the hypercall errno, not unmap call's. */
errno = saved_errno;
}
Using just mlock'd buffers for hypercalls is not sufficient as these are still subject to compaction and page migration. Use the new IOCTL_PRIVCMD_HCALL_BUF_LOCK and IOCTL_PRIVCMD_HCALL_BUF_UNLOCK ioctls provided by the privcmd driver to prevent this. Since not all kernels support these ioctls, don't repeatedly try these ioctls if they are unsupported. MAP_LOCKED is still used as this places the pages on the unevictable list avoiding the need for the VM subsystem to scan them. madvise(.., MADV_DONTFORK) is still required since we still need to prevent children getting CoW mappings of the hypercall buffers. Signed-off-by: David Vrabel <david.vrabel@citrix.com> --- tools/include/xen-sys/Linux/privcmd.h | 37 +++++++++++++++++++++++++++ tools/libs/call/linux.c | 47 ++++++++++++++++++++++++++++++++--- 2 files changed, 81 insertions(+), 3 deletions(-)