Message ID | a13b23ca540a8846891895462d2fb139ec597237.1609231374.git.eafanasova@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Introduce MMIO/PIO dispatch file descriptors (ioregionfd) | expand |
On Tue, Dec 29, 2020 at 01:02:44PM +0300, Elena Afanasova wrote: > Signed-off-by: Elena Afanasova <eafanasova@gmail.com> > --- > virt/kvm/ioregion.c | 157 ++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 157 insertions(+) > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c > index a200c3761343..8523f4126337 100644 > --- a/virt/kvm/ioregion.c > +++ b/virt/kvm/ioregion.c > @@ -4,6 +4,33 @@ > #include <kvm/iodev.h> > #include "eventfd.h" > > +/* Wire protocol */ > +struct ioregionfd_cmd { > + __u32 info; > + __u32 padding; > + __u64 user_data; > + __u64 offset; > + __u64 data; > +}; > + > +struct ioregionfd_resp { > + __u64 data; > + __u8 pad[24]; > +}; > + > +#define IOREGIONFD_CMD_READ 0 > +#define IOREGIONFD_CMD_WRITE 1 > + > +#define IOREGIONFD_SIZE_8BIT 0 > +#define IOREGIONFD_SIZE_16BIT 1 > +#define IOREGIONFD_SIZE_32BIT 2 > +#define IOREGIONFD_SIZE_64BIT 3 > + > +#define IOREGIONFD_SIZE_OFFSET 4 > +#define IOREGIONFD_RESP_OFFSET 6 > +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET) > +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET) These belong in the uapi header so userspace also has struct ioregionfd_cmd, struct ioregionfd_resp, etc. > + > void > kvm_ioregionfd_init(struct kvm *kvm) > { > @@ -38,10 +65,100 @@ ioregion_release(struct ioregion *p) > kfree(p); > } > > +static bool > +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, bool resp, > + u64 user_data, const void *val) > +{ > + u64 size = 0; > + > + switch (len) { > + case 1: > + size = IOREGIONFD_SIZE_8BIT; > + *((u8 *)&cmd->data) = val ? *(u8 *)val : 0; > + break; > + case 2: > + size = IOREGIONFD_SIZE_16BIT; > + *((u16 *)&cmd->data) = val ? *(u16 *)val : 0; > + break; > + case 4: > + size = IOREGIONFD_SIZE_32BIT; > + *((u32 *)&cmd->data) = val ? *(u32 *)val : 0; > + break; > + case 8: > + size = IOREGIONFD_SIZE_64BIT; > + *((u64 *)&cmd->data) = val ? *(u64 *)val : 0; > + break; > + default: > + return false; > The assignments and casts can be replaced with a single memcpy after the switch statement. This is also how KVM_EXIT_MMIO and Coalesced MMIO do it: memcpy(cmd->data, val, len); However, we need to make sure that cmd has been zeroed so that kernel memory is not accidentally exposed to userspace. + } > + cmd->user_data = user_data; > + cmd->offset = offset; > + cmd->info |= opt; > + cmd->info |= IOREGIONFD_SIZE(size); > + cmd->info |= IOREGIONFD_RESP(resp); > + > + return true; > +} > + > static int > ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, > int len, void *val) > { > + struct ioregion *p = to_ioregion(this); > + struct ioregionfd_cmd *cmd; > + struct ioregionfd_resp *resp; > + size_t buf_size; > + void *buf; > + int ret = 0; > + > + if ((p->rf->f_flags & O_NONBLOCK) || (p->wf->f_flags & O_NONBLOCK)) > + return -EINVAL; Another CPU could change file descriptor flags while we are running. Therefore it might be simplest to handle kernel_write() and kernel_read() -EAGAIN return values instead of trying to check. > + if ((addr + len - 1) > (p->paddr + p->size - 1)) > + return -EINVAL; > + > + buf_size = max_t(size_t, sizeof(*cmd), sizeof(*resp)); > + buf = kzalloc(buf_size, GFP_KERNEL); > + if (!buf) > + return -ENOMEM; > + cmd = (struct ioregionfd_cmd *)buf; > + resp = (struct ioregionfd_resp *)buf; I think they are small enough to declare them on the stack: union { struct ioregionfd_cmd cmd; struct ioregionfd_resp resp; } buf; memset(&buf, 0, sizeof(buf)); > + if (!pack_cmd(cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ, > + 1, p->user_data, NULL)) { > + kfree(buf); > + return -EOPNOTSUPP; > + } > + > + ret = kernel_write(p->wf, cmd, sizeof(*cmd), 0); > + if (ret != sizeof(*cmd)) { > + kfree(buf); > + return (ret < 0) ? ret : -EIO; > + } > + memset(buf, 0, buf_size); > + ret = kernel_read(p->rf, resp, sizeof(*resp), 0); > + if (ret != sizeof(*resp)) { > + kfree(buf); > + return (ret < 0) ? ret : -EIO; > + } > + > + switch (len) { > + case 1: > + *(u8 *)val = (u8)resp->data; > + break; > + case 2: > + *(u16 *)val = (u16)resp->data; > + break; > + case 4: > + *(u32 *)val = (u32)resp->data; > + break; > + case 8: > + *(u64 *)val = (u64)resp->data; > + break; > + default: > + break; > + } This looks inconsistent. cmd->data is treated as a packed u8/u16/u32/864 whereas resp->data is treated as u64? I was expecting memcpy(val, &resp->data, len) here not the u64 -> u8/u16/u32/u64 conversion.
On Tue, 2020-12-29 at 12:00 +0000, Stefan Hajnoczi wrote: > On Tue, Dec 29, 2020 at 01:02:44PM +0300, Elena Afanasova wrote: > > Signed-off-by: Elena Afanasova <eafanasova@gmail.com> > > --- > > virt/kvm/ioregion.c | 157 > > ++++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 157 insertions(+) > > > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c > > index a200c3761343..8523f4126337 100644 > > --- a/virt/kvm/ioregion.c > > +++ b/virt/kvm/ioregion.c > > @@ -4,6 +4,33 @@ > > #include <kvm/iodev.h> > > #include "eventfd.h" > > > > +/* Wire protocol */ > > +struct ioregionfd_cmd { > > + __u32 info; > > + __u32 padding; > > + __u64 user_data; > > + __u64 offset; > > + __u64 data; > > +}; > > + > > +struct ioregionfd_resp { > > + __u64 data; > > + __u8 pad[24]; > > +}; > > + > > +#define IOREGIONFD_CMD_READ 0 > > +#define IOREGIONFD_CMD_WRITE 1 > > + > > +#define IOREGIONFD_SIZE_8BIT 0 > > +#define IOREGIONFD_SIZE_16BIT 1 > > +#define IOREGIONFD_SIZE_32BIT 2 > > +#define IOREGIONFD_SIZE_64BIT 3 > > + > > +#define IOREGIONFD_SIZE_OFFSET 4 > > +#define IOREGIONFD_RESP_OFFSET 6 > > +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET) > > +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET) > > These belong in the uapi header so userspace also has struct > ioregionfd_cmd, struct ioregionfd_resp, etc. > I'll move the wire protocol to a separate uapi header > > + > > void > > kvm_ioregionfd_init(struct kvm *kvm) > > { > > @@ -38,10 +65,100 @@ ioregion_release(struct ioregion *p) > > kfree(p); > > } > > > > +static bool > > +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, > > bool resp, > > + u64 user_data, const void *val) > > +{ > > + u64 size = 0; > > + > > + switch (len) { > > + case 1: > > + size = IOREGIONFD_SIZE_8BIT; > > + *((u8 *)&cmd->data) = val ? *(u8 *)val : 0; > > + break; > > + case 2: > > + size = IOREGIONFD_SIZE_16BIT; > > + *((u16 *)&cmd->data) = val ? *(u16 *)val : 0; > > + break; > > + case 4: > > + size = IOREGIONFD_SIZE_32BIT; > > + *((u32 *)&cmd->data) = val ? *(u32 *)val : 0; > > + break; > > + case 8: > > + size = IOREGIONFD_SIZE_64BIT; > > + *((u64 *)&cmd->data) = val ? *(u64 *)val : 0; > > + break; > > + default: > > + return false; > > > > The assignments and casts can be replaced with a single memcpy after > the > switch statement. This is also how KVM_EXIT_MMIO and Coalesced MMIO > do > it: > > memcpy(cmd->data, val, len); > Thanks for pointing it out > However, we need to make sure that cmd has been zeroed so that kernel > memory is not accidentally exposed to userspace. > > + } > > + cmd->user_data = user_data; > > + cmd->offset = offset; > > + cmd->info |= opt; > > + cmd->info |= IOREGIONFD_SIZE(size); > > + cmd->info |= IOREGIONFD_RESP(resp); > > + > > + return true; > > +} > > + > > static int > > ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, > > gpa_t addr, > > int len, void *val) > > { > > + struct ioregion *p = to_ioregion(this); > > + struct ioregionfd_cmd *cmd; > > + struct ioregionfd_resp *resp; > > + size_t buf_size; > > + void *buf; > > + int ret = 0; > > + > > + if ((p->rf->f_flags & O_NONBLOCK) || (p->wf->f_flags & > > O_NONBLOCK)) > > + return -EINVAL; > > Another CPU could change file descriptor flags while we are running. > Therefore it might be simplest to handle kernel_write() and > kernel_read() -EAGAIN return values instead of trying to check. > Ok, I'll fix this > > + if ((addr + len - 1) > (p->paddr + p->size - 1)) > > + return -EINVAL; > > + > > + buf_size = max_t(size_t, sizeof(*cmd), sizeof(*resp)); > > + buf = kzalloc(buf_size, GFP_KERNEL); > > + if (!buf) > > + return -ENOMEM; > > + cmd = (struct ioregionfd_cmd *)buf; > > + resp = (struct ioregionfd_resp *)buf; > > I think they are small enough to declare them on the stack: > > union { > struct ioregionfd_cmd cmd; > struct ioregionfd_resp resp; > } buf; > > memset(&buf, 0, sizeof(buf)); > Ok > > + if (!pack_cmd(cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ, > > + 1, p->user_data, NULL)) { > > + kfree(buf); > > + return -EOPNOTSUPP; > > + } > > + > > + ret = kernel_write(p->wf, cmd, sizeof(*cmd), 0); > > + if (ret != sizeof(*cmd)) { > > + kfree(buf); > > + return (ret < 0) ? ret : -EIO; > > + } > > + memset(buf, 0, buf_size); > > + ret = kernel_read(p->rf, resp, sizeof(*resp), 0); > > + if (ret != sizeof(*resp)) { > > + kfree(buf); > > + return (ret < 0) ? ret : -EIO; > > + } > > + > > + switch (len) { > > + case 1: > > + *(u8 *)val = (u8)resp->data; > > + break; > > + case 2: > > + *(u16 *)val = (u16)resp->data; > > + break; > > + case 4: > > + *(u32 *)val = (u32)resp->data; > > + break; > > + case 8: > > + *(u64 *)val = (u64)resp->data; > > + break; > > + default: > > + break; > > + } > > This looks inconsistent. cmd->data is treated as a packed > u8/u16/u32/864 > whereas resp->data is treated as u64? > > I was expecting memcpy(val, &resp->data, len) here not the u64 -> > u8/u16/u32/u64 conversion. I'll fix this, thanks!
On 2020/12/29 下午6:02, Elena Afanasova wrote: > Signed-off-by: Elena Afanasova<eafanasova@gmail.com> > --- > virt/kvm/ioregion.c | 157 ++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 157 insertions(+) > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c > index a200c3761343..8523f4126337 100644 > --- a/virt/kvm/ioregion.c > +++ b/virt/kvm/ioregion.c > @@ -4,6 +4,33 @@ > #include <kvm/iodev.h> > #include "eventfd.h" > > +/* Wire protocol */ > +struct ioregionfd_cmd { > + __u32 info; > + __u32 padding; > + __u64 user_data; > + __u64 offset; > + __u64 data; > +}; > + I wonder do we need a seq in the protocol. It might be useful if we allow a pair of file descriptors to be used for multiple different ranges. Thanks > +struct ioregionfd_resp { > + __u64 data; > + __u8 pad[24]; > +};
On Thu, 2020-12-31 at 11:46 +0800, Jason Wang wrote: > On 2020/12/29 下午6:02, Elena Afanasova wrote: > > Signed-off-by: Elena Afanasova<eafanasova@gmail.com> > > --- > > virt/kvm/ioregion.c | 157 > > ++++++++++++++++++++++++++++++++++++++++++++ > > 1 file changed, 157 insertions(+) > > > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c > > index a200c3761343..8523f4126337 100644 > > --- a/virt/kvm/ioregion.c > > +++ b/virt/kvm/ioregion.c > > @@ -4,6 +4,33 @@ > > #include <kvm/iodev.h> > > #include "eventfd.h" > > > > +/* Wire protocol */ > > +struct ioregionfd_cmd { > > + __u32 info; > > + __u32 padding; > > + __u64 user_data; > > + __u64 offset; > > + __u64 data; > > +}; > > + > > I wonder do we need a seq in the protocol. It might be useful if we > allow a pair of file descriptors to be used for multiple different > ranges. > I think it might be helpful in the case of out-of-order requests. In the case of in order requests seq field seems not to be necessary since there will be cmds/replies serialization. I’ll include the synchronization code in a RFC v2 series. > Thanks > > > > +struct ioregionfd_resp { > > + __u64 data; > > + __u8 pad[24]; > > +};
On 2021/1/4 上午4:37, Elena Afanasova wrote: > On Thu, 2020-12-31 at 11:46 +0800, Jason Wang wrote: >> On 2020/12/29 下午6:02, Elena Afanasova wrote: >>> Signed-off-by: Elena Afanasova<eafanasova@gmail.com> >>> --- >>> virt/kvm/ioregion.c | 157 >>> ++++++++++++++++++++++++++++++++++++++++++++ >>> 1 file changed, 157 insertions(+) >>> >>> diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c >>> index a200c3761343..8523f4126337 100644 >>> --- a/virt/kvm/ioregion.c >>> +++ b/virt/kvm/ioregion.c >>> @@ -4,6 +4,33 @@ >>> #include <kvm/iodev.h> >>> #include "eventfd.h" >>> >>> +/* Wire protocol */ >>> +struct ioregionfd_cmd { >>> + __u32 info; >>> + __u32 padding; >>> + __u64 user_data; >>> + __u64 offset; >>> + __u64 data; >>> +}; >>> + >> I wonder do we need a seq in the protocol. It might be useful if we >> allow a pair of file descriptors to be used for multiple different >> ranges. >> > I think it might be helpful in the case of out-of-order requests. > In the case of in order requests seq field seems not to be necessary > since there will be cmds/replies serialization. I’ll include the > synchronization code in a RFC v2 series. See my reply to V1. It might be helpful for the case of using single ioregionfd for multiple ranges. Thanks > >> Thanks >> >> >>> +struct ioregionfd_resp { >>> + __u64 data; >>> + __u8 pad[24]; >>> +};
On Mon, 2021-01-04 at 13:37 +0800, Jason Wang wrote: > On 2021/1/4 上午4:37, Elena Afanasova wrote: > > On Thu, 2020-12-31 at 11:46 +0800, Jason Wang wrote: > > > On 2020/12/29 下午6:02, Elena Afanasova wrote: > > > > Signed-off-by: Elena Afanasova<eafanasova@gmail.com> > > > > --- > > > > virt/kvm/ioregion.c | 157 > > > > ++++++++++++++++++++++++++++++++++++++++++++ > > > > 1 file changed, 157 insertions(+) > > > > > > > > diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c > > > > index a200c3761343..8523f4126337 100644 > > > > --- a/virt/kvm/ioregion.c > > > > +++ b/virt/kvm/ioregion.c > > > > @@ -4,6 +4,33 @@ > > > > #include <kvm/iodev.h> > > > > #include "eventfd.h" > > > > > > > > +/* Wire protocol */ > > > > +struct ioregionfd_cmd { > > > > + __u32 info; > > > > + __u32 padding; > > > > + __u64 user_data; > > > > + __u64 offset; > > > > + __u64 data; > > > > +}; > > > > + > > > I wonder do we need a seq in the protocol. It might be useful if > > > we > > > allow a pair of file descriptors to be used for multiple > > > different > > > ranges. > > > > > I think it might be helpful in the case of out-of-order requests. > > In the case of in order requests seq field seems not to be > > necessary > > since there will be cmds/replies serialization. I’ll include the > > synchronization code in a RFC v2 series. > > See my reply to V1. It might be helpful for the case of using single > ioregionfd for multiple ranges. > Ok, thank you! > Thanks > > > > > Thanks > > > > > > > > > > +struct ioregionfd_resp { > > > > + __u64 data; > > > > + __u8 pad[24]; > > > > +};
diff --git a/virt/kvm/ioregion.c b/virt/kvm/ioregion.c index a200c3761343..8523f4126337 100644 --- a/virt/kvm/ioregion.c +++ b/virt/kvm/ioregion.c @@ -4,6 +4,33 @@ #include <kvm/iodev.h> #include "eventfd.h" +/* Wire protocol */ +struct ioregionfd_cmd { + __u32 info; + __u32 padding; + __u64 user_data; + __u64 offset; + __u64 data; +}; + +struct ioregionfd_resp { + __u64 data; + __u8 pad[24]; +}; + +#define IOREGIONFD_CMD_READ 0 +#define IOREGIONFD_CMD_WRITE 1 + +#define IOREGIONFD_SIZE_8BIT 0 +#define IOREGIONFD_SIZE_16BIT 1 +#define IOREGIONFD_SIZE_32BIT 2 +#define IOREGIONFD_SIZE_64BIT 3 + +#define IOREGIONFD_SIZE_OFFSET 4 +#define IOREGIONFD_RESP_OFFSET 6 +#define IOREGIONFD_SIZE(x) ((x) << IOREGIONFD_SIZE_OFFSET) +#define IOREGIONFD_RESP(x) ((x) << IOREGIONFD_RESP_OFFSET) + void kvm_ioregionfd_init(struct kvm *kvm) { @@ -38,10 +65,100 @@ ioregion_release(struct ioregion *p) kfree(p); } +static bool +pack_cmd(struct ioregionfd_cmd *cmd, u64 offset, u64 len, int opt, bool resp, + u64 user_data, const void *val) +{ + u64 size = 0; + + switch (len) { + case 1: + size = IOREGIONFD_SIZE_8BIT; + *((u8 *)&cmd->data) = val ? *(u8 *)val : 0; + break; + case 2: + size = IOREGIONFD_SIZE_16BIT; + *((u16 *)&cmd->data) = val ? *(u16 *)val : 0; + break; + case 4: + size = IOREGIONFD_SIZE_32BIT; + *((u32 *)&cmd->data) = val ? *(u32 *)val : 0; + break; + case 8: + size = IOREGIONFD_SIZE_64BIT; + *((u64 *)&cmd->data) = val ? *(u64 *)val : 0; + break; + default: + return false; + } + cmd->user_data = user_data; + cmd->offset = offset; + cmd->info |= opt; + cmd->info |= IOREGIONFD_SIZE(size); + cmd->info |= IOREGIONFD_RESP(resp); + + return true; +} + static int ioregion_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, void *val) { + struct ioregion *p = to_ioregion(this); + struct ioregionfd_cmd *cmd; + struct ioregionfd_resp *resp; + size_t buf_size; + void *buf; + int ret = 0; + + if ((p->rf->f_flags & O_NONBLOCK) || (p->wf->f_flags & O_NONBLOCK)) + return -EINVAL; + if ((addr + len - 1) > (p->paddr + p->size - 1)) + return -EINVAL; + + buf_size = max_t(size_t, sizeof(*cmd), sizeof(*resp)); + buf = kzalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + cmd = (struct ioregionfd_cmd *)buf; + resp = (struct ioregionfd_resp *)buf; + if (!pack_cmd(cmd, addr - p->paddr, len, IOREGIONFD_CMD_READ, + 1, p->user_data, NULL)) { + kfree(buf); + return -EOPNOTSUPP; + } + + ret = kernel_write(p->wf, cmd, sizeof(*cmd), 0); + if (ret != sizeof(*cmd)) { + kfree(buf); + return (ret < 0) ? ret : -EIO; + } + memset(buf, 0, buf_size); + ret = kernel_read(p->rf, resp, sizeof(*resp), 0); + if (ret != sizeof(*resp)) { + kfree(buf); + return (ret < 0) ? ret : -EIO; + } + + switch (len) { + case 1: + *(u8 *)val = (u8)resp->data; + break; + case 2: + *(u16 *)val = (u16)resp->data; + break; + case 4: + *(u32 *)val = (u32)resp->data; + break; + case 8: + *(u64 *)val = (u64)resp->data; + break; + default: + break; + } + + kfree(buf); + return 0; } @@ -49,6 +166,46 @@ static int ioregion_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr, int len, const void *val) { + struct ioregion *p = to_ioregion(this); + struct ioregionfd_cmd *cmd; + struct ioregionfd_resp *resp; + size_t buf_size = 0; + void *buf; + int ret = 0; + + if ((p->rf->f_flags & O_NONBLOCK) || (p->wf->f_flags & O_NONBLOCK)) + return -EINVAL; + if ((addr + len - 1) > (p->paddr + p->size - 1)) + return -EINVAL; + + buf_size = max_t(size_t, sizeof(*cmd), sizeof(*resp)); + buf = kzalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + cmd = (struct ioregionfd_cmd *)buf; + if (!pack_cmd(cmd, addr - p->paddr, len, IOREGIONFD_CMD_WRITE, + p->posted_writes ? 0 : 1, p->user_data, val)) { + kfree(buf); + return -EOPNOTSUPP; + } + + ret = kernel_write(p->wf, cmd, sizeof(*cmd), 0); + if (ret != sizeof(*cmd)) { + kfree(buf); + return (ret < 0) ? ret : -EIO; + } + + if (!p->posted_writes) { + memset(buf, 0, buf_size); + resp = (struct ioregionfd_resp *)buf; + ret = kernel_read(p->rf, resp, sizeof(*resp), 0); + if (ret != sizeof(*resp)) { + kfree(buf); + return (ret < 0) ? ret : -EIO; + } + } + kfree(buf); + return 0; }
Signed-off-by: Elena Afanasova <eafanasova@gmail.com> --- virt/kvm/ioregion.c | 157 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 157 insertions(+)