@@ -173,6 +173,17 @@ This protocol extension allows a client to inform the server about
intent to access a portion of the export, to allow the server an
opportunity to cache things appropriately.
+=item C<NBD_CMD_FLAG_FAST_ZERO>
+
+Supported in nbdkit E<ge> 1.13.9.
+
+This protocol extension allows a server to advertise that it can rank
+all zero requests as fast or slow, at which point the client can make
+fast zero requests which fail immediately with C<ENOTSUP> if the
+request is no faster than a counterpart write would be, while normal
+zero requests still benefit from compressed network traffic regardless
+of the time taken.
+
=item Resize Extension
I<Not supported>.
@@ -168,6 +168,7 @@ struct connection {
bool is_rotational;
bool can_trim;
bool can_zero;
+ bool can_fast_zero;
bool can_fua;
bool can_multi_conn;
bool can_cache;
@@ -275,6 +276,7 @@ struct backend {
int (*is_rotational) (struct backend *, struct connection *conn);
int (*can_trim) (struct backend *, struct connection *conn);
int (*can_zero) (struct backend *, struct connection *conn);
+ int (*can_fast_zero) (struct backend *, struct connection *conn);
int (*can_extents) (struct backend *, struct connection *conn);
int (*can_fua) (struct backend *, struct connection *conn);
int (*can_multi_conn) (struct backend *, struct connection *conn);
@@ -96,6 +96,7 @@ extern const char *name_of_nbd_flag (int);
#define NBD_FLAG_SEND_DF (1 << 7)
#define NBD_FLAG_CAN_MULTI_CONN (1 << 8)
#define NBD_FLAG_SEND_CACHE (1 << 10)
+#define NBD_FLAG_SEND_FAST_ZERO (1 << 11)
/* NBD options (new style handshake only). */
extern const char *name_of_nbd_opt (int);
@@ -223,10 +224,11 @@ extern const char *name_of_nbd_cmd (int);
#define NBD_CMD_BLOCK_STATUS 7
extern const char *name_of_nbd_cmd_flag (int);
-#define NBD_CMD_FLAG_FUA (1<<0)
-#define NBD_CMD_FLAG_NO_HOLE (1<<1)
-#define NBD_CMD_FLAG_DF (1<<2)
-#define NBD_CMD_FLAG_REQ_ONE (1<<3)
+#define NBD_CMD_FLAG_FUA (1<<0)
+#define NBD_CMD_FLAG_NO_HOLE (1<<1)
+#define NBD_CMD_FLAG_DF (1<<2)
+#define NBD_CMD_FLAG_REQ_ONE (1<<3)
+#define NBD_CMD_FLAG_FAST_ZERO (1<<4)
/* Error codes (previously errno).
* See http://git.qemu.org/?p=qemu.git;a=commitdiff;h=ca4414804114fd0095b317785bc0b51862e62ebb
@@ -239,6 +241,7 @@ extern const char *name_of_nbd_error (int);
#define NBD_EINVAL 22
#define NBD_ENOSPC 28
#define NBD_EOVERFLOW 75
+#define NBD_ENOTSUP 95
#define NBD_ESHUTDOWN 108
#endif /* NBDKIT_PROTOCOL_H */
@@ -403,8 +403,11 @@ next_zero (void *nxdata, uint32_t count, uint64_t offset, uint32_t flags,
int r;
r = b_conn->b->zero (b_conn->b, b_conn->conn, count, offset, flags, err);
- if (r == -1)
- assert (*err && *err != ENOTSUP && *err != EOPNOTSUPP);
+ if (r == -1) {
+ assert (*err);
+ if (!(flags & NBDKIT_FLAG_FAST_ZERO))
+ assert (*err != ENOTSUP && *err != EOPNOTSUPP);
+ }
return r;
}
@@ -586,6 +589,15 @@ filter_can_zero (struct backend *b, struct connection *conn)
return f->backend.next->can_zero (f->backend.next, conn);
}
+static int
+filter_can_fast_zero (struct backend *b, struct connection *conn)
+{
+ struct backend_filter *f = container_of (b, struct backend_filter, backend);
+
+ debug ("%s: can_fast_zero", f->name);
+ return 0; /* Next patch will query or pass through */
+}
+
static int
filter_can_extents (struct backend *b, struct connection *conn)
{
@@ -738,7 +750,8 @@ filter_zero (struct backend *b, struct connection *conn,
void *handle = connection_get_handle (conn, f->backend.i);
struct b_conn nxdata = { .b = f->backend.next, .conn = conn };
- assert (!(flags & ~(NBDKIT_FLAG_MAY_TRIM | NBDKIT_FLAG_FUA)));
+ assert (!(flags & ~(NBDKIT_FLAG_MAY_TRIM | NBDKIT_FLAG_FUA |
+ NBDKIT_FLAG_FAST_ZERO)));
debug ("%s: zero count=%" PRIu32 " offset=%" PRIu64 " flags=0x%" PRIx32,
f->name, count, offset, flags);
@@ -818,6 +831,7 @@ static struct backend filter_functions = {
.is_rotational = filter_is_rotational,
.can_trim = filter_can_trim,
.can_zero = filter_can_zero,
+ .can_fast_zero = filter_can_fast_zero,
.can_extents = filter_can_extents,
.can_fua = filter_can_fua,
.can_multi_conn = filter_can_multi_conn,
@@ -401,6 +401,16 @@ plugin_can_zero (struct backend *b, struct connection *conn)
return plugin_can_write (b, conn);
}
+static int
+plugin_can_fast_zero (struct backend *b, struct connection *conn)
+{
+ assert (connection_get_handle (conn, 0));
+
+ debug ("can_fast_zero");
+
+ return 0; /* Upcoming patch will actually add support. */
+}
+
static int
plugin_can_extents (struct backend *b, struct connection *conn)
{
@@ -621,15 +631,18 @@ plugin_zero (struct backend *b, struct connection *conn,
int r = -1;
bool may_trim = flags & NBDKIT_FLAG_MAY_TRIM;
bool fua = flags & NBDKIT_FLAG_FUA;
+ bool fast_zero = flags & NBDKIT_FLAG_FAST_ZERO;
bool emulate = false;
bool need_flush = false;
int can_zero = 1; /* TODO cache this per-connection? */
assert (connection_get_handle (conn, 0));
- assert (!(flags & ~(NBDKIT_FLAG_MAY_TRIM | NBDKIT_FLAG_FUA)));
+ assert (!(flags & ~(NBDKIT_FLAG_MAY_TRIM | NBDKIT_FLAG_FUA |
+ NBDKIT_FLAG_FAST_ZERO)));
- debug ("zero count=%" PRIu32 " offset=%" PRIu64 " may_trim=%d fua=%d",
- count, offset, may_trim, fua);
+ debug ("zero count=%" PRIu32 " offset=%" PRIu64
+ " may_trim=%d fua=%d fast_zero=%d",
+ count, offset, may_trim, fua, fast_zero);
if (fua && plugin_can_fua (b, conn) != NBDKIT_FUA_NATIVE) {
flags &= ~NBDKIT_FLAG_FUA;
@@ -643,6 +656,8 @@ plugin_zero (struct backend *b, struct connection *conn,
}
if (can_zero) {
+ /* if (!can_fast_zero) */
+ flags &= ~NBDKIT_FLAG_FAST_ZERO;
errno = 0;
if (p->plugin.zero)
r = p->plugin.zero (connection_get_handle (conn, 0), count, offset,
@@ -658,6 +673,12 @@ plugin_zero (struct backend *b, struct connection *conn,
goto done;
}
+ if (fast_zero) {
+ assert (r == -1);
+ *err = EOPNOTSUPP;
+ goto done;
+ }
+
assert (p->plugin.pwrite || p->plugin._pwrite_old);
flags &= ~NBDKIT_FLAG_MAY_TRIM;
threadlocal_set_error (0);
@@ -762,6 +783,7 @@ static struct backend plugin_functions = {
.is_rotational = plugin_is_rotational,
.can_trim = plugin_can_trim,
.can_zero = plugin_can_zero,
+ .can_fast_zero = plugin_can_fast_zero,
.can_extents = plugin_can_extents,
.can_fua = plugin_can_fua,
.can_multi_conn = plugin_can_multi_conn,
@@ -66,6 +66,13 @@ protocol_compute_eflags (struct connection *conn, uint16_t *flags)
if (fl) {
eflags |= NBD_FLAG_SEND_WRITE_ZEROES;
conn->can_zero = true;
+ fl = backend->can_fast_zero (backend, conn);
+ if (fl == -1)
+ return -1;
+ if (fl) {
+ eflags |= NBD_FLAG_SEND_FAST_ZERO;
+ conn->can_fast_zero = true;
+ }
}
fl = backend->can_trim (backend, conn);
@@ -110,7 +110,8 @@ validate_request (struct connection *conn,
/* Validate flags */
if (flags & ~(NBD_CMD_FLAG_FUA | NBD_CMD_FLAG_NO_HOLE |
- NBD_CMD_FLAG_DF | NBD_CMD_FLAG_REQ_ONE)) {
+ NBD_CMD_FLAG_DF | NBD_CMD_FLAG_REQ_ONE |
+ NBD_CMD_FLAG_FAST_ZERO)) {
nbdkit_error ("invalid request: unknown flag (0x%x)", flags);
*error = EINVAL;
return false;
@@ -121,6 +122,21 @@ validate_request (struct connection *conn,
*error = EINVAL;
return false;
}
+ if (flags & NBD_CMD_FLAG_FAST_ZERO) {
+ if (cmd != NBD_CMD_WRITE_ZEROES) {
+ nbdkit_error ("invalid request: "
+ "FAST_ZERO flag needs WRITE_ZEROES request");
+ *error = EINVAL;
+ return false;
+ }
+ if (!conn->can_fast_zero) {
+ nbdkit_error ("invalid request: "
+ "%s: fast zero support was not advertised",
+ name_of_nbd_cmd (cmd));
+ *error = EINVAL;
+ return false;
+ }
+ }
if (flags & NBD_CMD_FLAG_DF) {
if (cmd != NBD_CMD_READ) {
nbdkit_error ("invalid request: DF flag needs READ request");
@@ -297,8 +313,12 @@ handle_request (struct connection *conn,
f |= NBDKIT_FLAG_MAY_TRIM;
if (fua)
f |= NBDKIT_FLAG_FUA;
+ if (flags & NBD_CMD_FLAG_FAST_ZERO)
+ f |= NBDKIT_FLAG_FAST_ZERO;
if (backend->zero (backend, conn, count, offset, f, &err) == -1) {
- assert (err && err != ENOTSUP && err != EOPNOTSUPP);
+ assert (err);
+ if (!(flags & NBD_CMD_FLAG_FAST_ZERO))
+ assert (err != ENOTSUP && err != EOPNOTSUPP);
return err;
}
break;
@@ -368,7 +388,7 @@ skip_over_write_buffer (int sock, size_t count)
/* Convert a system errno to an NBD_E* error code. */
static int
-nbd_errno (int error, bool flag_df)
+nbd_errno (int error, uint16_t flags)
{
switch (error) {
case 0:
@@ -390,10 +410,17 @@ nbd_errno (int error, bool flag_df)
case ESHUTDOWN:
return NBD_ESHUTDOWN;
#endif
+ case ENOTSUP:
+#if ENOTSUP != EOPNOTSUPP
+ case EOPNOTSUPP:
+#endif
+ if (flags & NBD_CMD_FLAG_FAST_ZERO)
+ return NBD_ENOTSUP;
+ return NBD_EINVAL;
case EOVERFLOW:
- if (flag_df)
+ if (flags & NBD_CMD_FLAG_DF)
return NBD_EOVERFLOW;
- /* fallthrough */
+ return NBD_EINVAL;
case EINVAL:
default:
return NBD_EINVAL;
@@ -402,7 +429,7 @@ nbd_errno (int error, bool flag_df)
static int
send_simple_reply (struct connection *conn,
- uint64_t handle, uint16_t cmd,
+ uint64_t handle, uint16_t cmd, uint16_t flags,
const char *buf, uint32_t count,
uint32_t error)
{
@@ -413,7 +440,7 @@ send_simple_reply (struct connection *conn,
reply.magic = htobe32 (NBD_SIMPLE_REPLY_MAGIC);
reply.handle = handle;
- reply.error = htobe32 (nbd_errno (error, false));
+ reply.error = htobe32 (nbd_errno (error, flags));
r = conn->send (conn, &reply, sizeof reply, f);
if (r == -1) {
@@ -640,7 +667,7 @@ send_structured_reply_error (struct connection *conn,
}
/* Send the error. */
- error_data.error = htobe32 (nbd_errno (error, flags & NBD_CMD_FLAG_DF));
+ error_data.error = htobe32 (nbd_errno (error, flags));
error_data.len = htobe16 (0);
r = conn->send (conn, &error_data, sizeof error_data, 0);
if (r == -1) {
@@ -790,5 +817,6 @@ protocol_recv_request_send_reply (struct connection *conn)
error);
}
else
- return send_simple_reply (conn, request.handle, cmd, buf, count, error);
+ return send_simple_reply (conn, request.handle, cmd, flags, buf, count,
+ error);
}
@@ -57,9 +57,10 @@ extern "C" {
#define NBDKIT_THREAD_MODEL_SERIALIZE_REQUESTS 2
#define NBDKIT_THREAD_MODEL_PARALLEL 3
-#define NBDKIT_FLAG_MAY_TRIM (1<<0) /* Maps to !NBD_CMD_FLAG_NO_HOLE */
-#define NBDKIT_FLAG_FUA (1<<1) /* Maps to NBD_CMD_FLAG_FUA */
-#define NBDKIT_FLAG_REQ_ONE (1<<2) /* Maps to NBD_CMD_FLAG_REQ_ONE */
+#define NBDKIT_FLAG_MAY_TRIM (1<<0) /* Maps to !NBD_CMD_FLAG_NO_HOLE */
+#define NBDKIT_FLAG_FUA (1<<1) /* Maps to NBD_CMD_FLAG_FUA */
+#define NBDKIT_FLAG_REQ_ONE (1<<2) /* Maps to NBD_CMD_FLAG_REQ_ONE */
+#define NBDKIT_FLAG_FAST_ZERO (1<<3) /* Maps to NBD_CMD_FLAG_FAST_ZERO */
#define NBDKIT_FUA_NONE 0
#define NBDKIT_FUA_EMULATE 1
@@ -857,7 +857,6 @@ ocaml_nbdkit_set_error (value nv)
case 5: err = ENOSPC; break;
case 6: err = ESHUTDOWN; break;
case 7: err = EOVERFLOW; break;
- /* Necessary for .zero support */
case 8: err = EOPNOTSUPP; break;
/* Other errno values that server/protocol.c treats specially */
case 9: err = EROFS; break;
@@ -347,7 +347,6 @@ handle_script_error (char *ebuf, size_t len)
err = ESHUTDOWN;
skip = 9;
}
- /* Necessary for .zero support */
else if (strncasecmp (ebuf, "ENOTSUP", 7) == 0) {
err = ENOTSUP;
skip = 7;
Qemu was able to demonstrate that knowing whether a zero operation is fast is useful when copying from one image to another: there is a choice between bulk pre-zeroing and then revisiting the data sections (fewer transactions, but depends on the zeroing to be fast), vs. visiting every portion of the disk only once (more transactions, but no time lost to duplicated I/O due to slow zeroes). As such, the NBD protocol is adding an extension to allow clients to request fast failure when zero is not efficient, from servers that advertise support for the new flag. In nbdkit, start by plumbing a new .can_fast_zero through the backends (although it stays 0 in this patch, later patches will actually expose it to plugins and filters to override). Advertise the flag to the client when the plugin provides a .can_fast_zero, and wire in passing the flag down to .zero in the plugin. In turn, if the flag is set and the implementation .zero fails with ENOTSUP/EOPNOTSUPP, we no longer attempt the .pwrite fallback. Signed-off-by: Eric Blake <eblake@redhat.com> --- docs/nbdkit-protocol.pod | 11 +++++++++ server/internal.h | 2 ++ common/protocol/protocol.h | 11 +++++---- server/filters.c | 20 +++++++++++++--- server/plugins.c | 28 +++++++++++++++++++--- server/protocol-handshake.c | 7 ++++++ server/protocol.c | 46 +++++++++++++++++++++++++++++-------- include/nbdkit-common.h | 7 +++--- plugins/ocaml/ocaml.c | 1 - plugins/sh/call.c | 1 - 10 files changed, 110 insertions(+), 24 deletions(-)