aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAnthony Liguori <aliguori@us.ibm.com>2011-12-27 08:52:42 -0600
committerAnthony Liguori <aliguori@us.ibm.com>2011-12-27 08:52:42 -0600
commitebdfc3c83cef33f8f619ffcb57d297c6063db59d (patch)
treeaf4c54a676ecc8b9c6bae7370a27261b640a247c
parenta0fa82085e175bf8ce6d69a3f83695f81af2a649 (diff)
parent44f76b289a33399abedfbca2d92d21d910792264 (diff)
Merge remote-tracking branch 'bonzini/nbd-for-anthony' into staging
* bonzini/nbd-for-anthony: (26 commits) nbd: add myself as maintainer qemu-nbd: throttle requests qemu-nbd: asynchronous operation qemu-nbd: add client pointer to NBDRequest qemu-nbd: move client handling to nbd.c qemu-nbd: use common main loop link the main loop and its dependencies into the tools qemu-nbd: introduce NBDRequest qemu-nbd: introduce NBDExport qemu-nbd: introduce nbd_do_receive_request qemu-nbd: more robust handling of invalid requests qemu-nbd: introduce nbd_do_send_reply qemu-nbd: simplify nbd_trip move corking functions to osdep.c qemu-nbd: remove data_size argument to nbd_trip qemu-nbd: remove offset argument to nbd_trip Update ioctl order in nbd_init() to detect EBUSY nbd: add support for NBD_CMD_TRIM nbd: add support for NBD_CMD_FLUSH nbd: add support for NBD_CMD_FLAG_FUA ...
-rw-r--r--MAINTAINERS7
-rw-r--r--Makefile5
-rw-r--r--Makefile.objs2
-rw-r--r--block/nbd.c319
-rw-r--r--block/sheepdog.c250
-rw-r--r--cutils.c111
-rw-r--r--main-loop.h6
-rw-r--r--nbd.c439
-rw-r--r--nbd.h14
-rw-r--r--os-posix.c42
-rw-r--r--os-win32.c5
-rw-r--r--osdep.c76
-rw-r--r--oslib-posix.c43
-rw-r--r--oslib-win32.c5
-rw-r--r--qemu-common.h34
-rw-r--r--qemu-coroutine-io.c96
-rw-r--r--qemu-nbd.c120
-rw-r--r--qemu-tool.c42
-rw-r--r--qemu_socket.h1
19 files changed, 1087 insertions, 530 deletions
diff --git a/MAINTAINERS b/MAINTAINERS
index e22bfa1a3..764c92dab 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -473,6 +473,13 @@ M: Mark McLoughlin <markmc@redhat.com>
S: Maintained
F: net/
+Network Block Device (NBD)
+M: Paolo Bonzini <pbonzini@redhat.com>
+S: Odd Fixes
+F: block/nbd.c
+F: nbd.*
+F: qemu-nbd.c
+
SLIRP
M: Jan Kiszka <jan.kiszka@siemens.com>
S: Maintained
diff --git a/Makefile b/Makefile
index 2c030552a..368eeae9b 100644
--- a/Makefile
+++ b/Makefile
@@ -147,8 +147,9 @@ endif
qemu-img.o: qemu-img-cmds.h
qemu-img.o qemu-tool.o qemu-nbd.o qemu-io.o cmd.o qemu-ga.o: $(GENERATED_HEADERS)
-tools-obj-y = qemu-tool.o $(oslib-obj-y) $(trace-obj-y) \
- qemu-timer-common.o cutils.o
+tools-obj-y = $(oslib-obj-y) $(trace-obj-y) qemu-tool.o qemu-timer.o \
+ qemu-timer-common.o main-loop.o notify.o iohandler.o cutils.o async.o
+tools-obj-$(CONFIG_POSIX) += compatfd.o
qemu-img$(EXESUF): qemu-img.o $(tools-obj-y) $(block-obj-y)
qemu-nbd$(EXESUF): qemu-nbd.o $(tools-obj-y) $(block-obj-y)
diff --git a/Makefile.objs b/Makefile.objs
index f753d838f..881367358 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -12,7 +12,7 @@ oslib-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o
#######################################################################
# coroutines
-coroutine-obj-y = qemu-coroutine.o qemu-coroutine-lock.o
+coroutine-obj-y = qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
ifeq ($(CONFIG_UCONTEXT_COROUTINE),y)
coroutine-obj-$(CONFIG_POSIX) += coroutine-ucontext.o
else
diff --git a/block/nbd.c b/block/nbd.c
index 95212dac6..161b29985 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -46,14 +46,25 @@
#define logout(fmt, ...) ((void)0)
#endif
+#define MAX_NBD_REQUESTS 16
+#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs))
+#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs))
+
typedef struct BDRVNBDState {
- CoMutex lock;
int sock;
uint32_t nbdflags;
off_t size;
size_t blocksize;
char *export_name; /* An NBD server may export several devices */
+ CoMutex send_mutex;
+ CoMutex free_sema;
+ Coroutine *send_coroutine;
+ int in_flight;
+
+ Coroutine *recv_coroutine[MAX_NBD_REQUESTS];
+ struct nbd_reply reply;
+
/* If it begins with '/', this is a UNIX domain socket. Otherwise,
* it's a string of the form <hostname|ip4|\[ip6\]>:port
*/
@@ -106,6 +117,130 @@ out:
return err;
}
+static void nbd_coroutine_start(BDRVNBDState *s, struct nbd_request *request)
+{
+ int i;
+
+ /* Poor man semaphore. The free_sema is locked when no other request
+ * can be accepted, and unlocked after receiving one reply. */
+ if (s->in_flight >= MAX_NBD_REQUESTS - 1) {
+ qemu_co_mutex_lock(&s->free_sema);
+ assert(s->in_flight < MAX_NBD_REQUESTS);
+ }
+ s->in_flight++;
+
+ for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+ if (s->recv_coroutine[i] == NULL) {
+ s->recv_coroutine[i] = qemu_coroutine_self();
+ break;
+ }
+ }
+
+ assert(i < MAX_NBD_REQUESTS);
+ request->handle = INDEX_TO_HANDLE(s, i);
+}
+
+static int nbd_have_request(void *opaque)
+{
+ BDRVNBDState *s = opaque;
+
+ return s->in_flight > 0;
+}
+
+static void nbd_reply_ready(void *opaque)
+{
+ BDRVNBDState *s = opaque;
+ int i;
+
+ if (s->reply.handle == 0) {
+ /* No reply already in flight. Fetch a header. */
+ if (nbd_receive_reply(s->sock, &s->reply) < 0) {
+ s->reply.handle = 0;
+ goto fail;
+ }
+ }
+
+ /* There's no need for a mutex on the receive side, because the
+ * handler acts as a synchronization point and ensures that only
+ * one coroutine is called until the reply finishes. */
+ i = HANDLE_TO_INDEX(s, s->reply.handle);
+ if (s->recv_coroutine[i]) {
+ qemu_coroutine_enter(s->recv_coroutine[i], NULL);
+ return;
+ }
+
+fail:
+ for (i = 0; i < MAX_NBD_REQUESTS; i++) {
+ if (s->recv_coroutine[i]) {
+ qemu_coroutine_enter(s->recv_coroutine[i], NULL);
+ }
+ }
+}
+
+static void nbd_restart_write(void *opaque)
+{
+ BDRVNBDState *s = opaque;
+ qemu_coroutine_enter(s->send_coroutine, NULL);
+}
+
+static int nbd_co_send_request(BDRVNBDState *s, struct nbd_request *request,
+ struct iovec *iov, int offset)
+{
+ int rc, ret;
+
+ qemu_co_mutex_lock(&s->send_mutex);
+ s->send_coroutine = qemu_coroutine_self();
+ qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, nbd_restart_write,
+ nbd_have_request, NULL, s);
+ rc = nbd_send_request(s->sock, request);
+ if (rc != -1 && iov) {
+ ret = qemu_co_sendv(s->sock, iov, request->len, offset);
+ if (ret != request->len) {
+ errno = -EIO;
+ rc = -1;
+ }
+ }
+ qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
+ nbd_have_request, NULL, s);
+ s->send_coroutine = NULL;
+ qemu_co_mutex_unlock(&s->send_mutex);
+ return rc;
+}
+
+static void nbd_co_receive_reply(BDRVNBDState *s, struct nbd_request *request,
+ struct nbd_reply *reply,
+ struct iovec *iov, int offset)
+{
+ int ret;
+
+ /* Wait until we're woken up by the read handler. TODO: perhaps
+ * peek at the next reply and avoid yielding if it's ours? */
+ qemu_coroutine_yield();
+ *reply = s->reply;
+ if (reply->handle != request->handle) {
+ reply->error = EIO;
+ } else {
+ if (iov && reply->error == 0) {
+ ret = qemu_co_recvv(s->sock, iov, request->len, offset);
+ if (ret != request->len) {
+ reply->error = EIO;
+ }
+ }
+
+ /* Tell the read handler to read another header. */
+ s->reply.handle = 0;
+ }
+}
+
+static void nbd_coroutine_end(BDRVNBDState *s, struct nbd_request *request)
+{
+ int i = HANDLE_TO_INDEX(s, request->handle);
+ s->recv_coroutine[i] = NULL;
+ if (s->in_flight-- == MAX_NBD_REQUESTS) {
+ qemu_co_mutex_unlock(&s->free_sema);
+ }
+}
+
static int nbd_establish_connection(BlockDriverState *bs)
{
BDRVNBDState *s = bs->opaque;
@@ -135,8 +270,11 @@ static int nbd_establish_connection(BlockDriverState *bs)
return -errno;
}
- /* Now that we're connected, set the socket to be non-blocking */
+ /* Now that we're connected, set the socket to be non-blocking and
+ * kick the reply mechanism. */
socket_set_nonblock(sock);
+ qemu_aio_set_fd_handler(s->sock, nbd_reply_ready, NULL,
+ nbd_have_request, NULL, s);
s->sock = sock;
s->size = size;
@@ -152,11 +290,11 @@ static void nbd_teardown_connection(BlockDriverState *bs)
struct nbd_request request;
request.type = NBD_CMD_DISC;
- request.handle = (uint64_t)(intptr_t)bs;
request.from = 0;
request.len = 0;
nbd_send_request(s->sock, &request);
+ qemu_aio_set_fd_handler(s->sock, NULL, NULL, NULL, NULL, NULL);
closesocket(s->sock);
}
@@ -165,6 +303,9 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
BDRVNBDState *s = bs->opaque;
int result;
+ qemu_co_mutex_init(&s->send_mutex);
+ qemu_co_mutex_init(&s->free_sema);
+
/* Pop the config into our state object. Exit if invalid. */
result = nbd_config(s, filename, flags);
if (result != 0) {
@@ -176,90 +317,146 @@ static int nbd_open(BlockDriverState *bs, const char* filename, int flags)
*/
result = nbd_establish_connection(bs);
- qemu_co_mutex_init(&s->lock);
return result;
}
-static int nbd_read(BlockDriverState *bs, int64_t sector_num,
- uint8_t *buf, int nb_sectors)
+static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov,
+ int offset)
{
BDRVNBDState *s = bs->opaque;
struct nbd_request request;
struct nbd_reply reply;
request.type = NBD_CMD_READ;
- request.handle = (uint64_t)(intptr_t)bs;
request.from = sector_num * 512;
request.len = nb_sectors * 512;
- if (nbd_send_request(s->sock, &request) == -1)
- return -errno;
-
- if (nbd_receive_reply(s->sock, &reply) == -1)
- return -errno;
-
- if (reply.error !=0)
- return -reply.error;
-
- if (reply.handle != request.handle)
- return -EIO;
-
- if (nbd_wr_sync(s->sock, buf, request.len, 1) != request.len)
- return -EIO;
+ nbd_coroutine_start(s, &request);
+ if (nbd_co_send_request(s, &request, NULL, 0) == -1) {
+ reply.error = errno;
+ } else {
+ nbd_co_receive_reply(s, &request, &reply, qiov->iov, offset);
+ }
+ nbd_coroutine_end(s, &request);
+ return -reply.error;
- return 0;
}
-static int nbd_write(BlockDriverState *bs, int64_t sector_num,
- const uint8_t *buf, int nb_sectors)
+static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov,
+ int offset)
{
BDRVNBDState *s = bs->opaque;
struct nbd_request request;
struct nbd_reply reply;
request.type = NBD_CMD_WRITE;
- request.handle = (uint64_t)(intptr_t)bs;
+ if (!bdrv_enable_write_cache(bs) && (s->nbdflags & NBD_FLAG_SEND_FUA)) {
+ request.type |= NBD_CMD_FLAG_FUA;
+ }
+
request.from = sector_num * 512;
request.len = nb_sectors * 512;
- if (nbd_send_request(s->sock, &request) == -1)
- return -errno;
-
- if (nbd_wr_sync(s->sock, (uint8_t*)buf, request.len, 0) != request.len)
- return -EIO;
-
- if (nbd_receive_reply(s->sock, &reply) == -1)
- return -errno;
-
- if (reply.error !=0)
- return -reply.error;
+ nbd_coroutine_start(s, &request);
+ if (nbd_co_send_request(s, &request, qiov->iov, offset) == -1) {
+ reply.error = errno;
+ } else {
+ nbd_co_receive_reply(s, &request, &reply, NULL, 0);
+ }
+ nbd_coroutine_end(s, &request);
+ return -reply.error;
+}
- if (reply.handle != request.handle)
- return -EIO;
+/* qemu-nbd has a limit of slightly less than 1M per request. Try to
+ * remain aligned to 4K. */
+#define NBD_MAX_SECTORS 2040
- return 0;
+static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
+{
+ int offset = 0;
+ int ret;
+ while (nb_sectors > NBD_MAX_SECTORS) {
+ ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
+ if (ret < 0) {
+ return ret;
+ }
+ offset += NBD_MAX_SECTORS * 512;
+ sector_num += NBD_MAX_SECTORS;
+ nb_sectors -= NBD_MAX_SECTORS;
+ }
+ return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset);
}
-static coroutine_fn int nbd_co_read(BlockDriverState *bs, int64_t sector_num,
- uint8_t *buf, int nb_sectors)
+static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors, QEMUIOVector *qiov)
{
+ int offset = 0;
int ret;
+ while (nb_sectors > NBD_MAX_SECTORS) {
+ ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset);
+ if (ret < 0) {
+ return ret;
+ }
+ offset += NBD_MAX_SECTORS * 512;
+ sector_num += NBD_MAX_SECTORS;
+ nb_sectors -= NBD_MAX_SECTORS;
+ }
+ return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset);
+}
+
+static int nbd_co_flush(BlockDriverState *bs)
+{
BDRVNBDState *s = bs->opaque;
- qemu_co_mutex_lock(&s->lock);
- ret = nbd_read(bs, sector_num, buf, nb_sectors);
- qemu_co_mutex_unlock(&s->lock);
- return ret;
+ struct nbd_request request;
+ struct nbd_reply reply;
+
+ if (!(s->nbdflags & NBD_FLAG_SEND_FLUSH)) {
+ return 0;
+ }
+
+ request.type = NBD_CMD_FLUSH;
+ if (s->nbdflags & NBD_FLAG_SEND_FUA) {
+ request.type |= NBD_CMD_FLAG_FUA;
+ }
+
+ request.from = 0;
+ request.len = 0;
+
+ nbd_coroutine_start(s, &request);
+ if (nbd_co_send_request(s, &request, NULL, 0) == -1) {
+ reply.error = errno;
+ } else {
+ nbd_co_receive_reply(s, &request, &reply, NULL, 0);
+ }
+ nbd_coroutine_end(s, &request);
+ return -reply.error;
}
-static coroutine_fn int nbd_co_write(BlockDriverState *bs, int64_t sector_num,
- const uint8_t *buf, int nb_sectors)
+static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num,
+ int nb_sectors)
{
- int ret;
BDRVNBDState *s = bs->opaque;
- qemu_co_mutex_lock(&s->lock);
- ret = nbd_write(bs, sector_num, buf, nb_sectors);
- qemu_co_mutex_unlock(&s->lock);
- return ret;
+ struct nbd_request request;
+ struct nbd_reply reply;
+
+ if (!(s->nbdflags & NBD_FLAG_SEND_TRIM)) {
+ return 0;
+ }
+ request.type = NBD_CMD_TRIM;
+ request.from = sector_num * 512;;
+ request.len = nb_sectors * 512;
+
+ nbd_coroutine_start(s, &request);
+ if (nbd_co_send_request(s, &request, NULL, 0) == -1) {
+ reply.error = errno;
+ } else {
+ nbd_co_receive_reply(s, &request, &reply, NULL, 0);
+ }
+ nbd_coroutine_end(s, &request);
+ return -reply.error;
}
static void nbd_close(BlockDriverState *bs)
@@ -279,14 +476,16 @@ static int64_t nbd_getlength(BlockDriverState *bs)
}
static BlockDriver bdrv_nbd = {
- .format_name = "nbd",
- .instance_size = sizeof(BDRVNBDState),
- .bdrv_file_open = nbd_open,
- .bdrv_read = nbd_co_read,
- .bdrv_write = nbd_co_write,
- .bdrv_close = nbd_close,
- .bdrv_getlength = nbd_getlength,
- .protocol_name = "nbd",
+ .format_name = "nbd",
+ .instance_size = sizeof(BDRVNBDState),
+ .bdrv_file_open = nbd_open,
+ .bdrv_co_readv = nbd_co_readv,
+ .bdrv_co_writev = nbd_co_writev,
+ .bdrv_close = nbd_close,
+ .bdrv_co_flush_to_os = nbd_co_flush,
+ .bdrv_co_discard = nbd_co_discard,
+ .bdrv_getlength = nbd_getlength,
+ .protocol_name = "nbd",
};
static void bdrv_nbd_init(void)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index aa9707f2a..17a79beb2 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -443,129 +443,6 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov,
return acb;
}
-#ifdef _WIN32
-
-struct msghdr {
- struct iovec *msg_iov;
- size_t msg_iovlen;
-};
-
-static ssize_t sendmsg(int s, const struct msghdr *msg, int flags)
-{
- size_t size = 0;
- char *buf, *p;
- int i, ret;
-
- /* count the msg size */
- for (i = 0; i < msg->msg_iovlen; i++) {
- size += msg->msg_iov[i].iov_len;
- }
- buf = g_malloc(size);
-
- p = buf;
- for (i = 0; i < msg->msg_iovlen; i++) {
- memcpy(p, msg->msg_iov[i].iov_base, msg->msg_iov[i].iov_len);
- p += msg->msg_iov[i].iov_len;
- }
-
- ret = send(s, buf, size, flags);
-
- g_free(buf);
- return ret;
-}
-
-static ssize_t recvmsg(int s, struct msghdr *msg, int flags)
-{
- size_t size = 0;
- char *buf, *p;
- int i, ret;
-
- /* count the msg size */
- for (i = 0; i < msg->msg_iovlen; i++) {
- size += msg->msg_iov[i].iov_len;
- }
- buf = g_malloc(size);
-
- ret = qemu_recv(s, buf, size, flags);
- if (ret < 0) {
- goto out;
- }
-
- p = buf;
- for (i = 0; i < msg->msg_iovlen; i++) {
- memcpy(msg->msg_iov[i].iov_base, p, msg->msg_iov[i].iov_len);
- p += msg->msg_iov[i].iov_len;
- }
-out:
- g_free(buf);
- return ret;
-}
-
-#endif
-
-/*
- * Send/recv data with iovec buffers
- *
- * This function send/recv data from/to the iovec buffer directly.
- * The first `offset' bytes in the iovec buffer are skipped and next
- * `len' bytes are used.
- *
- * For example,
- *
- * do_send_recv(sockfd, iov, len, offset, 1);
- *
- * is equals to
- *
- * char *buf = malloc(size);
- * iov_to_buf(iov, iovcnt, buf, offset, size);
- * send(sockfd, buf, size, 0);
- * free(buf);
- */
-static int do_send_recv(int sockfd, struct iovec *iov, int len, int offset,
- int write)
-{
- struct msghdr msg;
- int ret, diff;
-
- memset(&msg, 0, sizeof(msg));
- msg.msg_iov = iov;
- msg.msg_iovlen = 1;
-
- len += offset;
-
- while (iov->iov_len < len) {
- len -= iov->iov_len;
-
- iov++;
- msg.msg_iovlen++;
- }
-
- diff = iov->iov_len - len;
- iov->iov_len -= diff;
-
- while (msg.msg_iov->iov_len <= offset) {
- offset -= msg.msg_iov->iov_len;
-
- msg.msg_iov++;
- msg.msg_iovlen--;
- }
-
- msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base + offset;
- msg.msg_iov->iov_len -= offset;
-
- if (write) {
- ret = sendmsg(sockfd, &msg, 0);
- } else {
- ret = recvmsg(sockfd, &msg, 0);
- }
-
- msg.msg_iov->iov_base = (char *) msg.msg_iov->iov_base - offset;
- msg.msg_iov->iov_len += offset;
-
- iov->iov_len += diff;
- return ret;
-}
-
static int connect_to_sdog(const char *addr, const char *port)
{
char hbuf[NI_MAXHOST], sbuf[NI_MAXSERV];
@@ -618,83 +495,19 @@ success:
return fd;
}
-static int do_readv_writev(int sockfd, struct iovec *iov, int len,
- int iov_offset, int write)
-{
- int ret;
-again:
- ret = do_send_recv(sockfd, iov, len, iov_offset, write);
- if (ret < 0) {
- if (errno == EINTR) {
- goto again;
- }
- if (errno == EAGAIN) {
- if (qemu_in_coroutine()) {
- qemu_coroutine_yield();
- }
- goto again;
- }
- error_report("failed to recv a rsp, %s", strerror(errno));
- return 1;
- }
-
- iov_offset += ret;
- len -= ret;
- if (len) {
- goto again;
- }
-
- return 0;
-}
-
-static int do_readv(int sockfd, struct iovec *iov, int len, int iov_offset)
-{
- return do_readv_writev(sockfd, iov, len, iov_offset, 0);
-}
-
-static int do_writev(int sockfd, struct iovec *iov, int len, int iov_offset)
-{
- return do_readv_writev(sockfd, iov, len, iov_offset, 1);
-}
-
-static int do_read_write(int sockfd, void *buf, int len, int write)
-{
- struct iovec iov;
-
- iov.iov_base = buf;
- iov.iov_len = len;
-
- return do_readv_writev(sockfd, &iov, len, 0, write);
-}
-
-static int do_read(int sockfd, void *buf, int len)
-{
- return do_read_write(sockfd, buf, len, 0);
-}
-
-static int do_write(int sockfd, void *buf, int len)
-{
- return do_read_write(sockfd, buf, len, 1);
-}
-
static int send_req(int sockfd, SheepdogReq *hdr, void *data,
unsigned int *wlen)
{
int ret;
- struct iovec iov[2];
- iov[0].iov_base = hdr;
- iov[0].iov_len = sizeof(*hdr);
-
- if (*wlen) {
- iov[1].iov_base = data;
- iov[1].iov_len = *wlen;
+ ret = qemu_send_full(sockfd, hdr, sizeof(*hdr), 0);
+ if (ret < sizeof(*hdr)) {
+ error_report("failed to send a req, %s", strerror(errno));
}
- ret = do_writev(sockfd, iov, sizeof(*hdr) + *wlen, 0);
- if (ret) {
+ ret = qemu_send_full(sockfd, data, *wlen, 0);
+ if (ret < *wlen) {
error_report("failed to send a req, %s", strerror(errno));
- ret = -1;
}
return ret;
@@ -705,16 +518,15 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
{
int ret;
+ socket_set_block(sockfd);
ret = send_req(sockfd, hdr, data, wlen);
- if (ret) {
- ret = -1;
+ if (ret < 0) {
goto out;
}
- ret = do_read(sockfd, hdr, sizeof(*hdr));
- if (ret) {
+ ret = qemu_recv_full(sockfd, hdr, sizeof(*hdr), 0);
+ if (ret < sizeof(*hdr)) {
error_report("failed to get a rsp, %s", strerror(errno));
- ret = -1;
goto out;
}
@@ -723,15 +535,15 @@ static int do_req(int sockfd, SheepdogReq *hdr, void *data,
}
if (*rlen) {
- ret = do_read(sockfd, data, *rlen);
- if (ret) {
+ ret = qemu_recv_full(sockfd, data, *rlen, 0);
+ if (ret < *rlen) {
error_report("failed to get the data, %s", strerror(errno));
- ret = -1;
goto out;
}
}
ret = 0;
out:
+ socket_set_nonblock(sockfd);
return ret;
}
@@ -793,8 +605,8 @@ static void coroutine_fn aio_read_response(void *opaque)
}
/* read a header */
- ret = do_read(fd, &rsp, sizeof(rsp));
- if (ret) {
+ ret = qemu_co_recv(fd, &rsp, sizeof(rsp));
+ if (ret < 0) {
error_report("failed to get the header, %s", strerror(errno));
goto out;
}
@@ -839,9 +651,9 @@ static void coroutine_fn aio_read_response(void *opaque)
}
break;
case AIOCB_READ_UDATA:
- ret = do_readv(fd, acb->qiov->iov, rsp.data_length,
- aio_req->iov_offset);
- if (ret) {
+ ret = qemu_co_recvv(fd, acb->qiov->iov, rsp.data_length,
+ aio_req->iov_offset);
+ if (ret < 0) {
error_report("failed to get the data, %s", strerror(errno));
goto out;
}
@@ -890,22 +702,6 @@ static int aio_flush_request(void *opaque)
return !QLIST_EMPTY(&s->outstanding_aio_head);
}
-#if !defined(SOL_TCP) || !defined(TCP_CORK)
-
-static int set_cork(int fd, int v)
-{
- return 0;
-}
-
-#else
-
-static int set_cork(int fd, int v)
-{
- return setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
-}
-
-#endif
-
static int set_nodelay(int fd)
{
int ret, opt;
@@ -1111,26 +907,26 @@ static int coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req,
s->co_send = qemu_coroutine_self();
qemu_aio_set_fd_handler(s->fd, co_read_response, co_write_request,
aio_flush_request, NULL, s);
- set_cork(s->fd, 1);
+ socket_set_cork(s->fd, 1);
/* send a header */
- ret = do_write(s->fd, &hdr, sizeof(hdr));
- if (ret) {
+ ret = qemu_co_send(s->fd, &hdr, sizeof(hdr));
+ if (ret < 0) {
qemu_co_mutex_unlock(&s->lock);
error_report("failed to send a req, %s", strerror(errno));
return -EIO;
}
if (wlen) {
- ret = do_writev(s->fd, iov, wlen, aio_req->iov_offset);
- if (ret) {
+ ret = qemu_co_sendv(s->fd, iov, wlen, aio_req->iov_offset);
+ if (ret < 0) {
qemu_co_mutex_unlock(&s->lock);
error_report("failed to send a data, %s", strerror(errno));
return -EIO;
}
}
- set_cork(s->fd, 0);
+ socket_set_cork(s->fd, 0);
qemu_aio_set_fd_handler(s->fd, co_read_response, NULL,
aio_flush_request, NULL, s);
qemu_co_mutex_unlock(&s->lock);
diff --git a/cutils.c b/cutils.c
index 24b3fe355..a6ffd4644 100644
--- a/cutils.c
+++ b/cutils.c
@@ -25,6 +25,8 @@
#include "host-utils.h"
#include <math.h>
+#include "qemu_socket.h"
+
void pstrcpy(char *buf, int buf_size, const char *str)
{
int c;
@@ -403,3 +405,112 @@ int qemu_parse_fd(const char *param)
}
return fd;
}
+
+/*
+ * Send/recv data with iovec buffers
+ *
+ * This function send/recv data from/to the iovec buffer directly.
+ * The first `offset' bytes in the iovec buffer are skipped and next
+ * `len' bytes are used.
+ *
+ * For example,
+ *
+ * do_sendv_recvv(sockfd, iov, len, offset, 1);
+ *
+ * is equal to
+ *
+ * char *buf = malloc(size);
+ * iov_to_buf(iov, iovcnt, buf, offset, size);
+ * send(sockfd, buf, size, 0);
+ * free(buf);
+ */
+static int do_sendv_recvv(int sockfd, struct iovec *iov, int len, int offset,
+ int do_sendv)
+{
+ int ret, diff, iovlen;
+ struct iovec *last_iov;
+
+ /* last_iov is inclusive, so count from one. */
+ iovlen = 1;
+ last_iov = iov;
+ len += offset;
+
+ while (last_iov->iov_len < len) {
+ len -= last_iov->iov_len;
+
+ last_iov++;
+ iovlen++;
+ }
+
+ diff = last_iov->iov_len - len;
+ last_iov->iov_len -= diff;
+
+ while (iov->iov_len <= offset) {
+ offset -= iov->iov_len;
+
+ iov++;
+ iovlen--;
+ }
+
+ iov->iov_base = (char *) iov->iov_base + offset;
+ iov->iov_len -= offset;
+
+ {
+#if defined CONFIG_IOVEC && defined CONFIG_POSIX
+ struct msghdr msg;
+ memset(&msg, 0, sizeof(msg));
+ msg.msg_iov = iov;
+ msg.msg_iovlen = iovlen;
+
+ do {
+ if (do_sendv) {
+ ret = sendmsg(sockfd, &msg, 0);
+ } else {
+ ret = recvmsg(sockfd, &msg, 0);
+ }
+ } while (ret == -1 && errno == EINTR);
+#else
+ struct iovec *p = iov;
+ ret = 0;
+ while (iovlen > 0) {
+ int rc;
+ if (do_sendv) {
+ rc = send(sockfd, p->iov_base, p->iov_len, 0);
+ } else {
+ rc = qemu_recv(sockfd, p->iov_base, p->iov_len, 0);
+ }
+ if (rc == -1) {
+ if (errno == EINTR) {
+ continue;
+ }
+ if (ret == 0) {
+ ret = -1;
+ }
+ break;
+ }
+ if (rc == 0) {
+ break;
+ }
+ ret += rc;
+ iovlen--, p++;
+ }
+#endif
+ }
+
+ /* Undo the changes above */
+ iov->iov_base = (char *) iov->iov_base - offset;
+ iov->iov_len += offset;
+ last_iov->iov_len += diff;
+ return ret;
+}
+
+int qemu_recvv(int sockfd, struct iovec *iov, int len, int iov_offset)
+{
+ return do_sendv_recvv(sockfd, iov, len, iov_offset, 0);
+}
+
+int qemu_sendv(int sockfd, struct iovec *iov, int len, int iov_offset)
+{
+ return do_sendv_recvv(sockfd, iov, len, iov_offset, 1);
+}
+
diff --git a/main-loop.h b/main-loop.h
index 876092dd1..f9710136c 100644
--- a/main-loop.h
+++ b/main-loop.h
@@ -324,6 +324,9 @@ int qemu_add_child_watch(pid_t pid);
* by threads other than the main loop thread when calling
* qemu_bh_new(), qemu_set_fd_handler() and basically all other
* functions documented in this file.
+ *
+ * NOTE: tools currently are single-threaded and qemu_mutex_lock_iothread
+ * is a no-op there.
*/
void qemu_mutex_lock_iothread(void);
@@ -336,6 +339,9 @@ void qemu_mutex_lock_iothread(void);
* as soon as possible by threads other than the main loop thread,
* because it prevents the main loop from processing callbacks,
* including timers and bottom halves.
+ *
+ * NOTE: tools currently are single-threaded and qemu_mutex_unlock_iothread
+ * is a no-op there.
*/
void qemu_mutex_unlock_iothread(void);
diff --git a/nbd.c b/nbd.c
index de880fe3c..567e94e27 100644
--- a/nbd.c
+++ b/nbd.c
@@ -18,6 +18,9 @@
#include "nbd.h"
#include "block.h"
+#include "block_int.h"
+
+#include "qemu-coroutine.h"
#include <errno.h>
#include <string.h>
@@ -35,6 +38,7 @@
#endif
#include "qemu_socket.h"
+#include "qemu-queue.h"
//#define DEBUG_NBD
@@ -81,6 +85,14 @@ size_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
{
size_t offset = 0;
+ if (qemu_in_coroutine()) {
+ if (do_read) {
+ return qemu_co_recv(fd, buffer, size);
+ } else {
+ return qemu_co_send(fd, buffer, size);
+ }
+ }
+
while (offset < size) {
ssize_t len;
@@ -178,7 +190,7 @@ int unix_socket_outgoing(const char *path)
Request (type == 2)
*/
-int nbd_negotiate(int csock, off_t size, uint32_t flags)
+static int nbd_send_negotiate(int csock, off_t size, uint32_t flags)
{
char buf[8 + 8 + 8 + 128];
@@ -194,7 +206,9 @@ int nbd_negotiate(int csock, off_t size, uint32_t flags)
memcpy(buf, "NBDMAGIC", 8);
cpu_to_be64w((uint64_t*)(buf + 8), 0x00420281861253LL);
cpu_to_be64w((uint64_t*)(buf + 16), size);
- cpu_to_be32w((uint32_t*)(buf + 24), flags | NBD_FLAG_HAS_FLAGS);
+ cpu_to_be32w((uint32_t*)(buf + 24),
+ flags | NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
+ NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
memset(buf + 28, 0, 124);
if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
@@ -348,6 +362,15 @@ int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
#ifdef __linux__
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
{
+ TRACE("Setting NBD socket");
+
+ if (ioctl(fd, NBD_SET_SOCK, csock) == -1) {
+ int serrno = errno;
+ LOG("Failed to set NBD socket");
+ errno = serrno;
+ return -1;
+ }
+
TRACE("Setting block size to %lu", (unsigned long)blocksize);
if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) == -1) {
@@ -386,24 +409,6 @@ int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
return -1;
}
- TRACE("Clearing NBD socket");
-
- if (ioctl(fd, NBD_CLEAR_SOCK) == -1) {
- int serrno = errno;
- LOG("Failed clearing NBD socket");
- errno = serrno;
- return -1;
- }
-
- TRACE("Setting NBD socket");
-
- if (ioctl(fd, NBD_SET_SOCK, csock) == -1) {
- int serrno = errno;
- LOG("Failed to set NBD socket");
- errno = serrno;
- return -1;
- }
-
TRACE("Negotiation ended");
return 0;
@@ -582,121 +587,369 @@ static int nbd_send_reply(int csock, struct nbd_reply *reply)
return 0;
}
-int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
- off_t *offset, uint32_t nbdflags, uint8_t *data, int data_size)
+#define MAX_NBD_REQUESTS 16
+
+typedef struct NBDRequest NBDRequest;
+
+struct NBDRequest {
+ QSIMPLEQ_ENTRY(NBDRequest) entry;
+ NBDClient *client;
+ uint8_t *data;
+};
+
+struct NBDExport {
+ BlockDriverState *bs;
+ off_t dev_offset;
+ off_t size;
+ uint32_t nbdflags;
+ QSIMPLEQ_HEAD(, NBDRequest) requests;
+};
+
+struct NBDClient {
+ int refcount;
+ void (*close)(NBDClient *client);
+
+ NBDExport *exp;
+ int sock;
+
+ Coroutine *recv_coroutine;
+
+ CoMutex send_lock;
+ Coroutine *send_coroutine;
+
+ int nb_requests;
+};
+
+static void nbd_client_get(NBDClient *client)
{
- struct nbd_request request;
- struct nbd_reply reply;
+ client->refcount++;
+}
- TRACE("Reading request.");
+static void nbd_client_put(NBDClient *client)
+{
+ if (--client->refcount == 0) {
+ g_free(client);
+ }
+}
- if (nbd_receive_request(csock, &request) == -1)
- return -1;
+static void nbd_client_close(NBDClient *client)
+{
+ qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
+ close(client->sock);
+ client->sock = -1;
+ if (client->close) {
+ client->close(client);
+ }
+ nbd_client_put(client);
+}
+
+static NBDRequest *nbd_request_get(NBDClient *client)
+{
+ NBDRequest *req;
+ NBDExport *exp = client->exp;
- if (request.len + NBD_REPLY_SIZE > data_size) {
+ assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
+ client->nb_requests++;
+
+ if (QSIMPLEQ_EMPTY(&exp->requests)) {
+ req = g_malloc0(sizeof(NBDRequest));
+ req->data = qemu_blockalign(exp->bs, NBD_BUFFER_SIZE);
+ } else {
+ req = QSIMPLEQ_FIRST(&exp->requests);
+ QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
+ }
+ nbd_client_get(client);
+ req->client = client;
+ return req;
+}
+
+static void nbd_request_put(NBDRequest *req)
+{
+ NBDClient *client = req->client;
+ QSIMPLEQ_INSERT_HEAD(&client->exp->requests, req, entry);
+ if (client->nb_requests-- == MAX_NBD_REQUESTS) {
+ qemu_notify_event();
+ }
+ nbd_client_put(client);
+}
+
+NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
+ off_t size, uint32_t nbdflags)
+{
+ NBDExport *exp = g_malloc0(sizeof(NBDExport));
+ QSIMPLEQ_INIT(&exp->requests);
+ exp->bs = bs;
+ exp->dev_offset = dev_offset;
+ exp->nbdflags = nbdflags;
+ exp->size = size == -1 ? exp->bs->total_sectors * 512 : size;
+ return exp;
+}
+
+void nbd_export_close(NBDExport *exp)
+{
+ while (!QSIMPLEQ_EMPTY(&exp->requests)) {
+ NBDRequest *first = QSIMPLEQ_FIRST(&exp->requests);
+ QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
+ qemu_vfree(first->data);
+ g_free(first);
+ }
+
+ bdrv_close(exp->bs);
+ g_free(exp);
+}
+
+static int nbd_can_read(void *opaque);
+static void nbd_read(void *opaque);
+static void nbd_restart_write(void *opaque);
+
+static int nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
+ int len)
+{
+ NBDClient *client = req->client;
+ int csock = client->sock;
+ int rc, ret;
+
+ qemu_co_mutex_lock(&client->send_lock);
+ qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
+ nbd_restart_write, client);
+ client->send_coroutine = qemu_coroutine_self();
+
+ if (!len) {
+ rc = nbd_send_reply(csock, reply);
+ if (rc == -1) {
+ rc = -errno;
+ }
+ } else {
+ socket_set_cork(csock, 1);
+ rc = nbd_send_reply(csock, reply);
+ if (rc != -1) {
+ ret = qemu_co_send(csock, req->data, len);
+ if (ret != len) {
+ errno = EIO;
+ rc = -1;
+ }
+ }
+ if (rc == -1) {
+ rc = -errno;
+ }
+ socket_set_cork(csock, 0);
+ }
+
+ client->send_coroutine = NULL;
+ qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
+ qemu_co_mutex_unlock(&client->send_lock);
+ return rc;
+}
+
+static int nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
+{
+ NBDClient *client = req->client;
+ int csock = client->sock;
+ int rc;
+
+ client->recv_coroutine = qemu_coroutine_self();
+ if (nbd_receive_request(csock, request) == -1) {
+ rc = -EIO;
+ goto out;
+ }
+
+ if (request->len > NBD_BUFFER_SIZE) {
LOG("len (%u) is larger than max len (%u)",
- request.len + NBD_REPLY_SIZE, data_size);
- errno = EINVAL;
- return -1;
+ request->len, NBD_BUFFER_SIZE);
+ rc = -EINVAL;
+ goto out;
}
- if ((request.from + request.len) < request.from) {
+ if ((request->from + request->len) < request->from) {
LOG("integer overflow detected! "
"you're probably being attacked");
- errno = EINVAL;
- return -1;
+ rc = -EINVAL;
+ goto out;
}
- if ((request.from + request.len) > size) {
- LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
- ", Offset: %" PRIu64 "\n",
- request.from, request.len, (uint64_t)size, dev_offset);
- LOG("requested operation past EOF--bad client?");
- errno = EINVAL;
- return -1;
+ TRACE("Decoding type");
+
+ if ((request->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
+ TRACE("Reading %u byte(s)", request->len);
+
+ if (qemu_co_recv(csock, req->data, request->len) != request->len) {
+ LOG("reading from socket failed");
+ rc = -EIO;
+ goto out;
+ }
}
+ rc = 0;
- TRACE("Decoding type");
+out:
+ client->recv_coroutine = NULL;
+ return rc;
+}
+
+static void nbd_trip(void *opaque)
+{
+ NBDClient *client = opaque;
+ NBDRequest *req = nbd_request_get(client);
+ NBDExport *exp = client->exp;
+ struct nbd_request request;
+ struct nbd_reply reply;
+ int ret;
+
+ TRACE("Reading request.");
+
+ ret = nbd_co_receive_request(req, &request);
+ if (ret == -EIO) {
+ goto out;
+ }
reply.handle = request.handle;
reply.error = 0;
- switch (request.type) {
+ if (ret < 0) {
+ reply.error = -ret;
+ goto error_reply;
+ }
+
+ if ((request.from + request.len) > exp->size) {
+ LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
+ ", Offset: %" PRIu64 "\n",
+ request.from, request.len,
+ (uint64_t)exp->size, exp->dev_offset);
+ LOG("requested operation past EOF--bad client?");
+ goto invalid_request;
+ }
+
+ switch (request.type & NBD_CMD_MASK_COMMAND) {
case NBD_CMD_READ:
TRACE("Request type is READ");
- if (bdrv_read(bs, (request.from + dev_offset) / 512,
- data + NBD_REPLY_SIZE,
- request.len / 512) == -1) {
+ ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
+ req->data, request.len / 512);
+ if (ret < 0) {
LOG("reading from file failed");
- errno = EINVAL;
- return -1;
+ reply.error = -ret;
+ goto error_reply;
}
- *offset += request.len;
TRACE("Read %u byte(s)", request.len);
-
- /* Reply
- [ 0 .. 3] magic (NBD_REPLY_MAGIC)
- [ 4 .. 7] error (0 == no error)
- [ 7 .. 15] handle
- */
-
- cpu_to_be32w((uint32_t*)data, NBD_REPLY_MAGIC);
- cpu_to_be32w((uint32_t*)(data + 4), reply.error);
- cpu_to_be64w((uint64_t*)(data + 8), reply.handle);
-
- TRACE("Sending data to client");
-
- if (write_sync(csock, data,
- request.len + NBD_REPLY_SIZE) !=
- request.len + NBD_REPLY_SIZE) {
- LOG("writing to socket failed");
- errno = EINVAL;
- return -1;
- }
+ if (nbd_co_send_reply(req, &reply, request.len) < 0)
+ goto out;
break;
case NBD_CMD_WRITE:
TRACE("Request type is WRITE");
- TRACE("Reading %u byte(s)", request.len);
-
- if (read_sync(csock, data, request.len) != request.len) {
- LOG("reading from socket failed");
- errno = EINVAL;
- return -1;
+ if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
+ TRACE("Server is read-only, return error");
+ reply.error = EROFS;
+ goto error_reply;
}
- if (nbdflags & NBD_FLAG_READ_ONLY) {
- TRACE("Server is read-only, return error");
- reply.error = 1;
- } else {
- TRACE("Writing to device");
+ TRACE("Writing to device");
- if (bdrv_write(bs, (request.from + dev_offset) / 512,
- data, request.len / 512) == -1) {
- LOG("writing to file failed");
- errno = EINVAL;
- return -1;
- }
+ ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
+ req->data, request.len / 512);
+ if (ret < 0) {
+ LOG("writing to file failed");
+ reply.error = -ret;
+ goto error_reply;
+ }
- *offset += request.len;
+ if (request.type & NBD_CMD_FLAG_FUA) {
+ ret = bdrv_co_flush(exp->bs);
+ if (ret < 0) {
+ LOG("flush failed");
+ reply.error = -ret;
+ goto error_reply;
+ }
}
- if (nbd_send_reply(csock, &reply) == -1)
- return -1;
+ if (nbd_co_send_reply(req, &reply, 0) < 0)
+ goto out;
break;
case NBD_CMD_DISC:
TRACE("Request type is DISCONNECT");
errno = 0;
- return 1;
+ goto out;
+ case NBD_CMD_FLUSH:
+ TRACE("Request type is FLUSH");
+
+ ret = bdrv_co_flush(exp->bs);
+ if (ret < 0) {
+ LOG("flush failed");
+ reply.error = -ret;
+ }
+
+ if (nbd_co_send_reply(req, &reply, 0) < 0)
+ goto out;
+ break;
+ case NBD_CMD_TRIM:
+ TRACE("Request type is TRIM");
+ ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
+ request.len / 512);
+ if (ret < 0) {
+ LOG("discard failed");
+ reply.error = -ret;
+ }
+ if (nbd_co_send_reply(req, &reply, 0) < 0)
+ goto out;
+ break;
default:
LOG("invalid request type (%u) received", request.type);
- errno = EINVAL;
- return -1;
+ invalid_request:
+ reply.error = -EINVAL;
+ error_reply:
+ if (nbd_co_send_reply(req, &reply, 0) == -1)
+ goto out;
+ break;
}
TRACE("Request/Reply complete");
- return 0;
+ nbd_request_put(req);
+ return;
+
+out:
+ nbd_request_put(req);
+ nbd_client_close(client);
+}
+
+static int nbd_can_read(void *opaque)
+{
+ NBDClient *client = opaque;
+
+ return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
+}
+
+static void nbd_read(void *opaque)
+{
+ NBDClient *client = opaque;
+
+ if (client->recv_coroutine) {
+ qemu_coroutine_enter(client->recv_coroutine, NULL);
+ } else {
+ qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
+ }
+}
+
+static void nbd_restart_write(void *opaque)
+{
+ NBDClient *client = opaque;
+
+ qemu_coroutine_enter(client->send_coroutine, NULL);
+}
+
+NBDClient *nbd_client_new(NBDExport *exp, int csock,
+ void (*close)(NBDClient *))
+{
+ NBDClient *client;
+ if (nbd_send_negotiate(csock, exp->size, exp->nbdflags) == -1) {
+ return NULL;
+ }
+ client = g_malloc0(sizeof(NBDClient));
+ client->refcount = 1;
+ client->exp = exp;
+ client->sock = csock;
+ client->close = close;
+ qemu_co_mutex_init(&client->send_lock);
+ qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
+ return client;
}
diff --git a/nbd.h b/nbd.h
index 61553f412..a8382f096 100644
--- a/nbd.h
+++ b/nbd.h
@@ -57,6 +57,8 @@ enum {
#define NBD_DEFAULT_PORT 10809
+#define NBD_BUFFER_SIZE (1024*1024)
+
size_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read);
int tcp_socket_outgoing(const char *address, uint16_t port);
int tcp_socket_incoming(const char *address, uint16_t port);
@@ -65,15 +67,21 @@ int tcp_socket_incoming_spec(const char *address_and_port);
int unix_socket_outgoing(const char *path);
int unix_socket_incoming(const char *path);
-int nbd_negotiate(int csock, off_t size, uint32_t flags);
int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
off_t *size, size_t *blocksize);
int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize);
int nbd_send_request(int csock, struct nbd_request *request);
int nbd_receive_reply(int csock, struct nbd_reply *reply);
-int nbd_trip(BlockDriverState *bs, int csock, off_t size, uint64_t dev_offset,
- off_t *offset, uint32_t nbdflags, uint8_t *data, int data_size);
int nbd_client(int fd);
int nbd_disconnect(int fd);
+typedef struct NBDExport NBDExport;
+typedef struct NBDClient NBDClient;
+
+NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
+ off_t size, uint32_t nbdflags);
+void nbd_export_close(NBDExport *exp);
+NBDClient *nbd_client_new(NBDExport *exp, int csock,
+ void (*close)(NBDClient *));
+
#endif
diff --git a/os-posix.c b/os-posix.c
index dc4a6bb3f..5c437ca12 100644
--- a/os-posix.c
+++ b/os-posix.c
@@ -42,11 +42,6 @@
#ifdef CONFIG_LINUX
#include <sys/prctl.h>
-#include <sys/syscall.h>
-#endif
-
-#ifdef CONFIG_EVENTFD
-#include <sys/eventfd.h>
#endif
static struct passwd *user_pwd;
@@ -333,34 +328,6 @@ void os_set_line_buffering(void)
setvbuf(stdout, NULL, _IOLBF, 0);
}
-/*
- * Creates an eventfd that looks like a pipe and has EFD_CLOEXEC set.
- */
-int qemu_eventfd(int fds[2])
-{
-#ifdef CONFIG_EVENTFD
- int ret;
-
- ret = eventfd(0, 0);
- if (ret >= 0) {
- fds[0] = ret;
- qemu_set_cloexec(ret);
- if ((fds[1] = dup(ret)) == -1) {
- close(ret);
- return -1;
- }
- qemu_set_cloexec(fds[1]);
- return 0;
- }
-
- if (errno != ENOSYS) {
- return -1;
- }
-#endif
-
- return qemu_pipe(fds);
-}
-
int qemu_create_pidfile(const char *filename)
{
char buffer[128];
@@ -384,12 +351,3 @@ int qemu_create_pidfile(const char *filename)
close(fd);
return 0;
}
-
-int qemu_get_thread_id(void)
-{
-#if defined (__linux__)
- return syscall(SYS_gettid);
-#else
- return getpid();
-#endif
-}
diff --git a/os-win32.c b/os-win32.c
index 8523d8d0c..ad76370c7 100644
--- a/os-win32.c
+++ b/os-win32.c
@@ -151,8 +151,3 @@ int qemu_create_pidfile(const char *filename)
}
return 0;
}
-
-int qemu_get_thread_id(void)
-{
- return GetCurrentThreadId();
-}
diff --git a/osdep.c b/osdep.c
index 56e6963f1..3e6badac1 100644
--- a/osdep.c
+++ b/osdep.c
@@ -48,6 +48,15 @@ extern int madvise(caddr_t, size_t, int);
#include "trace.h"
#include "qemu_socket.h"
+int socket_set_cork(int fd, int v)
+{
+#if defined(SOL_TCP) && defined(TCP_CORK)
+ return setsockopt(fd, SOL_TCP, TCP_CORK, &v, sizeof(v));
+#else
+ return 0;
+#endif
+}
+
int qemu_madvise(void *addr, size_t len, int advice)
{
if (advice == QEMU_MADV_INVALID) {
@@ -166,3 +175,70 @@ int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen)
return ret;
}
+
+/*
+ * A variant of send(2) which handles partial write.
+ *
+ * Return the number of bytes transferred, which is only
+ * smaller than `count' if there is an error.
+ *
+ * This function won't work with non-blocking fd's.
+ * Any of the possibilities with non-bloking fd's is bad:
+ * - return a short write (then name is wrong)
+ * - busy wait adding (errno == EAGAIN) to the loop
+ */
+ssize_t qemu_send_full(int fd, const void *buf, size_t count, int flags)
+{
+ ssize_t ret = 0;
+ ssize_t total = 0;
+
+ while (count) {
+ ret = send(fd, buf, count, flags);
+ if (ret < 0) {
+ if (errno == EINTR) {
+ continue;
+ }
+ break;
+ }
+
+ count -= ret;
+ buf += ret;
+ total += ret;
+ }
+
+ return total;
+}
+
+/*
+ * A variant of recv(2) which handles partial write.
+ *
+ * Return the number of bytes transferred, which is only
+ * smaller than `count' if there is an error.
+ *
+ * This function won't work with non-blocking fd's.
+ * Any of the possibilities with non-bloking fd's is bad:
+ * - return a short write (then name is wrong)
+ * - busy wait adding (errno == EAGAIN) to the loop
+ */
+ssize_t qemu_recv_full(int fd, void *buf, size_t count, int flags)
+{
+ ssize_t ret = 0;
+ ssize_t total = 0;
+
+ while (count) {
+ ret = qemu_recv(fd, buf, count, flags);
+ if (ret <= 0) {
+ if (ret < 0 && errno == EINTR) {
+ continue;
+ }
+ break;
+ }
+
+ count -= ret;
+ buf += ret;
+ total += ret;
+ }
+
+ return total;
+}
+
diff --git a/oslib-posix.c b/oslib-posix.c
index ce755496b..b6a3c7fc5 100644
--- a/oslib-posix.c
+++ b/oslib-posix.c
@@ -55,6 +55,21 @@ static int running_on_valgrind = -1;
#else
# define running_on_valgrind 0
#endif
+#ifdef CONFIG_LINUX
+#include <sys/syscall.h>
+#endif
+#ifdef CONFIG_EVENTFD
+#include <sys/eventfd.h>
+#endif
+
+int qemu_get_thread_id(void)
+{
+#if defined(__linux__)
+ return syscall(SYS_gettid);
+#else
+ return getpid();
+#endif
+}
int qemu_daemon(int nochdir, int noclose)
{
@@ -162,6 +177,34 @@ int qemu_pipe(int pipefd[2])
return ret;
}
+/*
+ * Creates an eventfd that looks like a pipe and has EFD_CLOEXEC set.
+ */
+int qemu_eventfd(int fds[2])
+{
+#ifdef CONFIG_EVENTFD
+ int ret;
+
+ ret = eventfd(0, 0);
+ if (ret >= 0) {
+ fds[0] = ret;
+ fds[1] = dup(ret);
+ if (fds[1] == -1) {
+ close(ret);
+ return -1;
+ }
+ qemu_set_cloexec(ret);
+ qemu_set_cloexec(fds[1]);
+ return 0;
+ }
+ if (errno != ENOSYS) {
+ return -1;
+ }
+#endif
+
+ return qemu_pipe(fds);
+}
+
int qemu_utimens(const char *path, const struct timespec *times)
{
struct timeval tv[2], tv_now;
diff --git a/oslib-win32.c b/oslib-win32.c
index 5e3de7dc8..ce3021e6c 100644
--- a/oslib-win32.c
+++ b/oslib-win32.c
@@ -118,3 +118,8 @@ int qemu_gettimeofday(qemu_timeval *tp)
Do not set errno on error. */
return 0;
}
+
+int qemu_get_thread_id(void)
+{
+ return GetCurrentThreadId();
+}
diff --git a/qemu-common.h b/qemu-common.h
index b2de01562..6ab7dfb1b 100644
--- a/qemu-common.h
+++ b/qemu-common.h
@@ -173,6 +173,10 @@ void *qemu_oom_check(void *ptr);
int qemu_open(const char *name, int flags, ...);
ssize_t qemu_write_full(int fd, const void *buf, size_t count)
QEMU_WARN_UNUSED_RESULT;
+ssize_t qemu_send_full(int fd, const void *buf, size_t count, int flags)
+ QEMU_WARN_UNUSED_RESULT;
+ssize_t qemu_recv_full(int fd, void *buf, size_t count, int flags)
+ QEMU_WARN_UNUSED_RESULT;
void qemu_set_cloexec(int fd);
#ifndef _WIN32
@@ -186,6 +190,9 @@ int qemu_pipe(int pipefd[2]);
#define qemu_recv(sockfd, buf, len, flags) recv(sockfd, buf, len, flags)
#endif
+int qemu_recvv(int sockfd, struct iovec *iov, int len, int iov_offset);
+int qemu_sendv(int sockfd, struct iovec *iov, int len, int iov_offset);
+
/* Error handling. */
void QEMU_NORETURN hw_error(const char *fmt, ...) GCC_FMT_ATTR(1, 2);
@@ -272,6 +279,33 @@ struct qemu_work_item {
void qemu_init_vcpu(void *env);
#endif
+/**
+ * Sends an iovec (or optionally a part of it) down a socket, yielding
+ * when the socket is full.
+ */
+int qemu_co_sendv(int sockfd, struct iovec *iov,
+ int len, int iov_offset);
+
+/**
+ * Receives data into an iovec (or optionally into a part of it) from
+ * a socket, yielding when there is no data in the socket.
+ */
+int qemu_co_recvv(int sockfd, struct iovec *iov,
+ int len, int iov_offset);
+
+
+/**
+ * Sends a buffer down a socket, yielding when the socket is full.
+ */
+int qemu_co_send(int sockfd, void *buf, int len);
+
+/**
+ * Receives data into a buffer from a socket, yielding when there
+ * is no data in the socket.
+ */
+int qemu_co_recv(int sockfd, void *buf, int len);
+
+
typedef struct QEMUIOVector {
struct iovec *iov;
int niov;
diff --git a/qemu-coroutine-io.c b/qemu-coroutine-io.c
new file mode 100644
index 000000000..40fd51439
--- /dev/null
+++ b/qemu-coroutine-io.c
@@ -0,0 +1,96 @@
+/*
+ * Coroutine-aware I/O functions
+ *
+ * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation.
+ * Copyright (c) 2011, Red Hat, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "qemu-common.h"
+#include "qemu_socket.h"
+#include "qemu-coroutine.h"
+
+int coroutine_fn qemu_co_recvv(int sockfd, struct iovec *iov,
+ int len, int iov_offset)
+{
+ int total = 0;
+ int ret;
+ while (len) {
+ ret = qemu_recvv(sockfd, iov, len, iov_offset + total);
+ if (ret < 0) {
+ if (errno == EAGAIN) {
+ qemu_coroutine_yield();
+ continue;
+ }
+ if (total == 0) {
+ total = -1;
+ }
+ break;
+ }
+ if (ret == 0) {
+ break;
+ }
+ total += ret, len -= ret;
+ }
+
+ return total;
+}
+
+int coroutine_fn qemu_co_sendv(int sockfd, struct iovec *iov,
+ int len, int iov_offset)
+{
+ int total = 0;
+ int ret;
+ while (len) {
+ ret = qemu_sendv(sockfd, iov, len, iov_offset + total);
+ if (ret < 0) {
+ if (errno == EAGAIN) {
+ qemu_coroutine_yield();
+ continue;
+ }
+ if (total == 0) {
+ total = -1;
+ }
+ break;
+ }
+ total += ret, len -= ret;
+ }
+
+ return total;
+}
+
+int coroutine_fn qemu_co_recv(int sockfd, void *buf, int len)
+{
+ struct iovec iov;
+
+ iov.iov_base = buf;
+ iov.iov_len = len;
+
+ return qemu_co_recvv(sockfd, &iov, len, 0);
+}
+
+int coroutine_fn qemu_co_send(int sockfd, void *buf, int len)
+{
+ struct iovec iov;
+
+ iov.iov_base = buf;
+ iov.iov_len = len;
+
+ return qemu_co_sendv(sockfd, &iov, len, 0);
+}
diff --git a/qemu-nbd.c b/qemu-nbd.c
index 291cba2ea..155b05840 100644
--- a/qemu-nbd.c
+++ b/qemu-nbd.c
@@ -35,13 +35,15 @@
#define SOCKET_PATH "/var/lock/qemu-nbd-%s"
-#define NBD_BUFFER_SIZE (1024*1024)
-
-static int sigterm_wfd;
+static NBDExport *exp;
static int verbose;
static char *device;
static char *srcpath;
static char *sockpath;
+static bool sigterm_reported;
+static bool nbd_started;
+static int shared = 1;
+static int nb_fds;
static void usage(const char *name)
{
@@ -170,10 +172,8 @@ static int find_partition(BlockDriverState *bs, int partition,
static void termsig_handler(int signum)
{
- static int sigterm_reported;
- if (!sigterm_reported) {
- sigterm_reported = (write(sigterm_wfd, "", 1) == 1);
- }
+ sigterm_reported = true;
+ qemu_notify_event();
}
static void *show_parts(void *arg)
@@ -244,17 +244,38 @@ out:
return (void *) EXIT_FAILURE;
}
+static int nbd_can_accept(void *opaque)
+{
+ return nb_fds < shared;
+}
+
+static void nbd_client_closed(NBDClient *client)
+{
+ nb_fds--;
+ qemu_notify_event();
+}
+
+static void nbd_accept(void *opaque)
+{
+ int server_fd = (uintptr_t) opaque;
+ struct sockaddr_in addr;
+ socklen_t addr_len = sizeof(addr);
+
+ int fd = accept(server_fd, (struct sockaddr *)&addr, &addr_len);
+ nbd_started = true;
+ if (fd != -1 && nbd_client_new(exp, fd, nbd_client_closed)) {
+ nb_fds++;
+ }
+}
+
int main(int argc, char **argv)
{
BlockDriverState *bs;
off_t dev_offset = 0;
- off_t offset = 0;
uint32_t nbdflags = 0;
bool disconnect = false;
const char *bindto = "0.0.0.0";
int port = NBD_DEFAULT_PORT;
- struct sockaddr_in addr;
- socklen_t addr_len = sizeof(addr);
off_t fd_size;
const char *sopt = "hVb:o:p:rsnP:c:dvk:e:t";
struct option lopt[] = {
@@ -282,14 +303,7 @@ int main(int argc, char **argv)
int flags = BDRV_O_RDWR;
int partition = -1;
int ret;
- int shared = 1;
- uint8_t *data;
- fd_set fds;
- int *sharing_fds;
int fd;
- int i;
- int nb_fds = 0;
- int max_fd;
int persistent = 0;
pthread_t client_thread;
@@ -297,12 +311,6 @@ int main(int argc, char **argv)
* handler ensures that "qemu-nbd -v -c" exits with a nice status code.
*/
struct sigaction sa_sigterm;
- int sigterm_fd[2];
- if (qemu_pipe(sigterm_fd) == -1) {
- err(EXIT_FAILURE, "Error setting up communication pipe");
- }
-
- sigterm_wfd = sigterm_fd[1];
memset(&sa_sigterm, 0, sizeof(sa_sigterm));
sa_sigterm.sa_handler = termsig_handler;
sigaction(SIGTERM, &sa_sigterm, NULL);
@@ -492,16 +500,17 @@ int main(int argc, char **argv)
err(EXIT_FAILURE, "Could not find partition %d", partition);
}
- sharing_fds = g_malloc((shared + 1) * sizeof(int));
+ exp = nbd_export_new(bs, dev_offset, fd_size, nbdflags);
if (sockpath) {
- sharing_fds[0] = unix_socket_incoming(sockpath);
+ fd = unix_socket_incoming(sockpath);
} else {
- sharing_fds[0] = tcp_socket_incoming(bindto, port);
+ fd = tcp_socket_incoming(bindto, port);
}
- if (sharing_fds[0] == -1)
+ if (fd == -1) {
return 1;
+ }
if (device) {
int ret;
@@ -516,60 +525,15 @@ int main(int argc, char **argv)
memset(&client_thread, 0, sizeof(client_thread));
}
- max_fd = sharing_fds[0];
- nb_fds++;
-
- data = qemu_blockalign(bs, NBD_BUFFER_SIZE);
- if (data == NULL) {
- errx(EXIT_FAILURE, "Cannot allocate data buffer");
- }
+ qemu_init_main_loop();
+ qemu_set_fd_handler2(fd, nbd_can_accept, nbd_accept, NULL,
+ (void *)(uintptr_t)fd);
do {
- FD_ZERO(&fds);
- FD_SET(sigterm_fd[0], &fds);
- for (i = 0; i < nb_fds; i++)
- FD_SET(sharing_fds[i], &fds);
-
- do {
- ret = select(max_fd + 1, &fds, NULL, NULL, NULL);
- } while (ret == -1 && errno == EINTR);
- if (ret == -1 || FD_ISSET(sigterm_fd[0], &fds)) {
- break;
- }
-
- if (FD_ISSET(sharing_fds[0], &fds))
- ret--;
- for (i = 1; i < nb_fds && ret; i++) {
- if (FD_ISSET(sharing_fds[i], &fds)) {
- if (nbd_trip(bs, sharing_fds[i], fd_size, dev_offset,
- &offset, nbdflags, data, NBD_BUFFER_SIZE) != 0) {
- close(sharing_fds[i]);
- nb_fds--;
- sharing_fds[i] = sharing_fds[nb_fds];
- i--;
- }
- ret--;
- }
- }
- /* new connection ? */
- if (FD_ISSET(sharing_fds[0], &fds)) {
- if (nb_fds < shared + 1) {
- sharing_fds[nb_fds] = accept(sharing_fds[0],
- (struct sockaddr *)&addr,
- &addr_len);
- if (sharing_fds[nb_fds] != -1 &&
- nbd_negotiate(sharing_fds[nb_fds], fd_size, nbdflags) != -1) {
- if (sharing_fds[nb_fds] > max_fd)
- max_fd = sharing_fds[nb_fds];
- nb_fds++;
- }
- }
- }
- } while (persistent || nb_fds > 1);
- qemu_vfree(data);
+ main_loop_wait(false);
+ } while (!sigterm_reported && (persistent || !nbd_started || nb_fds > 0));
- close(sharing_fds[0]);
- g_free(sharing_fds);
+ nbd_export_close(exp);
if (sockpath) {
unlink(sockpath);
}
diff --git a/qemu-tool.c b/qemu-tool.c
index 5df727974..226b6e890 100644
--- a/qemu-tool.c
+++ b/qemu-tool.c
@@ -16,12 +16,12 @@
#include "qemu-timer.h"
#include "qemu-log.h"
#include "migration.h"
+#include "main-loop.h"
+#include "qemu_socket.h"
+#include "slirp/libslirp.h"
#include <sys/time.h>
-QEMUClock *rt_clock;
-QEMUClock *vm_clock;
-
FILE *logfile;
struct QEMUBH
@@ -57,41 +57,45 @@ void monitor_protocol_event(MonitorEvent event, QObject *data)
{
}
-int qemu_set_fd_handler2(int fd,
- IOCanReadHandler *fd_read_poll,
- IOHandler *fd_read,
- IOHandler *fd_write,
- void *opaque)
+int64 cpu_get_clock(void)
{
- return 0;
+ abort();
}
-void qemu_notify_event(void)
+int64 cpu_get_icount(void)
{
+ abort();
}
-QEMUTimer *qemu_new_timer(QEMUClock *clock, int scale,
- QEMUTimerCB *cb, void *opaque)
+void qemu_mutex_lock_iothread(void)
{
- return g_malloc(1);
}
-void qemu_free_timer(QEMUTimer *ts)
+void qemu_mutex_unlock_iothread(void)
{
- g_free(ts);
}
-void qemu_del_timer(QEMUTimer *ts)
+int use_icount;
+
+void qemu_clock_warp(QEMUClock *clock)
{
}
-void qemu_mod_timer(QEMUTimer *ts, int64_t expire_time)
+static void __attribute__((constructor)) init_main_loop(void)
{
+ init_clocks();
+ init_timer_alarm();
+ qemu_clock_enable(vm_clock, false);
}
-int64_t qemu_get_clock_ns(QEMUClock *clock)
+void slirp_select_fill(int *pnfds, fd_set *readfds,
+ fd_set *writefds, fd_set *xfds)
+{
+}
+
+void slirp_select_poll(fd_set *readfds, fd_set *writefds,
+ fd_set *xfds, int select_error)
{
- return 0;
}
void migrate_add_blocker(Error *reason)
diff --git a/qemu_socket.h b/qemu_socket.h
index 9e32fac65..fe4cf6ca6 100644
--- a/qemu_socket.h
+++ b/qemu_socket.h
@@ -35,6 +35,7 @@ int inet_aton(const char *cp, struct in_addr *ia);
/* misc helpers */
int qemu_socket(int domain, int type, int protocol);
int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen);
+int socket_set_cork(int fd, int v);
void socket_set_block(int fd);
void socket_set_nonblock(int fd);
int send_all(int fd, const void *buf, int len1);