2
0

nbd.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988
  1. /*
  2. * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
  3. *
  4. * Network Block Device
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; under version 2 of the License.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, see <http://www.gnu.org/licenses/>.
  17. */
  18. #include "nbd.h"
  19. #include "block.h"
  20. #include "qemu-coroutine.h"
  21. #include <errno.h>
  22. #include <string.h>
  23. #ifndef _WIN32
  24. #include <sys/ioctl.h>
  25. #endif
  26. #if defined(__sun__) || defined(__HAIKU__)
  27. #include <sys/ioccom.h>
  28. #endif
  29. #include <ctype.h>
  30. #include <inttypes.h>
  31. #ifdef __linux__
  32. #include <linux/fs.h>
  33. #endif
  34. #include "qemu_socket.h"
  35. #include "qemu-queue.h"
  /* Uncomment to route TRACE() messages through LOG(). */
  /* #define DEBUG_NBD */

  /*
   * LOG: unconditionally print a formatted message to stderr, prefixed with
   * the source file, the enclosing function and the line number.
   */
  #define LOG(msg, ...)                                                  \
      do {                                                               \
          fprintf(stderr, "%s:%s():L%d: " msg "\n",                      \
                  __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__);     \
      } while (0)

  /* TRACE: identical to LOG when DEBUG_NBD is defined, otherwise a no-op. */
  #ifdef DEBUG_NBD
  #define TRACE(msg, ...)                 \
      do {                                \
          LOG(msg, ## __VA_ARGS__);       \
      } while (0)
  #else
  #define TRACE(msg, ...) do { } while (0)
  #endif
  /* This is all part of the "official" NBD API */

  /* Wire size of a reply header: magic (4) + error (4) + handle (8). */
  #define NBD_REPLY_SIZE          (4 + 4 + 8)

  /* Magic numbers that open every request and reply header. */
  #define NBD_REQUEST_MAGIC       0x25609513
  #define NBD_REPLY_MAGIC         0x67446698

  /* ioctl request codes, all in the 0xab group. */
  #define NBD_SET_SOCK            _IO(0xab, 0)
  #define NBD_SET_BLKSIZE         _IO(0xab, 1)
  #define NBD_SET_SIZE            _IO(0xab, 2)
  #define NBD_DO_IT               _IO(0xab, 3)
  #define NBD_CLEAR_SOCK          _IO(0xab, 4)
  #define NBD_CLEAR_QUE           _IO(0xab, 5)
  #define NBD_PRINT_DEBUG         _IO(0xab, 6)
  #define NBD_SET_SIZE_BLOCKS     _IO(0xab, 7)
  #define NBD_DISCONNECT          _IO(0xab, 8)
  #define NBD_SET_TIMEOUT         _IO(0xab, 9)
  #define NBD_SET_FLAGS           _IO(0xab, 10)

  /* Option code sent by the client together with the export name. */
  #define NBD_OPT_EXPORT_NAME     (1 << 0)

  /* That's all folks */
  /*
   * nbd_wr_sync: read or write @size bytes on @fd.
   *
   * Inside a coroutine the transfer is handed to qemu_co_recv() or
   * qemu_co_send().  Otherwise the function loops until @size bytes have
   * been moved, the peer signals EOF (a zero-length transfer), or an
   * unrecoverable error occurs.  EINTR is always retried; EAGAIN is retried
   * only once part of the buffer has already been transferred.
   *
   * Returns the number of bytes transferred (short on EOF), or the negated
   * value of socket_error() on failure.
   */
  ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
  {
      char *bytes = buffer;
      size_t done = 0;

      if (qemu_in_coroutine()) {
          if (do_read) {
              return qemu_co_recv(fd, buffer, size);
          }
          return qemu_co_send(fd, buffer, size);
      }

      while (done < size) {
          ssize_t chunk;
          int err;

          if (do_read) {
              chunk = qemu_recv(fd, bytes + done, size - done, 0);
          } else {
              chunk = send(fd, bytes + done, size - done, 0);
          }

          if (chunk > 0) {
              done += chunk;
              continue;
          }

          if (chunk == 0) {
              /* eof */
              break;
          }

          err = socket_error();
          if (err == EINTR || (done > 0 && err == EAGAIN)) {
              /* recoverable error: try the same transfer again */
              continue;
          }

          /* unrecoverable error */
          return -err;
      }

      return done;
  }
  /*
   * read_sync: receive @size bytes from @fd into @buffer via nbd_wr_sync().
   *
   * Sockets are kept in blocking mode in the negotiation phase.  After
   * that, a non-readable socket simply means that another thread stole
   * our request/reply.  Synchronization is done with recv_coroutine, so
   * that this is coroutine-safe.
   */
  static ssize_t read_sync(int fd, void *buffer, size_t size)
  {
      const bool reading = true;

      return nbd_wr_sync(fd, buffer, size, reading);
  }
  /*
   * write_sync: send @size bytes from @buffer on @fd via nbd_wr_sync().
   *
   * For writes, we do expect the socket to be writable, so the call is
   * simply repeated for as long as nbd_wr_sync() reports -EAGAIN.
   *
   * Returns whatever nbd_wr_sync() returns: the number of bytes written or
   * a negative errno value.  The result is kept in an ssize_t so that it is
   * not narrowed to int before being returned.
   */
  static ssize_t write_sync(int fd, void *buffer, size_t size)
  {
      ssize_t ret;

      do {
          ret = nbd_wr_sync(fd, buffer, size, false);
      } while (ret == -EAGAIN);

      return ret;
  }
  /*
   * combine_addr: format @address and @port as "address:port" into @buf
   * (at most @len bytes, as bounded by snprintf).
   *
   * An address that itself contains a colon is taken to be an IPv6 literal
   * and is wrapped in square brackets.
   */
  static void combine_addr(char *buf, size_t len, const char* address,
                           uint16_t port)
  {
      if (strchr(address, ':') == NULL) {
          snprintf(buf, len, "%s:%u", address, port);
          return;
      }

      /* IPv6 literal: needs [] around the address part */
      snprintf(buf, len, "[%s]:%u", address, port);
  }
  /*
   * tcp_socket_outgoing: build an "address:port" string and pass it to
   * tcp_socket_outgoing_spec(); returns that function's result.
   */
  int tcp_socket_outgoing(const char *address, uint16_t port)
  {
      char spec[128];

      combine_addr(spec, sizeof(spec), address, port);
      return tcp_socket_outgoing_spec(spec);
  }
  /*
   * tcp_socket_outgoing_spec: thin wrapper that hands @address_and_port to
   * inet_connect() and returns its result unchanged.
   */
  int tcp_socket_outgoing_spec(const char *address_and_port)
  {
      int fd = inet_connect(address_and_port, true, NULL);

      return fd;
  }
  /*
   * tcp_socket_incoming: build an "address:port" string and pass it to
   * tcp_socket_incoming_spec(); returns that function's result.
   */
  int tcp_socket_incoming(const char *address, uint16_t port)
  {
      char spec[128];

      combine_addr(spec, sizeof(spec), address, port);
      return tcp_socket_incoming_spec(spec);
  }
  /*
   * tcp_socket_incoming_spec: call inet_listen() on @address_and_port for a
   * SOCK_STREAM socket, with no output buffer (NULL, length 0), and return
   * its result.
   */
  int tcp_socket_incoming_spec(const char *address_and_port)
  {
      return inet_listen(address_and_port, NULL, 0, SOCK_STREAM, 0, NULL);
  }
  /*
   * unix_socket_incoming: call unix_listen() on @path with no output buffer
   * (NULL, length 0) and return its result.
   */
  int unix_socket_incoming(const char *path)
  {
      return unix_listen(path, NULL, 0);
  }
  /*
   * unix_socket_outgoing: thin wrapper that hands @path to unix_connect()
   * and returns its result unchanged.
   */
  int unix_socket_outgoing(const char *path)
  {
      int fd = unix_connect(path);

      return fd;
  }
  /* Basic flow

     Server         Client

     Negotiate
                    Request
     Response
                    Request
     Response
                    ...
     ...
                    Request (type == 2)
  */
  172. static int nbd_send_negotiate(int csock, off_t size, uint32_t flags)
  173. {
  174. char buf[8 + 8 + 8 + 128];
  175. int rc;
  176. /* Negotiate
  177. [ 0 .. 7] passwd ("NBDMAGIC")
  178. [ 8 .. 15] magic (0x00420281861253)
  179. [16 .. 23] size
  180. [24 .. 27] flags
  181. [28 .. 151] reserved (0)
  182. */
  183. socket_set_block(csock);
  184. rc = -EINVAL;
  185. TRACE("Beginning negotiation.");
  186. memcpy(buf, "NBDMAGIC", 8);
  187. cpu_to_be64w((uint64_t*)(buf + 8), 0x00420281861253LL);
  188. cpu_to_be64w((uint64_t*)(buf + 16), size);
  189. cpu_to_be32w((uint32_t*)(buf + 24),
  190. flags | NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
  191. NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
  192. memset(buf + 28, 0, 124);
  193. if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
  194. LOG("write failed");
  195. goto fail;
  196. }
  197. TRACE("Negotiation succeeded.");
  198. rc = 0;
  199. fail:
  200. socket_set_nonblock(csock);
  201. return rc;
  202. }
  203. int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
  204. off_t *size, size_t *blocksize)
  205. {
  206. char buf[256];
  207. uint64_t magic, s;
  208. uint16_t tmp;
  209. int rc;
  210. TRACE("Receiving negotiation.");
  211. socket_set_block(csock);
  212. rc = -EINVAL;
  213. if (read_sync(csock, buf, 8) != 8) {
  214. LOG("read failed");
  215. goto fail;
  216. }
  217. buf[8] = '\0';
  218. if (strlen(buf) == 0) {
  219. LOG("server connection closed");
  220. goto fail;
  221. }
  222. TRACE("Magic is %c%c%c%c%c%c%c%c",
  223. qemu_isprint(buf[0]) ? buf[0] : '.',
  224. qemu_isprint(buf[1]) ? buf[1] : '.',
  225. qemu_isprint(buf[2]) ? buf[2] : '.',
  226. qemu_isprint(buf[3]) ? buf[3] : '.',
  227. qemu_isprint(buf[4]) ? buf[4] : '.',
  228. qemu_isprint(buf[5]) ? buf[5] : '.',
  229. qemu_isprint(buf[6]) ? buf[6] : '.',
  230. qemu_isprint(buf[7]) ? buf[7] : '.');
  231. if (memcmp(buf, "NBDMAGIC", 8) != 0) {
  232. LOG("Invalid magic received");
  233. goto fail;
  234. }
  235. if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
  236. LOG("read failed");
  237. goto fail;
  238. }
  239. magic = be64_to_cpu(magic);
  240. TRACE("Magic is 0x%" PRIx64, magic);
  241. if (name) {
  242. uint32_t reserved = 0;
  243. uint32_t opt;
  244. uint32_t namesize;
  245. TRACE("Checking magic (opts_magic)");
  246. if (magic != 0x49484156454F5054LL) {
  247. LOG("Bad magic received");
  248. goto fail;
  249. }
  250. if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
  251. LOG("flags read failed");
  252. goto fail;
  253. }
  254. *flags = be16_to_cpu(tmp) << 16;
  255. /* reserved for future use */
  256. if (write_sync(csock, &reserved, sizeof(reserved)) !=
  257. sizeof(reserved)) {
  258. LOG("write failed (reserved)");
  259. goto fail;
  260. }
  261. /* write the export name */
  262. magic = cpu_to_be64(magic);
  263. if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
  264. LOG("write failed (magic)");
  265. goto fail;
  266. }
  267. opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
  268. if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
  269. LOG("write failed (opt)");
  270. goto fail;
  271. }
  272. namesize = cpu_to_be32(strlen(name));
  273. if (write_sync(csock, &namesize, sizeof(namesize)) !=
  274. sizeof(namesize)) {
  275. LOG("write failed (namesize)");
  276. goto fail;
  277. }
  278. if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
  279. LOG("write failed (name)");
  280. goto fail;
  281. }
  282. } else {
  283. TRACE("Checking magic (cli_magic)");
  284. if (magic != 0x00420281861253LL) {
  285. LOG("Bad magic received");
  286. goto fail;
  287. }
  288. }
  289. if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
  290. LOG("read failed");
  291. goto fail;
  292. }
  293. *size = be64_to_cpu(s);
  294. *blocksize = 1024;
  295. TRACE("Size is %" PRIu64, *size);
  296. if (!name) {
  297. if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
  298. LOG("read failed (flags)");
  299. goto fail;
  300. }
  301. *flags = be32_to_cpup(flags);
  302. } else {
  303. if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
  304. LOG("read failed (tmp)");
  305. goto fail;
  306. }
  307. *flags |= be32_to_cpu(tmp);
  308. }
  309. if (read_sync(csock, &buf, 124) != 124) {
  310. LOG("read failed (buf)");
  311. goto fail;
  312. }
  313. rc = 0;
  314. fail:
  315. socket_set_nonblock(csock);
  316. return rc;
  317. }
  318. #ifdef __linux__
  319. int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
  320. {
  321. TRACE("Setting NBD socket");
  322. if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
  323. int serrno = errno;
  324. LOG("Failed to set NBD socket");
  325. return -serrno;
  326. }
  327. TRACE("Setting block size to %lu", (unsigned long)blocksize);
  328. if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
  329. int serrno = errno;
  330. LOG("Failed setting NBD block size");
  331. return -serrno;
  332. }
  333. TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
  334. if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
  335. int serrno = errno;
  336. LOG("Failed setting size (in blocks)");
  337. return -serrno;
  338. }
  339. if (flags & NBD_FLAG_READ_ONLY) {
  340. int read_only = 1;
  341. TRACE("Setting readonly attribute");
  342. if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
  343. int serrno = errno;
  344. LOG("Failed setting read-only attribute");
  345. return -serrno;
  346. }
  347. }
  348. if (ioctl(fd, NBD_SET_FLAGS, flags) < 0
  349. && errno != ENOTTY) {
  350. int serrno = errno;
  351. LOG("Failed setting flags");
  352. return -serrno;
  353. }
  354. TRACE("Negotiation ended");
  355. return 0;
  356. }
  357. int nbd_disconnect(int fd)
  358. {
  359. ioctl(fd, NBD_CLEAR_QUE);
  360. ioctl(fd, NBD_DISCONNECT);
  361. ioctl(fd, NBD_CLEAR_SOCK);
  362. return 0;
  363. }
  364. int nbd_client(int fd)
  365. {
  366. int ret;
  367. int serrno;
  368. TRACE("Doing NBD loop");
  369. ret = ioctl(fd, NBD_DO_IT);
  370. if (ret < 0 && errno == EPIPE) {
  371. /* NBD_DO_IT normally returns EPIPE when someone has disconnected
  372. * the socket via NBD_DISCONNECT. We do not want to return 1 in
  373. * that case.
  374. */
  375. ret = 0;
  376. }
  377. serrno = errno;
  378. TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
  379. TRACE("Clearing NBD queue");
  380. ioctl(fd, NBD_CLEAR_QUE);
  381. TRACE("Clearing NBD socket");
  382. ioctl(fd, NBD_CLEAR_SOCK);
  383. errno = serrno;
  384. return ret;
  385. }
  386. #else
  /*
   * Fallbacks compiled when __linux__ is not defined: each entry point
   * ignores its arguments and reports -ENOTSUP.
   */
  int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
  {
      (void)fd;
      (void)csock;
      (void)flags;
      (void)size;
      (void)blocksize;
      return -ENOTSUP;
  }

  int nbd_disconnect(int fd)
  {
      (void)fd;
      return -ENOTSUP;
  }

  int nbd_client(int fd)
  {
      (void)fd;
      return -ENOTSUP;
  }
  399. #endif
  400. ssize_t nbd_send_request(int csock, struct nbd_request *request)
  401. {
  402. uint8_t buf[4 + 4 + 8 + 8 + 4];
  403. ssize_t ret;
  404. cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
  405. cpu_to_be32w((uint32_t*)(buf + 4), request->type);
  406. cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
  407. cpu_to_be64w((uint64_t*)(buf + 16), request->from);
  408. cpu_to_be32w((uint32_t*)(buf + 24), request->len);
  409. TRACE("Sending request to client: "
  410. "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
  411. request->from, request->len, request->handle, request->type);
  412. ret = write_sync(csock, buf, sizeof(buf));
  413. if (ret < 0) {
  414. return ret;
  415. }
  416. if (ret != sizeof(buf)) {
  417. LOG("writing to socket failed");
  418. return -EINVAL;
  419. }
  420. return 0;
  421. }
  422. static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
  423. {
  424. uint8_t buf[4 + 4 + 8 + 8 + 4];
  425. uint32_t magic;
  426. ssize_t ret;
  427. ret = read_sync(csock, buf, sizeof(buf));
  428. if (ret < 0) {
  429. return ret;
  430. }
  431. if (ret != sizeof(buf)) {
  432. LOG("read failed");
  433. return -EINVAL;
  434. }
  435. /* Request
  436. [ 0 .. 3] magic (NBD_REQUEST_MAGIC)
  437. [ 4 .. 7] type (0 == READ, 1 == WRITE)
  438. [ 8 .. 15] handle
  439. [16 .. 23] from
  440. [24 .. 27] len
  441. */
  442. magic = be32_to_cpup((uint32_t*)buf);
  443. request->type = be32_to_cpup((uint32_t*)(buf + 4));
  444. request->handle = be64_to_cpup((uint64_t*)(buf + 8));
  445. request->from = be64_to_cpup((uint64_t*)(buf + 16));
  446. request->len = be32_to_cpup((uint32_t*)(buf + 24));
  447. TRACE("Got request: "
  448. "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
  449. magic, request->type, request->from, request->len);
  450. if (magic != NBD_REQUEST_MAGIC) {
  451. LOG("invalid magic (got 0x%x)", magic);
  452. return -EINVAL;
  453. }
  454. return 0;
  455. }
  456. ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
  457. {
  458. uint8_t buf[NBD_REPLY_SIZE];
  459. uint32_t magic;
  460. ssize_t ret;
  461. ret = read_sync(csock, buf, sizeof(buf));
  462. if (ret < 0) {
  463. return ret;
  464. }
  465. if (ret != sizeof(buf)) {
  466. LOG("read failed");
  467. return -EINVAL;
  468. }
  469. /* Reply
  470. [ 0 .. 3] magic (NBD_REPLY_MAGIC)
  471. [ 4 .. 7] error (0 == no error)
  472. [ 7 .. 15] handle
  473. */
  474. magic = be32_to_cpup((uint32_t*)buf);
  475. reply->error = be32_to_cpup((uint32_t*)(buf + 4));
  476. reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
  477. TRACE("Got reply: "
  478. "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
  479. magic, reply->error, reply->handle);
  480. if (magic != NBD_REPLY_MAGIC) {
  481. LOG("invalid magic (got 0x%x)", magic);
  482. return -EINVAL;
  483. }
  484. return 0;
  485. }
  486. static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
  487. {
  488. uint8_t buf[4 + 4 + 8];
  489. ssize_t ret;
  490. /* Reply
  491. [ 0 .. 3] magic (NBD_REPLY_MAGIC)
  492. [ 4 .. 7] error (0 == no error)
  493. [ 7 .. 15] handle
  494. */
  495. cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
  496. cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
  497. cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
  498. TRACE("Sending response to client");
  499. ret = write_sync(csock, buf, sizeof(buf));
  500. if (ret < 0) {
  501. return ret;
  502. }
  503. if (ret != sizeof(buf)) {
  504. LOG("writing to socket failed");
  505. return -EINVAL;
  506. }
  507. return 0;
  508. }
  509. #define MAX_NBD_REQUESTS 16
  510. typedef struct NBDRequest NBDRequest;
  511. struct NBDRequest {
  512. QSIMPLEQ_ENTRY(NBDRequest) entry;
  513. NBDClient *client;
  514. uint8_t *data;
  515. };
  516. struct NBDExport {
  517. BlockDriverState *bs;
  518. off_t dev_offset;
  519. off_t size;
  520. uint32_t nbdflags;
  521. QSIMPLEQ_HEAD(, NBDRequest) requests;
  522. };
  523. struct NBDClient {
  524. int refcount;
  525. void (*close)(NBDClient *client);
  526. NBDExport *exp;
  527. int sock;
  528. Coroutine *recv_coroutine;
  529. CoMutex send_lock;
  530. Coroutine *send_coroutine;
  531. int nb_requests;
  532. };
  533. static void nbd_client_get(NBDClient *client)
  534. {
  535. client->refcount++;
  536. }
  537. static void nbd_client_put(NBDClient *client)
  538. {
  539. if (--client->refcount == 0) {
  540. g_free(client);
  541. }
  542. }
  543. static void nbd_client_close(NBDClient *client)
  544. {
  545. qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
  546. close(client->sock);
  547. client->sock = -1;
  548. if (client->close) {
  549. client->close(client);
  550. }
  551. nbd_client_put(client);
  552. }
  553. static NBDRequest *nbd_request_get(NBDClient *client)
  554. {
  555. NBDRequest *req;
  556. NBDExport *exp = client->exp;
  557. assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
  558. client->nb_requests++;
  559. if (QSIMPLEQ_EMPTY(&exp->requests)) {
  560. req = g_malloc0(sizeof(NBDRequest));
  561. req->data = qemu_blockalign(exp->bs, NBD_BUFFER_SIZE);
  562. } else {
  563. req = QSIMPLEQ_FIRST(&exp->requests);
  564. QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
  565. }
  566. nbd_client_get(client);
  567. req->client = client;
  568. return req;
  569. }
  570. static void nbd_request_put(NBDRequest *req)
  571. {
  572. NBDClient *client = req->client;
  573. QSIMPLEQ_INSERT_HEAD(&client->exp->requests, req, entry);
  574. if (client->nb_requests-- == MAX_NBD_REQUESTS) {
  575. qemu_notify_event();
  576. }
  577. nbd_client_put(client);
  578. }
  579. NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
  580. off_t size, uint32_t nbdflags)
  581. {
  582. NBDExport *exp = g_malloc0(sizeof(NBDExport));
  583. QSIMPLEQ_INIT(&exp->requests);
  584. exp->bs = bs;
  585. exp->dev_offset = dev_offset;
  586. exp->nbdflags = nbdflags;
  587. exp->size = size == -1 ? bdrv_getlength(bs) : size;
  588. return exp;
  589. }
  590. void nbd_export_close(NBDExport *exp)
  591. {
  592. while (!QSIMPLEQ_EMPTY(&exp->requests)) {
  593. NBDRequest *first = QSIMPLEQ_FIRST(&exp->requests);
  594. QSIMPLEQ_REMOVE_HEAD(&exp->requests, entry);
  595. qemu_vfree(first->data);
  596. g_free(first);
  597. }
  598. bdrv_close(exp->bs);
  599. g_free(exp);
  600. }
  601. static int nbd_can_read(void *opaque);
  602. static void nbd_read(void *opaque);
  603. static void nbd_restart_write(void *opaque);
  604. static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
  605. int len)
  606. {
  607. NBDClient *client = req->client;
  608. int csock = client->sock;
  609. ssize_t rc, ret;
  610. qemu_co_mutex_lock(&client->send_lock);
  611. qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
  612. nbd_restart_write, client);
  613. client->send_coroutine = qemu_coroutine_self();
  614. if (!len) {
  615. rc = nbd_send_reply(csock, reply);
  616. } else {
  617. socket_set_cork(csock, 1);
  618. rc = nbd_send_reply(csock, reply);
  619. if (rc >= 0) {
  620. ret = qemu_co_send(csock, req->data, len);
  621. if (ret != len) {
  622. rc = -EIO;
  623. }
  624. }
  625. socket_set_cork(csock, 0);
  626. }
  627. client->send_coroutine = NULL;
  628. qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
  629. qemu_co_mutex_unlock(&client->send_lock);
  630. return rc;
  631. }
  632. static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
  633. {
  634. NBDClient *client = req->client;
  635. int csock = client->sock;
  636. ssize_t rc;
  637. client->recv_coroutine = qemu_coroutine_self();
  638. rc = nbd_receive_request(csock, request);
  639. if (rc < 0) {
  640. if (rc != -EAGAIN) {
  641. rc = -EIO;
  642. }
  643. goto out;
  644. }
  645. if (request->len > NBD_BUFFER_SIZE) {
  646. LOG("len (%u) is larger than max len (%u)",
  647. request->len, NBD_BUFFER_SIZE);
  648. rc = -EINVAL;
  649. goto out;
  650. }
  651. if ((request->from + request->len) < request->from) {
  652. LOG("integer overflow detected! "
  653. "you're probably being attacked");
  654. rc = -EINVAL;
  655. goto out;
  656. }
  657. TRACE("Decoding type");
  658. if ((request->type & NBD_CMD_MASK_COMMAND) == NBD_CMD_WRITE) {
  659. TRACE("Reading %u byte(s)", request->len);
  660. if (qemu_co_recv(csock, req->data, request->len) != request->len) {
  661. LOG("reading from socket failed");
  662. rc = -EIO;
  663. goto out;
  664. }
  665. }
  666. rc = 0;
  667. out:
  668. client->recv_coroutine = NULL;
  669. return rc;
  670. }
  671. static void nbd_trip(void *opaque)
  672. {
  673. NBDClient *client = opaque;
  674. NBDRequest *req = nbd_request_get(client);
  675. NBDExport *exp = client->exp;
  676. struct nbd_request request;
  677. struct nbd_reply reply;
  678. ssize_t ret;
  679. TRACE("Reading request.");
  680. ret = nbd_co_receive_request(req, &request);
  681. if (ret == -EAGAIN) {
  682. goto done;
  683. }
  684. if (ret == -EIO) {
  685. goto out;
  686. }
  687. reply.handle = request.handle;
  688. reply.error = 0;
  689. if (ret < 0) {
  690. reply.error = -ret;
  691. goto error_reply;
  692. }
  693. if ((request.from + request.len) > exp->size) {
  694. LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
  695. ", Offset: %" PRIu64 "\n",
  696. request.from, request.len,
  697. (uint64_t)exp->size, (uint64_t)exp->dev_offset);
  698. LOG("requested operation past EOF--bad client?");
  699. goto invalid_request;
  700. }
  701. switch (request.type & NBD_CMD_MASK_COMMAND) {
  702. case NBD_CMD_READ:
  703. TRACE("Request type is READ");
  704. if (request.type & NBD_CMD_FLAG_FUA) {
  705. ret = bdrv_co_flush(exp->bs);
  706. if (ret < 0) {
  707. LOG("flush failed");
  708. reply.error = -ret;
  709. goto error_reply;
  710. }
  711. }
  712. ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
  713. req->data, request.len / 512);
  714. if (ret < 0) {
  715. LOG("reading from file failed");
  716. reply.error = -ret;
  717. goto error_reply;
  718. }
  719. TRACE("Read %u byte(s)", request.len);
  720. if (nbd_co_send_reply(req, &reply, request.len) < 0)
  721. goto out;
  722. break;
  723. case NBD_CMD_WRITE:
  724. TRACE("Request type is WRITE");
  725. if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
  726. TRACE("Server is read-only, return error");
  727. reply.error = EROFS;
  728. goto error_reply;
  729. }
  730. TRACE("Writing to device");
  731. ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
  732. req->data, request.len / 512);
  733. if (ret < 0) {
  734. LOG("writing to file failed");
  735. reply.error = -ret;
  736. goto error_reply;
  737. }
  738. if (request.type & NBD_CMD_FLAG_FUA) {
  739. ret = bdrv_co_flush(exp->bs);
  740. if (ret < 0) {
  741. LOG("flush failed");
  742. reply.error = -ret;
  743. goto error_reply;
  744. }
  745. }
  746. if (nbd_co_send_reply(req, &reply, 0) < 0) {
  747. goto out;
  748. }
  749. break;
  750. case NBD_CMD_DISC:
  751. TRACE("Request type is DISCONNECT");
  752. errno = 0;
  753. goto out;
  754. case NBD_CMD_FLUSH:
  755. TRACE("Request type is FLUSH");
  756. ret = bdrv_co_flush(exp->bs);
  757. if (ret < 0) {
  758. LOG("flush failed");
  759. reply.error = -ret;
  760. }
  761. if (nbd_co_send_reply(req, &reply, 0) < 0) {
  762. goto out;
  763. }
  764. break;
  765. case NBD_CMD_TRIM:
  766. TRACE("Request type is TRIM");
  767. ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
  768. request.len / 512);
  769. if (ret < 0) {
  770. LOG("discard failed");
  771. reply.error = -ret;
  772. }
  773. if (nbd_co_send_reply(req, &reply, 0) < 0) {
  774. goto out;
  775. }
  776. break;
  777. default:
  778. LOG("invalid request type (%u) received", request.type);
  779. invalid_request:
  780. reply.error = -EINVAL;
  781. error_reply:
  782. if (nbd_co_send_reply(req, &reply, 0) < 0) {
  783. goto out;
  784. }
  785. break;
  786. }
  787. TRACE("Request/Reply complete");
  788. done:
  789. nbd_request_put(req);
  790. return;
  791. out:
  792. nbd_request_put(req);
  793. nbd_client_close(client);
  794. }
  795. static int nbd_can_read(void *opaque)
  796. {
  797. NBDClient *client = opaque;
  798. return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
  799. }
  800. static void nbd_read(void *opaque)
  801. {
  802. NBDClient *client = opaque;
  803. if (client->recv_coroutine) {
  804. qemu_coroutine_enter(client->recv_coroutine, NULL);
  805. } else {
  806. qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
  807. }
  808. }
  809. static void nbd_restart_write(void *opaque)
  810. {
  811. NBDClient *client = opaque;
  812. qemu_coroutine_enter(client->send_coroutine, NULL);
  813. }
  814. NBDClient *nbd_client_new(NBDExport *exp, int csock,
  815. void (*close)(NBDClient *))
  816. {
  817. NBDClient *client;
  818. if (nbd_send_negotiate(csock, exp->size, exp->nbdflags) < 0) {
  819. return NULL;
  820. }
  821. client = g_malloc0(sizeof(NBDClient));
  822. client->refcount = 1;
  823. client->exp = exp;
  824. client->sock = csock;
  825. client->close = close;
  826. qemu_co_mutex_init(&client->send_lock);
  827. qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
  828. return client;
  829. }