2
0

nbd.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308
  1. /*
  2. * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
  3. *
  4. * Network Block Device
  5. *
  6. * This program is free software; you can redistribute it and/or modify
  7. * it under the terms of the GNU General Public License as published by
  8. * the Free Software Foundation; under version 2 of the License.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, see <http://www.gnu.org/licenses/>.
  17. */
  18. #include "block/nbd.h"
  19. #include "block/block.h"
  20. #include "block/coroutine.h"
  21. #include <errno.h>
  22. #include <string.h>
  23. #ifndef _WIN32
  24. #include <sys/ioctl.h>
  25. #endif
  26. #if defined(__sun__) || defined(__HAIKU__)
  27. #include <sys/ioccom.h>
  28. #endif
  29. #include <ctype.h>
  30. #include <inttypes.h>
  31. #ifdef __linux__
  32. #include <linux/fs.h>
  33. #endif
  34. #include "qemu/sockets.h"
  35. #include "qemu/queue.h"
  36. #include "qemu/main-loop.h"
  /* Uncomment to compile in the TRACE() debug messages below. */
  //#define DEBUG_NBD
  /*
   * TRACE(msg, ...): debug-only message.  Forwards to LOG() when DEBUG_NBD
   * is defined and expands to an empty statement otherwise, so its arguments
   * are not evaluated in normal builds.
   */
  #ifdef DEBUG_NBD
  #define TRACE(msg, ...) do { \
      LOG(msg, ## __VA_ARGS__); \
  } while(0)
  #else
  #define TRACE(msg, ...) \
      do { } while (0)
  #endif
  /*
   * LOG(msg, ...): unconditionally print a printf-style message to stderr,
   * prefixed with the file, function and line of the call site.  @msg must
   * be a string literal because it is pasted between two other literals.
   */
  #define LOG(msg, ...) do { \
      fprintf(stderr, "%s:%s():L%d: " msg "\n", \
              __FILE__, __FUNCTION__, __LINE__, ## __VA_ARGS__); \
  } while(0)
  /* This is all part of the "official" NBD API.
   *
   * The most up-to-date documentation is available at:
   * https://github.com/yoe/nbd/blob/master/doc/proto.txt
   */
  /* Wire size of a request header: magic, type, handle, from, len. */
  #define NBD_REQUEST_SIZE (4 + 4 + 8 + 8 + 4)
  /* Wire size of a reply header: magic, error, handle. */
  #define NBD_REPLY_SIZE (4 + 4 + 8)
  /* Magic numbers that open request headers, reply headers, the two
   * negotiation variants and option replies. */
  #define NBD_REQUEST_MAGIC 0x25609513
  #define NBD_REPLY_MAGIC 0x67446698
  #define NBD_OPTS_MAGIC 0x49484156454F5054LL
  #define NBD_CLIENT_MAGIC 0x0000420281861253LL
  #define NBD_REP_MAGIC 0x3e889045565a9LL
  /* ioctl request codes issued on the NBD device file descriptor. */
  #define NBD_SET_SOCK _IO(0xab, 0)
  #define NBD_SET_BLKSIZE _IO(0xab, 1)
  #define NBD_SET_SIZE _IO(0xab, 2)
  #define NBD_DO_IT _IO(0xab, 3)
  #define NBD_CLEAR_SOCK _IO(0xab, 4)
  #define NBD_CLEAR_QUE _IO(0xab, 5)
  #define NBD_PRINT_DEBUG _IO(0xab, 6)
  #define NBD_SET_SIZE_BLOCKS _IO(0xab, 7)
  #define NBD_DISCONNECT _IO(0xab, 8)
  #define NBD_SET_TIMEOUT _IO(0xab, 9)
  #define NBD_SET_FLAGS _IO(0xab, 10)
  /* Option codes a client may send during option negotiation. */
  #define NBD_OPT_EXPORT_NAME (1)
  #define NBD_OPT_ABORT (2)
  #define NBD_OPT_LIST (3)
  /* Definitions for opaque data types */
  typedef struct NBDRequest NBDRequest;
  /* One in-flight request being served for a client. */
  struct NBDRequest {
      /* Queue linkage; not used in the code visible here (TODO confirm use). */
      QSIMPLEQ_ENTRY(NBDRequest) entry;
      /* Client this request belongs to; nbd_request_get() takes a reference
       * on it and nbd_request_put() drops it. */
      NBDClient *client;
      /* Payload buffer, allocated with qemu_blockalign() for reads and
       * writes and released with qemu_vfree(); NULL otherwise. */
      uint8_t *data;
  };
  /* A block device published over NBD. */
  struct NBDExport {
      /* Reference count, managed by nbd_export_get()/nbd_export_put(). */
      int refcount;
      /* Optional callback run by nbd_export_put() just before freeing. */
      void (*close)(NBDExport *exp);
      /* Backing block device; referenced in nbd_export_new() and released
       * (then set to NULL) in nbd_export_close(). */
      BlockDriverState *bs;
      /* Heap copy of the export name, or NULL while the export is not on
       * the global 'exports' list. */
      char *name;
      /* Offset into bs supplied by the creator of the export. */
      off_t dev_offset;
      /* Size advertised to clients during negotiation. */
      off_t size;
      /* Export flags; asserted to fit in 16 bits when negotiated. */
      uint32_t nbdflags;
      /* Clients attached through nbd_handle_export_name(). */
      QTAILQ_HEAD(, NBDClient) clients;
      /* Linkage in the global 'exports' list. */
      QTAILQ_ENTRY(NBDExport) next;
  };
  /* All named exports; membership is maintained by nbd_export_set_name(). */
  static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);
  /* Server-side state of one connected NBD client. */
  struct NBDClient {
      /* Reference count, managed by nbd_client_get()/nbd_client_put(). */
      int refcount;
      /* Optional callback invoked from nbd_client_close(). */
      void (*close)(NBDClient *client);
      /* Export the client is attached to, or NULL until one is selected. */
      NBDExport *exp;
      /* Connected socket; closed and set to -1 on the final put. */
      int sock;
      /* Coroutine currently receiving a request, NULL when idle. */
      Coroutine *recv_coroutine;
      /* Held by nbd_co_send_reply() for the duration of one reply. */
      CoMutex send_lock;
      /* Coroutine currently sending a reply, NULL when idle. */
      Coroutine *send_coroutine;
      /* Linkage in exp->clients. */
      QTAILQ_ENTRY(NBDClient) next;
      /* Number of live NBDRequest objects; bounded by MAX_NBD_REQUESTS. */
      int nb_requests;
      /* Set once nbd_client_close() has run, making further calls no-ops. */
      bool closing;
  };
  107. /* That's all folks */
  /*
   * Read (@do_read == true) or write (@do_read == false) up to @size bytes
   * between @fd and @buffer.
   *
   * Inside a coroutine the work is handed to qemu_co_recv()/qemu_co_send().
   * Otherwise the transfer loops until everything has been moved, EOF is
   * seen, or an unrecoverable socket error occurs.
   *
   * Returns the number of bytes transferred (possibly short on EOF) or a
   * negative errno value.
   */
  ssize_t nbd_wr_sync(int fd, void *buffer, size_t size, bool do_read)
  {
      size_t done = 0;

      if (qemu_in_coroutine()) {
          return do_read ? qemu_co_recv(fd, buffer, size)
                         : qemu_co_send(fd, buffer, size);
      }

      while (done < size) {
          ssize_t n;
          int err;

          if (do_read) {
              n = qemu_recv(fd, buffer + done, size - done, 0);
          } else {
              n = send(fd, buffer + done, size - done, 0);
          }

          if (n == 0) {
              /* eof */
              break;
          }
          if (n > 0) {
              done += n;
              continue;
          }

          err = socket_error();
          /* EINTR is always retried; EAGAIN only once the transfer has
           * started, so a caller polling an idle socket still sees it. */
          if (err == EINTR || (done > 0 && err == EAGAIN)) {
              continue;
          }
          /* unrecoverable error */
          return -err;
      }

      return done;
  }
  /*
   * Read @size bytes from @fd into @buffer via nbd_wr_sync().
   *
   * Sockets are blocking during negotiation.  Afterwards a socket with
   * nothing to read just means another thread took our request/reply;
   * recv_coroutine provides the synchronization that keeps this
   * coroutine-safe.
   */
  static ssize_t read_sync(int fd, void *buffer, size_t size)
  {
      const bool reading = true;

      return nbd_wr_sync(fd, buffer, size, reading);
  }
  /*
   * Write @size bytes from @buffer to @fd via nbd_wr_sync(), retrying for as
   * long as the socket reports -EAGAIN.
   *
   * Returns the number of bytes written or a negative errno value.
   */
  static ssize_t write_sync(int fd, void *buffer, size_t size)
  {
      /* ssize_t, not int: nbd_wr_sync() returns ssize_t and the value must
       * not be truncated on its way back to the caller. */
      ssize_t ret;

      do {
          /* For writes, we do expect the socket to be writable. */
          ret = nbd_wr_sync(fd, buffer, size, false);
      } while (ret == -EAGAIN);

      return ret;
  }
  /* Basic flow for negotiation

     Server         Client
     Negotiate

     or

     Server         Client
     Negotiate #1
                    Option
     Negotiate #2

     ----

     followed by

     Server         Client
                    Request
     Response
                    Request
     Response
                    ...
     ...
                    Request (type == 2)

  */
  180. static int nbd_send_rep(int csock, uint32_t type, uint32_t opt)
  181. {
  182. uint64_t magic;
  183. uint32_t len;
  184. magic = cpu_to_be64(NBD_REP_MAGIC);
  185. if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
  186. LOG("write failed (rep magic)");
  187. return -EINVAL;
  188. }
  189. opt = cpu_to_be32(opt);
  190. if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
  191. LOG("write failed (rep opt)");
  192. return -EINVAL;
  193. }
  194. type = cpu_to_be32(type);
  195. if (write_sync(csock, &type, sizeof(type)) != sizeof(type)) {
  196. LOG("write failed (rep type)");
  197. return -EINVAL;
  198. }
  199. len = cpu_to_be32(0);
  200. if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
  201. LOG("write failed (rep data length)");
  202. return -EINVAL;
  203. }
  204. return 0;
  205. }
  206. static int nbd_send_rep_list(int csock, NBDExport *exp)
  207. {
  208. uint64_t magic, name_len;
  209. uint32_t opt, type, len;
  210. name_len = strlen(exp->name);
  211. magic = cpu_to_be64(NBD_REP_MAGIC);
  212. if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
  213. LOG("write failed (magic)");
  214. return -EINVAL;
  215. }
  216. opt = cpu_to_be32(NBD_OPT_LIST);
  217. if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
  218. LOG("write failed (opt)");
  219. return -EINVAL;
  220. }
  221. type = cpu_to_be32(NBD_REP_SERVER);
  222. if (write_sync(csock, &type, sizeof(type)) != sizeof(type)) {
  223. LOG("write failed (reply type)");
  224. return -EINVAL;
  225. }
  226. len = cpu_to_be32(name_len + sizeof(len));
  227. if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
  228. LOG("write failed (length)");
  229. return -EINVAL;
  230. }
  231. len = cpu_to_be32(name_len);
  232. if (write_sync(csock, &len, sizeof(len)) != sizeof(len)) {
  233. LOG("write failed (length)");
  234. return -EINVAL;
  235. }
  236. if (write_sync(csock, exp->name, name_len) != name_len) {
  237. LOG("write failed (buffer)");
  238. return -EINVAL;
  239. }
  240. return 0;
  241. }
  242. static int nbd_handle_list(NBDClient *client, uint32_t length)
  243. {
  244. int csock;
  245. NBDExport *exp;
  246. csock = client->sock;
  247. if (length) {
  248. return nbd_send_rep(csock, NBD_REP_ERR_INVALID, NBD_OPT_LIST);
  249. }
  250. /* For each export, send a NBD_REP_SERVER reply. */
  251. QTAILQ_FOREACH(exp, &exports, next) {
  252. if (nbd_send_rep_list(csock, exp)) {
  253. return -EINVAL;
  254. }
  255. }
  256. /* Finish with a NBD_REP_ACK. */
  257. return nbd_send_rep(csock, NBD_REP_ACK, NBD_OPT_LIST);
  258. }
  259. static int nbd_handle_export_name(NBDClient *client, uint32_t length)
  260. {
  261. int rc = -EINVAL, csock = client->sock;
  262. char name[256];
  263. /* Client sends:
  264. [20 .. xx] export name (length bytes)
  265. */
  266. TRACE("Checking length");
  267. if (length > 255) {
  268. LOG("Bad length received");
  269. goto fail;
  270. }
  271. if (read_sync(csock, name, length) != length) {
  272. LOG("read failed");
  273. goto fail;
  274. }
  275. name[length] = '\0';
  276. client->exp = nbd_export_find(name);
  277. if (!client->exp) {
  278. LOG("export not found");
  279. goto fail;
  280. }
  281. QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
  282. nbd_export_get(client->exp);
  283. rc = 0;
  284. fail:
  285. return rc;
  286. }
  287. static int nbd_receive_options(NBDClient *client)
  288. {
  289. while (1) {
  290. int csock = client->sock;
  291. uint32_t tmp, length;
  292. uint64_t magic;
  293. /* Client sends:
  294. [ 0 .. 3] client flags
  295. [ 4 .. 11] NBD_OPTS_MAGIC
  296. [12 .. 15] NBD option
  297. [16 .. 19] length
  298. ... Rest of request
  299. */
  300. if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
  301. LOG("read failed");
  302. return -EINVAL;
  303. }
  304. TRACE("Checking client flags");
  305. tmp = be32_to_cpu(tmp);
  306. if (tmp != 0 && tmp != NBD_FLAG_C_FIXED_NEWSTYLE) {
  307. LOG("Bad client flags received");
  308. return -EINVAL;
  309. }
  310. if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
  311. LOG("read failed");
  312. return -EINVAL;
  313. }
  314. TRACE("Checking opts magic");
  315. if (magic != be64_to_cpu(NBD_OPTS_MAGIC)) {
  316. LOG("Bad magic received");
  317. return -EINVAL;
  318. }
  319. if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
  320. LOG("read failed");
  321. return -EINVAL;
  322. }
  323. if (read_sync(csock, &length, sizeof(length)) != sizeof(length)) {
  324. LOG("read failed");
  325. return -EINVAL;
  326. }
  327. length = be32_to_cpu(length);
  328. TRACE("Checking option");
  329. switch (be32_to_cpu(tmp)) {
  330. case NBD_OPT_LIST:
  331. if (nbd_handle_list(client, length) < 0) {
  332. return 1;
  333. }
  334. break;
  335. case NBD_OPT_ABORT:
  336. return -EINVAL;
  337. case NBD_OPT_EXPORT_NAME:
  338. return nbd_handle_export_name(client, length);
  339. default:
  340. tmp = be32_to_cpu(tmp);
  341. LOG("Unsupported option 0x%x", tmp);
  342. nbd_send_rep(client->sock, NBD_REP_ERR_UNSUP, tmp);
  343. return -EINVAL;
  344. }
  345. }
  346. }
  347. static int nbd_send_negotiate(NBDClient *client)
  348. {
  349. int csock = client->sock;
  350. char buf[8 + 8 + 8 + 128];
  351. int rc;
  352. const int myflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_TRIM |
  353. NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_FUA);
  354. /* Negotiation header without options:
  355. [ 0 .. 7] passwd ("NBDMAGIC")
  356. [ 8 .. 15] magic (NBD_CLIENT_MAGIC)
  357. [16 .. 23] size
  358. [24 .. 25] server flags (0)
  359. [26 .. 27] export flags
  360. [28 .. 151] reserved (0)
  361. Negotiation header with options, part 1:
  362. [ 0 .. 7] passwd ("NBDMAGIC")
  363. [ 8 .. 15] magic (NBD_OPTS_MAGIC)
  364. [16 .. 17] server flags (0)
  365. part 2 (after options are sent):
  366. [18 .. 25] size
  367. [26 .. 27] export flags
  368. [28 .. 151] reserved (0)
  369. */
  370. qemu_set_block(csock);
  371. rc = -EINVAL;
  372. TRACE("Beginning negotiation.");
  373. memset(buf, 0, sizeof(buf));
  374. memcpy(buf, "NBDMAGIC", 8);
  375. if (client->exp) {
  376. assert ((client->exp->nbdflags & ~65535) == 0);
  377. cpu_to_be64w((uint64_t*)(buf + 8), NBD_CLIENT_MAGIC);
  378. cpu_to_be64w((uint64_t*)(buf + 16), client->exp->size);
  379. cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
  380. } else {
  381. cpu_to_be64w((uint64_t*)(buf + 8), NBD_OPTS_MAGIC);
  382. cpu_to_be16w((uint16_t *)(buf + 16), NBD_FLAG_FIXED_NEWSTYLE);
  383. }
  384. if (client->exp) {
  385. if (write_sync(csock, buf, sizeof(buf)) != sizeof(buf)) {
  386. LOG("write failed");
  387. goto fail;
  388. }
  389. } else {
  390. if (write_sync(csock, buf, 18) != 18) {
  391. LOG("write failed");
  392. goto fail;
  393. }
  394. rc = nbd_receive_options(client);
  395. if (rc != 0) {
  396. LOG("option negotiation failed");
  397. goto fail;
  398. }
  399. assert ((client->exp->nbdflags & ~65535) == 0);
  400. cpu_to_be64w((uint64_t*)(buf + 18), client->exp->size);
  401. cpu_to_be16w((uint16_t*)(buf + 26), client->exp->nbdflags | myflags);
  402. if (write_sync(csock, buf + 18, sizeof(buf) - 18) != sizeof(buf) - 18) {
  403. LOG("write failed");
  404. goto fail;
  405. }
  406. }
  407. TRACE("Negotiation succeeded.");
  408. rc = 0;
  409. fail:
  410. qemu_set_nonblock(csock);
  411. return rc;
  412. }
  413. int nbd_receive_negotiate(int csock, const char *name, uint32_t *flags,
  414. off_t *size, size_t *blocksize)
  415. {
  416. char buf[256];
  417. uint64_t magic, s;
  418. uint16_t tmp;
  419. int rc;
  420. TRACE("Receiving negotiation.");
  421. rc = -EINVAL;
  422. if (read_sync(csock, buf, 8) != 8) {
  423. LOG("read failed");
  424. goto fail;
  425. }
  426. buf[8] = '\0';
  427. if (strlen(buf) == 0) {
  428. LOG("server connection closed");
  429. goto fail;
  430. }
  431. TRACE("Magic is %c%c%c%c%c%c%c%c",
  432. qemu_isprint(buf[0]) ? buf[0] : '.',
  433. qemu_isprint(buf[1]) ? buf[1] : '.',
  434. qemu_isprint(buf[2]) ? buf[2] : '.',
  435. qemu_isprint(buf[3]) ? buf[3] : '.',
  436. qemu_isprint(buf[4]) ? buf[4] : '.',
  437. qemu_isprint(buf[5]) ? buf[5] : '.',
  438. qemu_isprint(buf[6]) ? buf[6] : '.',
  439. qemu_isprint(buf[7]) ? buf[7] : '.');
  440. if (memcmp(buf, "NBDMAGIC", 8) != 0) {
  441. LOG("Invalid magic received");
  442. goto fail;
  443. }
  444. if (read_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
  445. LOG("read failed");
  446. goto fail;
  447. }
  448. magic = be64_to_cpu(magic);
  449. TRACE("Magic is 0x%" PRIx64, magic);
  450. if (name) {
  451. uint32_t reserved = 0;
  452. uint32_t opt;
  453. uint32_t namesize;
  454. TRACE("Checking magic (opts_magic)");
  455. if (magic != NBD_OPTS_MAGIC) {
  456. LOG("Bad magic received");
  457. goto fail;
  458. }
  459. if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
  460. LOG("flags read failed");
  461. goto fail;
  462. }
  463. *flags = be16_to_cpu(tmp) << 16;
  464. /* reserved for future use */
  465. if (write_sync(csock, &reserved, sizeof(reserved)) !=
  466. sizeof(reserved)) {
  467. LOG("write failed (reserved)");
  468. goto fail;
  469. }
  470. /* write the export name */
  471. magic = cpu_to_be64(magic);
  472. if (write_sync(csock, &magic, sizeof(magic)) != sizeof(magic)) {
  473. LOG("write failed (magic)");
  474. goto fail;
  475. }
  476. opt = cpu_to_be32(NBD_OPT_EXPORT_NAME);
  477. if (write_sync(csock, &opt, sizeof(opt)) != sizeof(opt)) {
  478. LOG("write failed (opt)");
  479. goto fail;
  480. }
  481. namesize = cpu_to_be32(strlen(name));
  482. if (write_sync(csock, &namesize, sizeof(namesize)) !=
  483. sizeof(namesize)) {
  484. LOG("write failed (namesize)");
  485. goto fail;
  486. }
  487. if (write_sync(csock, (char*)name, strlen(name)) != strlen(name)) {
  488. LOG("write failed (name)");
  489. goto fail;
  490. }
  491. } else {
  492. TRACE("Checking magic (cli_magic)");
  493. if (magic != NBD_CLIENT_MAGIC) {
  494. LOG("Bad magic received");
  495. goto fail;
  496. }
  497. }
  498. if (read_sync(csock, &s, sizeof(s)) != sizeof(s)) {
  499. LOG("read failed");
  500. goto fail;
  501. }
  502. *size = be64_to_cpu(s);
  503. *blocksize = 1024;
  504. TRACE("Size is %" PRIu64, *size);
  505. if (!name) {
  506. if (read_sync(csock, flags, sizeof(*flags)) != sizeof(*flags)) {
  507. LOG("read failed (flags)");
  508. goto fail;
  509. }
  510. *flags = be32_to_cpup(flags);
  511. } else {
  512. if (read_sync(csock, &tmp, sizeof(tmp)) != sizeof(tmp)) {
  513. LOG("read failed (tmp)");
  514. goto fail;
  515. }
  516. *flags |= be32_to_cpu(tmp);
  517. }
  518. if (read_sync(csock, &buf, 124) != 124) {
  519. LOG("read failed (buf)");
  520. goto fail;
  521. }
  522. rc = 0;
  523. fail:
  524. return rc;
  525. }
  526. #ifdef __linux__
  527. int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
  528. {
  529. TRACE("Setting NBD socket");
  530. if (ioctl(fd, NBD_SET_SOCK, csock) < 0) {
  531. int serrno = errno;
  532. LOG("Failed to set NBD socket");
  533. return -serrno;
  534. }
  535. TRACE("Setting block size to %lu", (unsigned long)blocksize);
  536. if (ioctl(fd, NBD_SET_BLKSIZE, blocksize) < 0) {
  537. int serrno = errno;
  538. LOG("Failed setting NBD block size");
  539. return -serrno;
  540. }
  541. TRACE("Setting size to %zd block(s)", (size_t)(size / blocksize));
  542. if (ioctl(fd, NBD_SET_SIZE_BLOCKS, size / blocksize) < 0) {
  543. int serrno = errno;
  544. LOG("Failed setting size (in blocks)");
  545. return -serrno;
  546. }
  547. if (ioctl(fd, NBD_SET_FLAGS, flags) < 0) {
  548. if (errno == ENOTTY) {
  549. int read_only = (flags & NBD_FLAG_READ_ONLY) != 0;
  550. TRACE("Setting readonly attribute");
  551. if (ioctl(fd, BLKROSET, (unsigned long) &read_only) < 0) {
  552. int serrno = errno;
  553. LOG("Failed setting read-only attribute");
  554. return -serrno;
  555. }
  556. } else {
  557. int serrno = errno;
  558. LOG("Failed setting flags");
  559. return -serrno;
  560. }
  561. }
  562. TRACE("Negotiation ended");
  563. return 0;
  564. }
  565. int nbd_disconnect(int fd)
  566. {
  567. ioctl(fd, NBD_CLEAR_QUE);
  568. ioctl(fd, NBD_DISCONNECT);
  569. ioctl(fd, NBD_CLEAR_SOCK);
  570. return 0;
  571. }
  572. int nbd_client(int fd)
  573. {
  574. int ret;
  575. int serrno;
  576. TRACE("Doing NBD loop");
  577. ret = ioctl(fd, NBD_DO_IT);
  578. if (ret < 0 && errno == EPIPE) {
  579. /* NBD_DO_IT normally returns EPIPE when someone has disconnected
  580. * the socket via NBD_DISCONNECT. We do not want to return 1 in
  581. * that case.
  582. */
  583. ret = 0;
  584. }
  585. serrno = errno;
  586. TRACE("NBD loop returned %d: %s", ret, strerror(serrno));
  587. TRACE("Clearing NBD queue");
  588. ioctl(fd, NBD_CLEAR_QUE);
  589. TRACE("Clearing NBD socket");
  590. ioctl(fd, NBD_CLEAR_SOCK);
  591. errno = serrno;
  592. return ret;
  593. }
  594. #else
  /*
   * Fallbacks for hosts where __linux__ is not defined: the kernel NBD
   * device ioctls are not used there, so each entry point just reports
   * -ENOTSUP.
   */
  int nbd_init(int fd, int csock, uint32_t flags, off_t size, size_t blocksize)
  {
      return -ENOTSUP;
  }
  /* See above: always fails with -ENOTSUP. */
  int nbd_disconnect(int fd)
  {
      return -ENOTSUP;
  }
  /* See above: always fails with -ENOTSUP. */
  int nbd_client(int fd)
  {
      return -ENOTSUP;
  }
  607. #endif
  608. ssize_t nbd_send_request(int csock, struct nbd_request *request)
  609. {
  610. uint8_t buf[NBD_REQUEST_SIZE];
  611. ssize_t ret;
  612. cpu_to_be32w((uint32_t*)buf, NBD_REQUEST_MAGIC);
  613. cpu_to_be32w((uint32_t*)(buf + 4), request->type);
  614. cpu_to_be64w((uint64_t*)(buf + 8), request->handle);
  615. cpu_to_be64w((uint64_t*)(buf + 16), request->from);
  616. cpu_to_be32w((uint32_t*)(buf + 24), request->len);
  617. TRACE("Sending request to client: "
  618. "{ .from = %" PRIu64", .len = %u, .handle = %" PRIu64", .type=%i}",
  619. request->from, request->len, request->handle, request->type);
  620. ret = write_sync(csock, buf, sizeof(buf));
  621. if (ret < 0) {
  622. return ret;
  623. }
  624. if (ret != sizeof(buf)) {
  625. LOG("writing to socket failed");
  626. return -EINVAL;
  627. }
  628. return 0;
  629. }
  630. static ssize_t nbd_receive_request(int csock, struct nbd_request *request)
  631. {
  632. uint8_t buf[NBD_REQUEST_SIZE];
  633. uint32_t magic;
  634. ssize_t ret;
  635. ret = read_sync(csock, buf, sizeof(buf));
  636. if (ret < 0) {
  637. return ret;
  638. }
  639. if (ret != sizeof(buf)) {
  640. LOG("read failed");
  641. return -EINVAL;
  642. }
  643. /* Request
  644. [ 0 .. 3] magic (NBD_REQUEST_MAGIC)
  645. [ 4 .. 7] type (0 == READ, 1 == WRITE)
  646. [ 8 .. 15] handle
  647. [16 .. 23] from
  648. [24 .. 27] len
  649. */
  650. magic = be32_to_cpup((uint32_t*)buf);
  651. request->type = be32_to_cpup((uint32_t*)(buf + 4));
  652. request->handle = be64_to_cpup((uint64_t*)(buf + 8));
  653. request->from = be64_to_cpup((uint64_t*)(buf + 16));
  654. request->len = be32_to_cpup((uint32_t*)(buf + 24));
  655. TRACE("Got request: "
  656. "{ magic = 0x%x, .type = %d, from = %" PRIu64" , len = %u }",
  657. magic, request->type, request->from, request->len);
  658. if (magic != NBD_REQUEST_MAGIC) {
  659. LOG("invalid magic (got 0x%x)", magic);
  660. return -EINVAL;
  661. }
  662. return 0;
  663. }
  664. ssize_t nbd_receive_reply(int csock, struct nbd_reply *reply)
  665. {
  666. uint8_t buf[NBD_REPLY_SIZE];
  667. uint32_t magic;
  668. ssize_t ret;
  669. ret = read_sync(csock, buf, sizeof(buf));
  670. if (ret < 0) {
  671. return ret;
  672. }
  673. if (ret != sizeof(buf)) {
  674. LOG("read failed");
  675. return -EINVAL;
  676. }
  677. /* Reply
  678. [ 0 .. 3] magic (NBD_REPLY_MAGIC)
  679. [ 4 .. 7] error (0 == no error)
  680. [ 7 .. 15] handle
  681. */
  682. magic = be32_to_cpup((uint32_t*)buf);
  683. reply->error = be32_to_cpup((uint32_t*)(buf + 4));
  684. reply->handle = be64_to_cpup((uint64_t*)(buf + 8));
  685. TRACE("Got reply: "
  686. "{ magic = 0x%x, .error = %d, handle = %" PRIu64" }",
  687. magic, reply->error, reply->handle);
  688. if (magic != NBD_REPLY_MAGIC) {
  689. LOG("invalid magic (got 0x%x)", magic);
  690. return -EINVAL;
  691. }
  692. return 0;
  693. }
  694. static ssize_t nbd_send_reply(int csock, struct nbd_reply *reply)
  695. {
  696. uint8_t buf[NBD_REPLY_SIZE];
  697. ssize_t ret;
  698. /* Reply
  699. [ 0 .. 3] magic (NBD_REPLY_MAGIC)
  700. [ 4 .. 7] error (0 == no error)
  701. [ 7 .. 15] handle
  702. */
  703. cpu_to_be32w((uint32_t*)buf, NBD_REPLY_MAGIC);
  704. cpu_to_be32w((uint32_t*)(buf + 4), reply->error);
  705. cpu_to_be64w((uint64_t*)(buf + 8), reply->handle);
  706. TRACE("Sending response to client");
  707. ret = write_sync(csock, buf, sizeof(buf));
  708. if (ret < 0) {
  709. return ret;
  710. }
  711. if (ret != sizeof(buf)) {
  712. LOG("writing to socket failed");
  713. return -EINVAL;
  714. }
  715. return 0;
  716. }
  /* Upper bound on simultaneously allocated NBDRequest objects per client;
   * asserted in nbd_request_get() and tested in nbd_request_put(). */
  #define MAX_NBD_REQUESTS 16
  718. void nbd_client_get(NBDClient *client)
  719. {
  720. client->refcount++;
  721. }
  722. void nbd_client_put(NBDClient *client)
  723. {
  724. if (--client->refcount == 0) {
  725. /* The last reference should be dropped by client->close,
  726. * which is called by nbd_client_close.
  727. */
  728. assert(client->closing);
  729. qemu_set_fd_handler2(client->sock, NULL, NULL, NULL, NULL);
  730. close(client->sock);
  731. client->sock = -1;
  732. if (client->exp) {
  733. QTAILQ_REMOVE(&client->exp->clients, client, next);
  734. nbd_export_put(client->exp);
  735. }
  736. g_free(client);
  737. }
  738. }
  739. void nbd_client_close(NBDClient *client)
  740. {
  741. if (client->closing) {
  742. return;
  743. }
  744. client->closing = true;
  745. /* Force requests to finish. They will drop their own references,
  746. * then we'll close the socket and free the NBDClient.
  747. */
  748. shutdown(client->sock, 2);
  749. /* Also tell the client, so that they release their reference. */
  750. if (client->close) {
  751. client->close(client);
  752. }
  753. }
  754. static NBDRequest *nbd_request_get(NBDClient *client)
  755. {
  756. NBDRequest *req;
  757. assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
  758. client->nb_requests++;
  759. req = g_slice_new0(NBDRequest);
  760. nbd_client_get(client);
  761. req->client = client;
  762. return req;
  763. }
  764. static void nbd_request_put(NBDRequest *req)
  765. {
  766. NBDClient *client = req->client;
  767. if (req->data) {
  768. qemu_vfree(req->data);
  769. }
  770. g_slice_free(NBDRequest, req);
  771. if (client->nb_requests-- == MAX_NBD_REQUESTS) {
  772. qemu_notify_event();
  773. }
  774. nbd_client_put(client);
  775. }
  776. NBDExport *nbd_export_new(BlockDriverState *bs, off_t dev_offset,
  777. off_t size, uint32_t nbdflags,
  778. void (*close)(NBDExport *))
  779. {
  780. NBDExport *exp = g_malloc0(sizeof(NBDExport));
  781. exp->refcount = 1;
  782. QTAILQ_INIT(&exp->clients);
  783. exp->bs = bs;
  784. exp->dev_offset = dev_offset;
  785. exp->nbdflags = nbdflags;
  786. exp->size = size == -1 ? bdrv_getlength(bs) : size;
  787. exp->close = close;
  788. bdrv_ref(bs);
  789. return exp;
  790. }
  791. NBDExport *nbd_export_find(const char *name)
  792. {
  793. NBDExport *exp;
  794. QTAILQ_FOREACH(exp, &exports, next) {
  795. if (strcmp(name, exp->name) == 0) {
  796. return exp;
  797. }
  798. }
  799. return NULL;
  800. }
  801. void nbd_export_set_name(NBDExport *exp, const char *name)
  802. {
  803. if (exp->name == name) {
  804. return;
  805. }
  806. nbd_export_get(exp);
  807. if (exp->name != NULL) {
  808. g_free(exp->name);
  809. exp->name = NULL;
  810. QTAILQ_REMOVE(&exports, exp, next);
  811. nbd_export_put(exp);
  812. }
  813. if (name != NULL) {
  814. nbd_export_get(exp);
  815. exp->name = g_strdup(name);
  816. QTAILQ_INSERT_TAIL(&exports, exp, next);
  817. }
  818. nbd_export_put(exp);
  819. }
  820. void nbd_export_close(NBDExport *exp)
  821. {
  822. NBDClient *client, *next;
  823. nbd_export_get(exp);
  824. QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
  825. nbd_client_close(client);
  826. }
  827. nbd_export_set_name(exp, NULL);
  828. nbd_export_put(exp);
  829. if (exp->bs) {
  830. bdrv_unref(exp->bs);
  831. exp->bs = NULL;
  832. }
  833. }
  834. void nbd_export_get(NBDExport *exp)
  835. {
  836. assert(exp->refcount > 0);
  837. exp->refcount++;
  838. }
  839. void nbd_export_put(NBDExport *exp)
  840. {
  841. assert(exp->refcount > 0);
  842. if (exp->refcount == 1) {
  843. nbd_export_close(exp);
  844. }
  845. if (--exp->refcount == 0) {
  846. assert(exp->name == NULL);
  847. if (exp->close) {
  848. exp->close(exp);
  849. }
  850. g_free(exp);
  851. }
  852. }
  853. BlockDriverState *nbd_export_get_blockdev(NBDExport *exp)
  854. {
  855. return exp->bs;
  856. }
  857. void nbd_export_close_all(void)
  858. {
  859. NBDExport *exp, *next;
  860. QTAILQ_FOREACH_SAFE(exp, &exports, next, next) {
  861. nbd_export_close(exp);
  862. }
  863. }
  864. static int nbd_can_read(void *opaque);
  865. static void nbd_read(void *opaque);
  866. static void nbd_restart_write(void *opaque);
  867. static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
  868. int len)
  869. {
  870. NBDClient *client = req->client;
  871. int csock = client->sock;
  872. ssize_t rc, ret;
  873. qemu_co_mutex_lock(&client->send_lock);
  874. qemu_set_fd_handler2(csock, nbd_can_read, nbd_read,
  875. nbd_restart_write, client);
  876. client->send_coroutine = qemu_coroutine_self();
  877. if (!len) {
  878. rc = nbd_send_reply(csock, reply);
  879. } else {
  880. socket_set_cork(csock, 1);
  881. rc = nbd_send_reply(csock, reply);
  882. if (rc >= 0) {
  883. ret = qemu_co_send(csock, req->data, len);
  884. if (ret != len) {
  885. rc = -EIO;
  886. }
  887. }
  888. socket_set_cork(csock, 0);
  889. }
  890. client->send_coroutine = NULL;
  891. qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
  892. qemu_co_mutex_unlock(&client->send_lock);
  893. return rc;
  894. }
  895. static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
  896. {
  897. NBDClient *client = req->client;
  898. int csock = client->sock;
  899. uint32_t command;
  900. ssize_t rc;
  901. client->recv_coroutine = qemu_coroutine_self();
  902. rc = nbd_receive_request(csock, request);
  903. if (rc < 0) {
  904. if (rc != -EAGAIN) {
  905. rc = -EIO;
  906. }
  907. goto out;
  908. }
  909. if (request->len > NBD_MAX_BUFFER_SIZE) {
  910. LOG("len (%u) is larger than max len (%u)",
  911. request->len, NBD_MAX_BUFFER_SIZE);
  912. rc = -EINVAL;
  913. goto out;
  914. }
  915. if ((request->from + request->len) < request->from) {
  916. LOG("integer overflow detected! "
  917. "you're probably being attacked");
  918. rc = -EINVAL;
  919. goto out;
  920. }
  921. TRACE("Decoding type");
  922. command = request->type & NBD_CMD_MASK_COMMAND;
  923. if (command == NBD_CMD_READ || command == NBD_CMD_WRITE) {
  924. req->data = qemu_blockalign(client->exp->bs, request->len);
  925. }
  926. if (command == NBD_CMD_WRITE) {
  927. TRACE("Reading %u byte(s)", request->len);
  928. if (qemu_co_recv(csock, req->data, request->len) != request->len) {
  929. LOG("reading from socket failed");
  930. rc = -EIO;
  931. goto out;
  932. }
  933. }
  934. rc = 0;
  935. out:
  936. client->recv_coroutine = NULL;
  937. return rc;
  938. }
  939. static void nbd_trip(void *opaque)
  940. {
  941. NBDClient *client = opaque;
  942. NBDExport *exp = client->exp;
  943. NBDRequest *req;
  944. struct nbd_request request;
  945. struct nbd_reply reply;
  946. ssize_t ret;
  947. uint32_t command;
  948. TRACE("Reading request.");
  949. if (client->closing) {
  950. return;
  951. }
  952. req = nbd_request_get(client);
  953. ret = nbd_co_receive_request(req, &request);
  954. if (ret == -EAGAIN) {
  955. goto done;
  956. }
  957. if (ret == -EIO) {
  958. goto out;
  959. }
  960. reply.handle = request.handle;
  961. reply.error = 0;
  962. if (ret < 0) {
  963. reply.error = -ret;
  964. goto error_reply;
  965. }
  966. command = request.type & NBD_CMD_MASK_COMMAND;
  967. if (command != NBD_CMD_DISC && (request.from + request.len) > exp->size) {
  968. LOG("From: %" PRIu64 ", Len: %u, Size: %" PRIu64
  969. ", Offset: %" PRIu64 "\n",
  970. request.from, request.len,
  971. (uint64_t)exp->size, (uint64_t)exp->dev_offset);
  972. LOG("requested operation past EOF--bad client?");
  973. goto invalid_request;
  974. }
  975. switch (command) {
  976. case NBD_CMD_READ:
  977. TRACE("Request type is READ");
  978. if (request.type & NBD_CMD_FLAG_FUA) {
  979. ret = bdrv_co_flush(exp->bs);
  980. if (ret < 0) {
  981. LOG("flush failed");
  982. reply.error = -ret;
  983. goto error_reply;
  984. }
  985. }
  986. ret = bdrv_read(exp->bs, (request.from + exp->dev_offset) / 512,
  987. req->data, request.len / 512);
  988. if (ret < 0) {
  989. LOG("reading from file failed");
  990. reply.error = -ret;
  991. goto error_reply;
  992. }
  993. TRACE("Read %u byte(s)", request.len);
  994. if (nbd_co_send_reply(req, &reply, request.len) < 0)
  995. goto out;
  996. break;
  997. case NBD_CMD_WRITE:
  998. TRACE("Request type is WRITE");
  999. if (exp->nbdflags & NBD_FLAG_READ_ONLY) {
  1000. TRACE("Server is read-only, return error");
  1001. reply.error = EROFS;
  1002. goto error_reply;
  1003. }
  1004. TRACE("Writing to device");
  1005. ret = bdrv_write(exp->bs, (request.from + exp->dev_offset) / 512,
  1006. req->data, request.len / 512);
  1007. if (ret < 0) {
  1008. LOG("writing to file failed");
  1009. reply.error = -ret;
  1010. goto error_reply;
  1011. }
  1012. if (request.type & NBD_CMD_FLAG_FUA) {
  1013. ret = bdrv_co_flush(exp->bs);
  1014. if (ret < 0) {
  1015. LOG("flush failed");
  1016. reply.error = -ret;
  1017. goto error_reply;
  1018. }
  1019. }
  1020. if (nbd_co_send_reply(req, &reply, 0) < 0) {
  1021. goto out;
  1022. }
  1023. break;
  1024. case NBD_CMD_DISC:
  1025. TRACE("Request type is DISCONNECT");
  1026. errno = 0;
  1027. goto out;
  1028. case NBD_CMD_FLUSH:
  1029. TRACE("Request type is FLUSH");
  1030. ret = bdrv_co_flush(exp->bs);
  1031. if (ret < 0) {
  1032. LOG("flush failed");
  1033. reply.error = -ret;
  1034. }
  1035. if (nbd_co_send_reply(req, &reply, 0) < 0) {
  1036. goto out;
  1037. }
  1038. break;
  1039. case NBD_CMD_TRIM:
  1040. TRACE("Request type is TRIM");
  1041. ret = bdrv_co_discard(exp->bs, (request.from + exp->dev_offset) / 512,
  1042. request.len / 512);
  1043. if (ret < 0) {
  1044. LOG("discard failed");
  1045. reply.error = -ret;
  1046. }
  1047. if (nbd_co_send_reply(req, &reply, 0) < 0) {
  1048. goto out;
  1049. }
  1050. break;
  1051. default:
  1052. LOG("invalid request type (%u) received", request.type);
  1053. invalid_request:
  1054. reply.error = -EINVAL;
  1055. error_reply:
  1056. if (nbd_co_send_reply(req, &reply, 0) < 0) {
  1057. goto out;
  1058. }
  1059. break;
  1060. }
  1061. TRACE("Request/Reply complete");
  1062. done:
  1063. nbd_request_put(req);
  1064. return;
  1065. out:
  1066. nbd_request_put(req);
  1067. nbd_client_close(client);
  1068. }
  1069. static int nbd_can_read(void *opaque)
  1070. {
  1071. NBDClient *client = opaque;
  1072. return client->recv_coroutine || client->nb_requests < MAX_NBD_REQUESTS;
  1073. }
  1074. static void nbd_read(void *opaque)
  1075. {
  1076. NBDClient *client = opaque;
  1077. if (client->recv_coroutine) {
  1078. qemu_coroutine_enter(client->recv_coroutine, NULL);
  1079. } else {
  1080. qemu_coroutine_enter(qemu_coroutine_create(nbd_trip), client);
  1081. }
  1082. }
  1083. static void nbd_restart_write(void *opaque)
  1084. {
  1085. NBDClient *client = opaque;
  1086. qemu_coroutine_enter(client->send_coroutine, NULL);
  1087. }
  1088. NBDClient *nbd_client_new(NBDExport *exp, int csock,
  1089. void (*close)(NBDClient *))
  1090. {
  1091. NBDClient *client;
  1092. client = g_malloc0(sizeof(NBDClient));
  1093. client->refcount = 1;
  1094. client->exp = exp;
  1095. client->sock = csock;
  1096. if (nbd_send_negotiate(client)) {
  1097. g_free(client);
  1098. return NULL;
  1099. }
  1100. client->close = close;
  1101. qemu_co_mutex_init(&client->send_lock);
  1102. qemu_set_fd_handler2(csock, nbd_can_read, nbd_read, NULL, client);
  1103. if (exp) {
  1104. QTAILQ_INSERT_TAIL(&exp->clients, client, next);
  1105. nbd_export_get(exp);
  1106. }
  1107. return client;
  1108. }