/*
 * Copyright Red Hat
 * Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
 *
 * Network Block Device Server Side
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; under version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"

#include "block/block_int.h"
#include "block/export.h"
#include "block/dirty-bitmap.h"
#include "qapi/error.h"
#include "qemu/queue.h"
#include "trace.h"
#include "nbd-internal.h"
#include "qemu/units.h"
#include "qemu/memalign.h"

#define NBD_META_ID_BASE_ALLOCATION 0
#define NBD_META_ID_ALLOCATION_DEPTH 1
/* Dirty bitmaps use 'NBD_META_ID_DIRTY_BITMAP + i', so keep this id last. */
#define NBD_META_ID_DIRTY_BITMAP 2

/*
 * NBD_MAX_BLOCK_STATUS_EXTENTS: 1 MiB of extents data. An empirical
 * constant. If an increase is needed, note that the NBD protocol
 * recommends no larger than 32 MiB, so that the client won't consider
 * the reply as a denial of service attack.
 */
#define NBD_MAX_BLOCK_STATUS_EXTENTS (1 * MiB / 8)
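
/*
 * Map a system errno value onto the closest NBD protocol error code;
 * anything without a defined wire equivalent collapses to NBD_EINVAL.
 */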
static int system_errno_to_nbd_errno(int err)
{
    switch (err) {
    case 0:
        return NBD_SUCCESS;
    case EPERM:
    case EROFS:
        return NBD_EPERM;
    case EIO:
        return NBD_EIO;
    case ENOMEM:
        return NBD_ENOMEM;
#ifdef EDQUOT
    case EDQUOT:
#endif
    case EFBIG:
    case ENOSPC:
        return NBD_ENOSPC;
    case EOVERFLOW:
        return NBD_EOVERFLOW;
    case ENOTSUP:
#if ENOTSUP != EOPNOTSUPP
    case EOPNOTSUPP:
#endif
        return NBD_ENOTSUP;
    case ESHUTDOWN:
        return NBD_ESHUTDOWN;
    case EINVAL:
    default:
        return NBD_EINVAL;
    }
}

/* Definitions for opaque data types */

typedef struct NBDRequestData NBDRequestData;

struct NBDRequestData {
    NBDClient *client;
    uint8_t *data;
    bool complete;
};

struct NBDExport {
    BlockExport common;

    char *name;
    char *description;
    uint64_t size;
    uint16_t nbdflags;
    QTAILQ_HEAD(, NBDClient) clients;
    QTAILQ_ENTRY(NBDExport) next;

    BlockBackend *eject_notifier_blk;
    Notifier eject_notifier;

    bool allocation_depth;
    BdrvDirtyBitmap **export_bitmaps;
    size_t nr_export_bitmaps;
};
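
/* All exports currently offered by this server. */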
static QTAILQ_HEAD(, NBDExport) exports = QTAILQ_HEAD_INITIALIZER(exports);

/*
 * NBDMetaContexts represents a list of meta contexts in use,
 * as selected by NBD_OPT_SET_META_CONTEXT. Also used for
 * NBD_OPT_LIST_META_CONTEXT.
 */
struct NBDMetaContexts {
    const NBDExport *exp; /* associated export */
    size_t count; /* number of negotiated contexts */
    bool base_allocation; /* export base:allocation context (block status) */
    bool allocation_depth; /* export qemu:allocation-depth */
    bool *bitmaps; /*
                    * export qemu:dirty-bitmap:<export bitmap name>,
                    * sized by exp->nr_export_bitmaps
                    */
};

struct NBDClient {
    int refcount; /* atomic */
    void (*close_fn)(NBDClient *client, bool negotiated);
    void *owner;

    QemuMutex lock;

    NBDExport *exp;
    QCryptoTLSCreds *tlscreds;
    char *tlsauthz;
    uint32_t handshake_max_secs;
    QIOChannelSocket *sioc; /* The underlying data channel */
    QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */

    Coroutine *recv_coroutine; /* protected by lock */

    CoMutex send_lock;
    Coroutine *send_coroutine;

    bool read_yielding; /* protected by lock */
    bool quiescing; /* protected by lock */

    QTAILQ_ENTRY(NBDClient) next;
    int nb_requests; /* protected by lock */
    bool closing; /* protected by lock */

    uint32_t check_align; /* If non-zero, check for aligned client requests */

    NBDMode mode;
    NBDMetaContexts contexts; /* Negotiated meta contexts */

    uint32_t opt; /* Current option being negotiated */
    uint32_t optlen; /* remaining length of data in ioc for the option being
                        negotiated now */
};

static void nbd_client_receive_next_request(NBDClient *client);

/* Basic flow for negotiation

   Server         Client
   Negotiate

   or

   Server         Client
   Negotiate #1
                  Option
   Negotiate #2

   ----

   followed by

   Server         Client
                  Request
   Response
                  Request
   Response
   ...
   ...
                  Request (type == 2)

*/
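
/* Fill in an option reply header, converting each field to network
 * (big-endian) byte order. */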
static inline void set_be_option_rep(NBDOptionReply *rep, uint32_t option,
                                     uint32_t type, uint32_t length)
{
    stq_be_p(&rep->magic, NBD_REP_MAGIC);
    stl_be_p(&rep->option, option);
    stl_be_p(&rep->type, type);
    stl_be_p(&rep->length, length);
}

/* Send a reply header, including length, but no payload.
 * Return -errno on error, 0 on success. */
static coroutine_fn int
nbd_negotiate_send_rep_len(NBDClient *client, uint32_t type,
                           uint32_t len, Error **errp)
{
    NBDOptionReply rep;

    trace_nbd_negotiate_send_rep_len(client->opt, nbd_opt_lookup(client->opt),
                                     type, nbd_rep_lookup(type), len);

    assert(len < NBD_MAX_BUFFER_SIZE);
    set_be_option_rep(&rep, client->opt, type, len);
    return nbd_write(client->ioc, &rep, sizeof(rep), errp);
}

/* Send a reply header with default 0 length.
 * Return -errno on error, 0 on success. */
static coroutine_fn int
nbd_negotiate_send_rep(NBDClient *client, uint32_t type, Error **errp)
{
    return nbd_negotiate_send_rep_len(client, type, 0, errp);
}

/* Send an error reply.
 * Return -errno on error, 0 on success. */
static coroutine_fn int G_GNUC_PRINTF(4, 0)
nbd_negotiate_send_rep_verr(NBDClient *client, uint32_t type,
                            Error **errp, const char *fmt, va_list va)
{
    ERRP_GUARD();
    g_autofree char *msg = NULL;
    int ret;
    size_t len;

    msg = g_strdup_vprintf(fmt, va);
    len = strlen(msg);
    assert(len < NBD_MAX_STRING_SIZE);
    trace_nbd_negotiate_send_rep_err(msg);
    ret = nbd_negotiate_send_rep_len(client, type, len, errp);
    if (ret < 0) {
        return ret;
    }

    if (nbd_write(client->ioc, msg, len, errp) < 0) {
        error_prepend(errp, "write failed (error message): ");
        return -EIO;
    }

    return 0;
}

/*
 * Return a malloc'd copy of @name suitable for use in an error reply.
 */
static char *
nbd_sanitize_name(const char *name)
{
    if (strnlen(name, 80) < 80) {
        return g_strdup(name);
    }
    /* XXX Should we also try to sanitize any control characters? */
    return g_strdup_printf("%.80s...", name);
}

/* Send an error reply.
 * Return -errno on error, 0 on success. */
static coroutine_fn int G_GNUC_PRINTF(4, 5)
nbd_negotiate_send_rep_err(NBDClient *client, uint32_t type,
                           Error **errp, const char *fmt, ...)
{
    va_list va;
    int ret;

    va_start(va, fmt);
    ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
    va_end(va);
    return ret;
}

/* Drop remainder of the current option, and send a reply with the
 * given error type and message. Return -errno on read or write
 * failure; or 0 if connection is still live. */
static coroutine_fn int G_GNUC_PRINTF(4, 0)
nbd_opt_vdrop(NBDClient *client, uint32_t type, Error **errp,
              const char *fmt, va_list va)
{
    int ret = nbd_drop(client->ioc, client->optlen, errp);

    client->optlen = 0;
    if (!ret) {
        ret = nbd_negotiate_send_rep_verr(client, type, errp, fmt, va);
    }
    return ret;
}
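
/* Variadic convenience wrapper around nbd_opt_vdrop(); same return
 * semantics. */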
static coroutine_fn int G_GNUC_PRINTF(4, 5)
nbd_opt_drop(NBDClient *client, uint32_t type, Error **errp,
             const char *fmt, ...)
{
    int ret;
    va_list va;

    va_start(va, fmt);
    ret = nbd_opt_vdrop(client, type, errp, fmt, va);
    va_end(va);

    return ret;
}
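
/* Drop the remaining option payload and reply with NBD_REP_ERR_INVALID. */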
static coroutine_fn int G_GNUC_PRINTF(3, 4)
nbd_opt_invalid(NBDClient *client, Error **errp, const char *fmt, ...)
{
    int ret;
    va_list va;

    va_start(va, fmt);
    ret = nbd_opt_vdrop(client, NBD_REP_ERR_INVALID, errp, fmt, va);
    va_end(va);

    return ret;
}

/* Read size bytes from the unparsed payload of the current option.
 * If @check_nul, require that no NUL bytes appear in buffer.
 * Return -errno on I/O error, 0 if option was completely handled by
 * sending a reply about inconsistent lengths, or 1 on success. */
static coroutine_fn int
nbd_opt_read(NBDClient *client, void *buffer, size_t size,
             bool check_nul, Error **errp)
{
    if (size > client->optlen) {
        return nbd_opt_invalid(client, errp,
                               "Inconsistent lengths in option %s",
                               nbd_opt_lookup(client->opt));
    }
    client->optlen -= size;
    if (qio_channel_read_all(client->ioc, buffer, size, errp) < 0) {
        return -EIO;
    }
    if (check_nul && strnlen(buffer, size) != size) {
        return nbd_opt_invalid(client, errp,
                               "Unexpected embedded NUL in option %s",
                               nbd_opt_lookup(client->opt));
    }
    return 1;
}

/* Drop size bytes from the unparsed payload of the current option.
 * Return -errno on I/O error, 0 if option was completely handled by
 * sending a reply about inconsistent lengths, or 1 on success. */
static coroutine_fn int
nbd_opt_skip(NBDClient *client, size_t size, Error **errp)
{
    if (size > client->optlen) {
        return nbd_opt_invalid(client, errp,
                               "Inconsistent lengths in option %s",
                               nbd_opt_lookup(client->opt));
    }
    client->optlen -= size;
    return nbd_drop(client->ioc, size, errp) < 0 ? -EIO : 1;
}

/* nbd_opt_read_name
 *
 * Read a string with the format:
 *   uint32_t len     (<= NBD_MAX_STRING_SIZE)
 *   len bytes string (not 0-terminated)
 *
 * On success, @name will be allocated.
 * If @length is non-null, it will be set to the actual string length.
 *
 * Return -errno on I/O error, 0 if option was completely handled by
 * sending a reply about inconsistent lengths, or 1 on success.
 */
static coroutine_fn int
nbd_opt_read_name(NBDClient *client, char **name, uint32_t *length,
                  Error **errp)
{
    int ret;
    uint32_t len;
    g_autofree char *local_name = NULL;

    *name = NULL;
    ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
    if (ret <= 0) {
        return ret;
    }
    len = cpu_to_be32(len);

    if (len > NBD_MAX_STRING_SIZE) {
        return nbd_opt_invalid(client, errp,
                               "Invalid name length: %" PRIu32, len);
    }

    local_name = g_malloc(len + 1);
    ret = nbd_opt_read(client, local_name, len, true, errp);
    if (ret <= 0) {
        return ret;
    }
    local_name[len] = '\0';

    if (length) {
        *length = len;
    }
    *name = g_steal_pointer(&local_name);

    return 1;
}

/* Send a single NBD_REP_SERVER reply to NBD_OPT_LIST, including payload.
 * Return -errno on error, 0 on success. */
static coroutine_fn int
nbd_negotiate_send_rep_list(NBDClient *client, NBDExport *exp, Error **errp)
{
    ERRP_GUARD();
    size_t name_len, desc_len;
    uint32_t len;
    const char *name = exp->name ? exp->name : "";
    const char *desc = exp->description ? exp->description : "";
    QIOChannel *ioc = client->ioc;
    int ret;

    trace_nbd_negotiate_send_rep_list(name, desc);
    name_len = strlen(name);
    desc_len = strlen(desc);
    assert(name_len <= NBD_MAX_STRING_SIZE && desc_len <= NBD_MAX_STRING_SIZE);
    len = name_len + desc_len + sizeof(len);
    ret = nbd_negotiate_send_rep_len(client, NBD_REP_SERVER, len, errp);
    if (ret < 0) {
        return ret;
    }

    len = cpu_to_be32(name_len);
    if (nbd_write(ioc, &len, sizeof(len), errp) < 0) {
        error_prepend(errp, "write failed (name length): ");
        return -EINVAL;
    }

    if (nbd_write(ioc, name, name_len, errp) < 0) {
        error_prepend(errp, "write failed (name buffer): ");
        return -EINVAL;
    }

    if (nbd_write(ioc, desc, desc_len, errp) < 0) {
        error_prepend(errp, "write failed (description buffer): ");
        return -EINVAL;
    }

    return 0;
}

/* Process the NBD_OPT_LIST command, with a potential series of replies.
 * Return -errno on error, 0 on success. */
static coroutine_fn int
nbd_negotiate_handle_list(NBDClient *client, Error **errp)
{
    NBDExport *exp;
    assert(client->opt == NBD_OPT_LIST);

    /* For each export, send a NBD_REP_SERVER reply. */
    QTAILQ_FOREACH(exp, &exports, next) {
        if (nbd_negotiate_send_rep_list(client, exp, errp)) {
            return -EINVAL;
        }
    }

    /* Finish with a NBD_REP_ACK. */
    return nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
}
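
/*
 * Meta contexts are negotiated per export; if the client now selects a
 * different export, any previously negotiated contexts no longer apply.
 */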
static coroutine_fn void
nbd_check_meta_export(NBDClient *client, NBDExport *exp)
{
    if (exp != client->contexts.exp) {
        client->contexts.count = 0;
    }
}

/* Send a reply to NBD_OPT_EXPORT_NAME.
 * Return -errno on error, 0 on success. */
static coroutine_fn int
nbd_negotiate_handle_export_name(NBDClient *client, bool no_zeroes,
                                 Error **errp)
{
    ERRP_GUARD();
    g_autofree char *name = NULL;
    char buf[NBD_REPLY_EXPORT_NAME_SIZE] = "";
    size_t len;
    int ret;
    uint16_t myflags;

    /* Client sends:
        [20 ..  xx]   export name (length bytes)
       Server replies:
        [ 0 ..   7]   size
        [ 8 ..   9]   export flags
        [10 .. 133]   reserved     (0) [unless no_zeroes]
     */
    trace_nbd_negotiate_handle_export_name();
    if (client->mode >= NBD_MODE_EXTENDED) {
        error_setg(errp, "Extended headers already negotiated");
        return -EINVAL;
    }
    if (client->optlen > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "Bad length received");
        return -EINVAL;
    }
    name = g_malloc(client->optlen + 1);
    if (nbd_read(client->ioc, name, client->optlen, "export name", errp) < 0) {
        return -EIO;
    }
    name[client->optlen] = '\0';
    client->optlen = 0;

    trace_nbd_negotiate_handle_export_name_request(name);

    client->exp = nbd_export_find(name);
    if (!client->exp) {
        error_setg(errp, "export not found");
        return -EINVAL;
    }
    nbd_check_meta_export(client, client->exp);

    myflags = client->exp->nbdflags;
    if (client->mode >= NBD_MODE_STRUCTURED) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    if (client->mode >= NBD_MODE_EXTENDED && client->contexts.count) {
        myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
    }
    trace_nbd_negotiate_new_style_size_flags(client->exp->size, myflags);
    stq_be_p(buf, client->exp->size);
    stw_be_p(buf + 8, myflags);
    len = no_zeroes ? 10 : sizeof(buf);
    ret = nbd_write(client->ioc, buf, len, errp);
    if (ret < 0) {
        error_prepend(errp, "write failed: ");
        return ret;
    }

    QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
    blk_exp_ref(&client->exp->common);

    return 0;
}

/* Send a single NBD_REP_INFO, with a buffer @buf of @length bytes.
 * The buffer does NOT include the info type prefix.
 * Return -errno on error, 0 if ready to send more. */
static coroutine_fn int
nbd_negotiate_send_info(NBDClient *client, uint16_t info, uint32_t length,
                        void *buf, Error **errp)
{
    int rc;

    trace_nbd_negotiate_send_info(info, nbd_info_lookup(info), length);
    rc = nbd_negotiate_send_rep_len(client, NBD_REP_INFO,
                                    sizeof(info) + length, errp);
    if (rc < 0) {
        return rc;
    }
    info = cpu_to_be16(info);
    if (nbd_write(client->ioc, &info, sizeof(info), errp) < 0) {
        return -EIO;
    }
    if (nbd_write(client->ioc, buf, length, errp) < 0) {
        return -EIO;
    }
    return 0;
}

/* nbd_reject_length: Handle any unexpected payload.
 * @fatal requests that we quit talking to the client, even if we are able
 * to successfully send an error reply.
 * Return:
 * -errno  transmission error occurred or @fatal was requested, errp is set
 * 0       error message successfully sent to client, errp is not set
 */
static coroutine_fn int
nbd_reject_length(NBDClient *client, bool fatal, Error **errp)
{
    int ret;

    assert(client->optlen);
    ret = nbd_opt_invalid(client, errp, "option '%s' has unexpected length",
                          nbd_opt_lookup(client->opt));
    if (fatal && !ret) {
        error_setg(errp, "option '%s' has unexpected length",
                   nbd_opt_lookup(client->opt));
        return -EINVAL;
    }
    return ret;
}

/* Handle NBD_OPT_INFO and NBD_OPT_GO.
 * Return -errno on error, 0 if ready for next option, and 1 to move
 * into transmission phase. */
static coroutine_fn int
nbd_negotiate_handle_info(NBDClient *client, Error **errp)
{
    int rc;
    g_autofree char *name = NULL;
    NBDExport *exp;
    uint16_t requests;
    uint16_t request;
    uint32_t namelen = 0;
    bool sendname = false;
    bool blocksize = false;
    uint32_t sizes[3];
    char buf[sizeof(uint64_t) + sizeof(uint16_t)];
    uint32_t check_align = 0;
    uint16_t myflags;

    /* Client sends:
        4 bytes: L, name length (can be 0)
        L bytes: export name
        2 bytes: N, number of requests (can be 0)
        N * 2 bytes: N requests
    */
    rc = nbd_opt_read_name(client, &name, &namelen, errp);
    if (rc <= 0) {
        return rc;
    }
    trace_nbd_negotiate_handle_export_name_request(name);

    rc = nbd_opt_read(client, &requests, sizeof(requests), false, errp);
    if (rc <= 0) {
        return rc;
    }
    requests = be16_to_cpu(requests);
    trace_nbd_negotiate_handle_info_requests(requests);
    while (requests--) {
        rc = nbd_opt_read(client, &request, sizeof(request), false, errp);
        if (rc <= 0) {
            return rc;
        }
        request = be16_to_cpu(request);
        trace_nbd_negotiate_handle_info_request(request,
                                                nbd_info_lookup(request));
        /* We care about NBD_INFO_NAME and NBD_INFO_BLOCK_SIZE;
         * everything else is either a request we don't know or
         * something we send regardless of request */
        switch (request) {
        case NBD_INFO_NAME:
            sendname = true;
            break;
        case NBD_INFO_BLOCK_SIZE:
            blocksize = true;
            break;
        }
    }
    if (client->optlen) {
        return nbd_reject_length(client, false, errp);
    }

    exp = nbd_export_find(name);
    if (!exp) {
        g_autofree char *sane_name = nbd_sanitize_name(name);

        return nbd_negotiate_send_rep_err(client, NBD_REP_ERR_UNKNOWN,
                                          errp, "export '%s' not present",
                                          sane_name);
    }
    if (client->opt == NBD_OPT_GO) {
        nbd_check_meta_export(client, exp);
    }

    /* Don't bother sending NBD_INFO_NAME unless client requested it */
    if (sendname) {
        rc = nbd_negotiate_send_info(client, NBD_INFO_NAME, namelen, name,
                                     errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_DESCRIPTION only if available, regardless of
     * client request */
    if (exp->description) {
        size_t len = strlen(exp->description);

        assert(len <= NBD_MAX_STRING_SIZE);
        rc = nbd_negotiate_send_info(client, NBD_INFO_DESCRIPTION,
                                     len, exp->description, errp);
        if (rc < 0) {
            return rc;
        }
    }

    /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
     * according to whether the client requested it, and according to
     * whether this is OPT_INFO or OPT_GO. */
    /* minimum - 1 for back-compat, or actual if client will obey it. */
    if (client->opt == NBD_OPT_INFO || blocksize) {
        check_align = sizes[0] = blk_get_request_alignment(exp->common.blk);
    } else {
        sizes[0] = 1;
    }
    assert(sizes[0] <= NBD_MAX_BUFFER_SIZE);
    /* preferred - Hard-code to 4096 for now.
     * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
    sizes[1] = MAX(4096, sizes[0]);
    /* maximum - At most 32M, but smaller as appropriate. */
    sizes[2] = MIN(blk_get_max_transfer(exp->common.blk), NBD_MAX_BUFFER_SIZE);
    trace_nbd_negotiate_handle_info_block_size(sizes[0], sizes[1], sizes[2]);
    sizes[0] = cpu_to_be32(sizes[0]);
    sizes[1] = cpu_to_be32(sizes[1]);
    sizes[2] = cpu_to_be32(sizes[2]);
    rc = nbd_negotiate_send_info(client, NBD_INFO_BLOCK_SIZE,
                                 sizeof(sizes), sizes, errp);
    if (rc < 0) {
        return rc;
    }

    /* Send NBD_INFO_EXPORT always */
    myflags = exp->nbdflags;
    if (client->mode >= NBD_MODE_STRUCTURED) {
        myflags |= NBD_FLAG_SEND_DF;
    }
    if (client->mode >= NBD_MODE_EXTENDED &&
        (client->contexts.count || client->opt == NBD_OPT_INFO)) {
        myflags |= NBD_FLAG_BLOCK_STAT_PAYLOAD;
    }
    trace_nbd_negotiate_new_style_size_flags(exp->size, myflags);
    stq_be_p(buf, exp->size);
    stw_be_p(buf + 8, myflags);
    rc = nbd_negotiate_send_info(client, NBD_INFO_EXPORT,
                                 sizeof(buf), buf, errp);
    if (rc < 0) {
        return rc;
    }

    /*
     * If the client is just asking for NBD_OPT_INFO, but forgot to
     * request block sizes in a situation that would impact
     * performance, then return an error. But for NBD_OPT_GO, we
     * tolerate all clients, regardless of alignments.
     */
    if (client->opt == NBD_OPT_INFO && !blocksize &&
        blk_get_request_alignment(exp->common.blk) > 1) {
        return nbd_negotiate_send_rep_err(client,
                                          NBD_REP_ERR_BLOCK_SIZE_REQD,
                                          errp,
                                          "request NBD_INFO_BLOCK_SIZE to "
                                          "use this export");
    }

    /* Final reply */
    rc = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (rc < 0) {
        return rc;
    }

    if (client->opt == NBD_OPT_GO) {
        client->exp = exp;
        client->check_align = check_align;
        QTAILQ_INSERT_TAIL(&client->exp->clients, client, next);
        blk_exp_ref(&client->exp->common);
        rc = 1;
    }
    return rc;
}

/* Callback to learn when QIO TLS upgrade is complete */
struct NBDTLSServerHandshakeData {
    bool complete;
    Error *error;
    Coroutine *co;
};
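
/* QIOTask completion callback: record the handshake result and wake the
 * coroutine waiting in nbd_negotiate_handle_starttls(). */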
static void
nbd_server_tls_handshake(QIOTask *task, void *opaque)
{
    struct NBDTLSServerHandshakeData *data = opaque;

    qio_task_propagate_error(task, &data->error);
    data->complete = true;
    if (!qemu_coroutine_entered(data->co)) {
        aio_co_wake(data->co);
    }
}

/* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
 * new channel for all further (now-encrypted) communication. */
static coroutine_fn QIOChannel *
nbd_negotiate_handle_starttls(NBDClient *client, Error **errp)
{
    QIOChannel *ioc;
    QIOChannelTLS *tioc;
    struct NBDTLSServerHandshakeData data = { 0 };

    assert(client->opt == NBD_OPT_STARTTLS);

    trace_nbd_negotiate_handle_starttls();
    ioc = client->ioc;

    if (nbd_negotiate_send_rep(client, NBD_REP_ACK, errp) < 0) {
        return NULL;
    }

    tioc = qio_channel_tls_new_server(ioc,
                                      client->tlscreds,
                                      client->tlsauthz,
                                      errp);
    if (!tioc) {
        return NULL;
    }

    qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-server-tls");
    trace_nbd_negotiate_handle_starttls_handshake();
    data.co = qemu_coroutine_self();
    qio_channel_tls_handshake(tioc,
                              nbd_server_tls_handshake,
                              &data,
                              NULL,
                              NULL);

    if (!data.complete) {
        qemu_coroutine_yield();
        assert(data.complete);
    }

    if (data.error) {
        object_unref(OBJECT(tioc));
        error_propagate(errp, data.error);
        return NULL;
    }

    return QIO_CHANNEL(tioc);
}

/* nbd_negotiate_send_meta_context
 *
 * Send one chunk of reply to NBD_OPT_{LIST,SET}_META_CONTEXT
 *
 * For NBD_OPT_LIST_META_CONTEXT @context_id is ignored, 0 is used instead.
 */
static coroutine_fn int
nbd_negotiate_send_meta_context(NBDClient *client, const char *context,
                                uint32_t context_id, Error **errp)
{
    NBDOptionReplyMetaContext opt;
    struct iovec iov[] = {
        {.iov_base = &opt, .iov_len = sizeof(opt)},
        {.iov_base = (void *)context, .iov_len = strlen(context)}
    };

    assert(iov[1].iov_len <= NBD_MAX_STRING_SIZE);
    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        context_id = 0;
    }

    trace_nbd_negotiate_meta_query_reply(context, context_id);
    set_be_option_rep(&opt.h, client->opt, NBD_REP_META_CONTEXT,
                      sizeof(opt) - sizeof(opt.h) + iov[1].iov_len);
    stl_be_p(&opt.context_id, context_id);

    return qio_channel_writev_all(client->ioc, iov, 2, errp) < 0 ? -EIO : 0;
}

/*
 * Return true if @query matches @pattern, or if @query is empty when
 * the @client is performing _LIST_.
 */
static coroutine_fn bool
nbd_meta_empty_or_pattern(NBDClient *client, const char *pattern,
                          const char *query)
{
    if (!*query) {
        trace_nbd_negotiate_meta_query_parse("empty");
        return client->opt == NBD_OPT_LIST_META_CONTEXT;
    }
    if (strcmp(query, pattern) == 0) {
        trace_nbd_negotiate_meta_query_parse(pattern);
        return true;
    }
    trace_nbd_negotiate_meta_query_skip("pattern not matched");
    return false;
}

/*
 * Return true and adjust @str in place if it begins with @prefix.
 */
static coroutine_fn bool
nbd_strshift(const char **str, const char *prefix)
{
    size_t len = strlen(prefix);

    if (strncmp(*str, prefix, len) == 0) {
        *str += len;
        return true;
    }
    return false;
}

/* nbd_meta_base_query
 *
 * Handle queries to 'base' namespace. For now, only the base:allocation
 * context is available. Return true if @query has been handled.
 */
static coroutine_fn bool
nbd_meta_base_query(NBDClient *client, NBDMetaContexts *meta,
                    const char *query)
{
    if (!nbd_strshift(&query, "base:")) {
        return false;
    }
    trace_nbd_negotiate_meta_query_parse("base:");

    if (nbd_meta_empty_or_pattern(client, "allocation", query)) {
        meta->base_allocation = true;
    }
    return true;
}

/* nbd_meta_qemu_query
 *
 * Handle queries to 'qemu' namespace. For now, only the qemu:dirty-bitmap:
 * and qemu:allocation-depth contexts are available. Return true if @query
 * has been handled.
 */
static coroutine_fn bool
nbd_meta_qemu_query(NBDClient *client, NBDMetaContexts *meta,
                    const char *query)
{
    size_t i;

    if (!nbd_strshift(&query, "qemu:")) {
        return false;
    }
    trace_nbd_negotiate_meta_query_parse("qemu:");

    if (!*query) {
        if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
            meta->allocation_depth = meta->exp->allocation_depth;
            if (meta->exp->nr_export_bitmaps) {
                memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
            }
        }
        trace_nbd_negotiate_meta_query_parse("empty");
        return true;
    }

    if (strcmp(query, "allocation-depth") == 0) {
        trace_nbd_negotiate_meta_query_parse("allocation-depth");
        meta->allocation_depth = meta->exp->allocation_depth;
        return true;
    }

    if (nbd_strshift(&query, "dirty-bitmap:")) {
        trace_nbd_negotiate_meta_query_parse("dirty-bitmap:");
        if (!*query) {
            if (client->opt == NBD_OPT_LIST_META_CONTEXT &&
                meta->exp->nr_export_bitmaps) {
                memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
            }
            trace_nbd_negotiate_meta_query_parse("empty");
            return true;
        }

        for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
            const char *bm_name;

            bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
            if (strcmp(bm_name, query) == 0) {
                meta->bitmaps[i] = true;
                trace_nbd_negotiate_meta_query_parse(query);
                return true;
            }
        }
        trace_nbd_negotiate_meta_query_skip("no dirty-bitmap match");
        return true;
    }

    trace_nbd_negotiate_meta_query_skip("unknown qemu context");
    return true;
}

/* nbd_negotiate_meta_query
 *
 * Parse namespace name and call corresponding function to parse body of the
 * query.
 *
 * The only supported namespaces are 'base' and 'qemu'.
 *
 * Return -errno on I/O error, 0 if option was completely handled by
 * sending a reply about inconsistent lengths, or 1 on success. */
static coroutine_fn int
nbd_negotiate_meta_query(NBDClient *client,
                         NBDMetaContexts *meta, Error **errp)
{
    int ret;
    g_autofree char *query = NULL;
    uint32_t len;

    ret = nbd_opt_read(client, &len, sizeof(len), false, errp);
    if (ret <= 0) {
        return ret;
    }
    len = cpu_to_be32(len);

    if (len > NBD_MAX_STRING_SIZE) {
        trace_nbd_negotiate_meta_query_skip("length too long");
        return nbd_opt_skip(client, len, errp);
    }

    query = g_malloc(len + 1);
    ret = nbd_opt_read(client, query, len, true, errp);
    if (ret <= 0) {
        return ret;
    }
    query[len] = '\0';

    if (nbd_meta_base_query(client, meta, query)) {
        return 1;
    }
    if (nbd_meta_qemu_query(client, meta, query)) {
        return 1;
    }

    trace_nbd_negotiate_meta_query_skip("unknown namespace");
    return 1;
}

/* nbd_negotiate_meta_queries
 * Handle NBD_OPT_LIST_META_CONTEXT and NBD_OPT_SET_META_CONTEXT
 *
 * Return -errno on I/O error, or 0 if option was completely handled. */
static coroutine_fn int
nbd_negotiate_meta_queries(NBDClient *client, Error **errp)
{
    int ret;
    g_autofree char *export_name = NULL;
    /* Mark unused to work around https://bugs.llvm.org/show_bug.cgi?id=3888 */
    g_autofree G_GNUC_UNUSED bool *bitmaps = NULL;
    NBDMetaContexts local_meta = {0};
    NBDMetaContexts *meta;
    uint32_t nb_queries;
    size_t i;
    size_t count = 0;

    if (client->opt == NBD_OPT_SET_META_CONTEXT &&
        client->mode < NBD_MODE_STRUCTURED) {
        return nbd_opt_invalid(client, errp,
                               "request option '%s' when structured reply "
                               "is not negotiated",
                               nbd_opt_lookup(client->opt));
    }

    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        /* Only change the caller's meta on SET. */
        meta = &local_meta;
    } else {
        meta = &client->contexts;
    }
    g_free(meta->bitmaps);
    memset(meta, 0, sizeof(*meta));

    ret = nbd_opt_read_name(client, &export_name, NULL, errp);
    if (ret <= 0) {
        return ret;
    }

    meta->exp = nbd_export_find(export_name);
    if (meta->exp == NULL) {
        g_autofree char *sane_name = nbd_sanitize_name(export_name);

        return nbd_opt_drop(client, NBD_REP_ERR_UNKNOWN, errp,
                            "export '%s' not present", sane_name);
    }
    meta->bitmaps = g_new0(bool, meta->exp->nr_export_bitmaps);
    if (client->opt == NBD_OPT_LIST_META_CONTEXT) {
        bitmaps = meta->bitmaps;
    }

    ret = nbd_opt_read(client, &nb_queries, sizeof(nb_queries), false, errp);
    if (ret <= 0) {
        return ret;
    }
    nb_queries = cpu_to_be32(nb_queries);
    trace_nbd_negotiate_meta_context(nbd_opt_lookup(client->opt),
                                     export_name, nb_queries);

    if (client->opt == NBD_OPT_LIST_META_CONTEXT && !nb_queries) {
        /* enable all known contexts */
        meta->base_allocation = true;
        meta->allocation_depth = meta->exp->allocation_depth;
        if (meta->exp->nr_export_bitmaps) {
            memset(meta->bitmaps, 1, meta->exp->nr_export_bitmaps);
        }
    } else {
        for (i = 0; i < nb_queries; ++i) {
            ret = nbd_negotiate_meta_query(client, meta, errp);
            if (ret <= 0) {
                return ret;
            }
        }
    }

    if (meta->base_allocation) {
        ret = nbd_negotiate_send_meta_context(client, "base:allocation",
                                              NBD_META_ID_BASE_ALLOCATION,
                                              errp);
        if (ret < 0) {
            return ret;
        }
        count++;
    }

    if (meta->allocation_depth) {
        ret = nbd_negotiate_send_meta_context(client, "qemu:allocation-depth",
                                              NBD_META_ID_ALLOCATION_DEPTH,
                                              errp);
        if (ret < 0) {
            return ret;
        }
        count++;
    }

    for (i = 0; i < meta->exp->nr_export_bitmaps; i++) {
        const char *bm_name;
        g_autofree char *context = NULL;

        if (!meta->bitmaps[i]) {
            continue;
        }

        bm_name = bdrv_dirty_bitmap_name(meta->exp->export_bitmaps[i]);
        context = g_strdup_printf("qemu:dirty-bitmap:%s", bm_name);

        ret = nbd_negotiate_send_meta_context(client, context,
                                              NBD_META_ID_DIRTY_BITMAP + i,
                                              errp);
        if (ret < 0) {
            return ret;
        }
        count++;
    }

    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
    if (ret == 0) {
        meta->count = count;
    }

    return ret;
}

/* nbd_negotiate_options
 * Process all NBD_OPT_* client option commands, during fixed newstyle
 * negotiation.
 * Return:
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT (i.e. on valid disconnect) or never
 *         wrote anything (i.e. port probe); errp is not set
 */
static coroutine_fn int
nbd_negotiate_options(NBDClient *client, Error **errp)
{
    uint32_t flags;
    bool fixedNewstyle = false;
    bool no_zeroes = false;

    /* Client sends:
        [ 0 ..   3]   client flags

       Then we loop until NBD_OPT_EXPORT_NAME or NBD_OPT_GO:
        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   NBD option
        [12 ..  15]   Data length
        ...           Rest of request

        [ 0 ..   7]   NBD_OPTS_MAGIC
        [ 8 ..  11]   Second NBD option
        [12 ..  15]   Data length
        ...           Rest of request
    */

    /*
     * Intentionally ignore errors on this first read - we do not want
     * to be noisy about a mere port probe, but only for clients that
     * start talking the protocol and then quit abruptly.
     */
    if (nbd_read32(client->ioc, &flags, "flags", NULL) < 0) {
        return 1;
    }
    client->mode = NBD_MODE_EXPORT_NAME;
    trace_nbd_negotiate_options_flags(flags);
    if (flags & NBD_FLAG_C_FIXED_NEWSTYLE) {
        fixedNewstyle = true;
        flags &= ~NBD_FLAG_C_FIXED_NEWSTYLE;
        client->mode = NBD_MODE_SIMPLE;
    }
    if (flags & NBD_FLAG_C_NO_ZEROES) {
        no_zeroes = true;
        flags &= ~NBD_FLAG_C_NO_ZEROES;
    }
    if (flags != 0) {
        error_setg(errp, "Unknown client flags 0x%" PRIx32 " received", flags);
        return -EINVAL;
    }

    while (1) {
        int ret;
        uint32_t option, length;
        uint64_t magic;

        if (nbd_read64(client->ioc, &magic, "opts magic", errp) < 0) {
            return -EINVAL;
        }
        trace_nbd_negotiate_options_check_magic(magic);
        if (magic != NBD_OPTS_MAGIC) {
            error_setg(errp, "Bad magic received");
            return -EINVAL;
        }

        if (nbd_read32(client->ioc, &option, "option", errp) < 0) {
            return -EINVAL;
        }
        client->opt = option;

        if (nbd_read32(client->ioc, &length, "option length", errp) < 0) {
            return -EINVAL;
        }
        assert(!client->optlen);
        client->optlen = length;

        if (length > NBD_MAX_BUFFER_SIZE) {
            error_setg(errp, "len (%" PRIu32 ") is larger than max len (%u)",
                       length, NBD_MAX_BUFFER_SIZE);
            return -EINVAL;
        }

        trace_nbd_negotiate_options_check_option(option,
                                                 nbd_opt_lookup(option));
        if (client->tlscreds &&
            client->ioc == (QIOChannel *)client->sioc) {
            QIOChannel *tioc;
            if (!fixedNewstyle) {
                error_setg(errp, "Unsupported option 0x%" PRIx32, option);
                return -EINVAL;
            }
            switch (option) {
            case NBD_OPT_STARTTLS:
                if (length) {
                    /* Unconditionally drop the connection if the client
                     * can't start a TLS negotiation correctly */
                    return nbd_reject_length(client, true, errp);
                }
                tioc = nbd_negotiate_handle_starttls(client, errp);
                if (!tioc) {
                    return -EIO;
                }
                ret = 0;
                object_unref(OBJECT(client->ioc));
                client->ioc = tioc;
                break;

            case NBD_OPT_EXPORT_NAME:
                /* No way to return an error to client, so drop connection */
                error_setg(errp, "Option 0x%x not permitted before TLS",
                           option);
                return -EINVAL;

            default:
                /* Let the client keep trying, unless they asked to
                 * quit. Always try to give an error back to the
                 * client; but when replying to OPT_ABORT, be aware
                 * that the client may hang up before receiving the
                 * error, in which case we are fine ignoring the
                 * resulting EPIPE. */
                ret = nbd_opt_drop(client, NBD_REP_ERR_TLS_REQD,
                                   option == NBD_OPT_ABORT ? NULL : errp,
                                   "Option 0x%" PRIx32
                                   " not permitted before TLS", option);
                if (option == NBD_OPT_ABORT) {
                    return 1;
                }
                break;
            }
        } else if (fixedNewstyle) {
            switch (option) {
            case NBD_OPT_LIST:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else {
                    ret = nbd_negotiate_handle_list(client, errp);
                }
                break;

            case NBD_OPT_ABORT:
                /* NBD spec says we must try to reply before
                 * disconnecting, but that we must also tolerate
                 * guests that don't wait for our reply. */
                nbd_negotiate_send_rep(client, NBD_REP_ACK, NULL);
                return 1;

            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            case NBD_OPT_INFO:
            case NBD_OPT_GO:
                ret = nbd_negotiate_handle_info(client, errp);
                if (ret == 1) {
                    assert(option == NBD_OPT_GO);
                    return 0;
                }
                break;

            case NBD_OPT_STARTTLS:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->tlscreds) {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_INVALID, errp,
                                                     "TLS already enabled");
                } else {
                    ret = nbd_negotiate_send_rep_err(client,
                                                     NBD_REP_ERR_POLICY, errp,
                                                     "TLS not configured");
                }
                break;

            case NBD_OPT_STRUCTURED_REPLY:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->mode >= NBD_MODE_EXTENDED) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_EXT_HEADER_REQD, errp,
                        "extended headers already negotiated");
                } else if (client->mode >= NBD_MODE_STRUCTURED) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_INVALID, errp,
                        "structured reply already negotiated");
                } else {
                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
                    client->mode = NBD_MODE_STRUCTURED;
                }
                break;

            case NBD_OPT_LIST_META_CONTEXT:
            case NBD_OPT_SET_META_CONTEXT:
                ret = nbd_negotiate_meta_queries(client, errp);
                break;

            case NBD_OPT_EXTENDED_HEADERS:
                if (length) {
                    ret = nbd_reject_length(client, false, errp);
                } else if (client->mode >= NBD_MODE_EXTENDED) {
                    ret = nbd_negotiate_send_rep_err(
                        client, NBD_REP_ERR_INVALID, errp,
                        "extended headers already negotiated");
                } else {
                    ret = nbd_negotiate_send_rep(client, NBD_REP_ACK, errp);
                    client->mode = NBD_MODE_EXTENDED;
                }
                break;

            default:
                ret = nbd_opt_drop(client, NBD_REP_ERR_UNSUP, errp,
                                   "Unsupported option %" PRIu32 " (%s)",
                                   option, nbd_opt_lookup(option));
                break;
            }
        } else {
            /*
             * If broken new-style we should drop the connection
             * for anything except NBD_OPT_EXPORT_NAME
             */
            switch (option) {
            case NBD_OPT_EXPORT_NAME:
                return nbd_negotiate_handle_export_name(client, no_zeroes,
                                                        errp);

            default:
                error_setg(errp, "Unsupported option %" PRIu32 " (%s)",
                           option, nbd_opt_lookup(option));
                return -EINVAL;
            }
        }

        if (ret < 0) {
            return ret;
        }
    }
}

/* nbd_negotiate
 * Return:
 * -errno  on error, errp is set
 * 0       on successful negotiation, errp is not set
 * 1       if client sent NBD_OPT_ABORT (i.e. on valid disconnect) or never
 *         wrote anything (i.e. port probe); errp is not set
 */
static coroutine_fn int nbd_negotiate(NBDClient *client, Error **errp)
{
    ERRP_GUARD();
    char buf[NBD_OLDSTYLE_NEGOTIATE_SIZE] = "";
    int ret;

    /* Old style negotiation header, no room for options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_CLIENT_MAGIC)
        [16 ..  23]   size
        [24 ..  27]   export flags (zero-extended)
        [28 .. 151]   reserved     (0)

       New style negotiation header, client can send options
        [ 0 ..   7]   passwd       ("NBDMAGIC")
        [ 8 ..  15]   magic        (NBD_OPTS_MAGIC)
        [16 ..  17]   server flags (0)
        ....options sent, ending in NBD_OPT_EXPORT_NAME or NBD_OPT_GO....
     */

    qio_channel_set_blocking(client->ioc, false, NULL);
    qio_channel_set_follow_coroutine_ctx(client->ioc, true);

    trace_nbd_negotiate_begin();
    memcpy(buf, "NBDMAGIC", 8);

    stq_be_p(buf + 8, NBD_OPTS_MAGIC);
    stw_be_p(buf + 16, NBD_FLAG_FIXED_NEWSTYLE | NBD_FLAG_NO_ZEROES);

    /*
     * Be silent about failure to write our greeting: there is nothing
     * wrong with a client testing if our port is alive.
     */
    if (nbd_write(client->ioc, buf, 18, NULL) < 0) {
        return 1;
    }
    ret = nbd_negotiate_options(client, errp);
    if (ret != 0) {
        if (ret < 0) {
            error_prepend(errp, "option negotiation failed: ");
        }
        return ret;
    }

    assert(!client->optlen);
    trace_nbd_negotiate_success();

    return 0;
}

/* nbd_read_eof
 * Tries to read @size bytes from @ioc. This is a local implementation of
 * qio_channel_readv_all_eof. We have it here because we need it to be
 * interruptible and to know when the coroutine is yielding.
 * Returns 1 on success
 *         0 on eof, when no data was read (errp is not set)
 *         negative errno on failure (errp is set)
 */
static inline int coroutine_fn
nbd_read_eof(NBDClient *client, void *buffer, size_t size, Error **errp)
{
    bool partial = false;

    assert(size);
    while (size > 0) {
        struct iovec iov = { .iov_base = buffer, .iov_len = size };
        ssize_t len;

        len = qio_channel_readv(client->ioc, &iov, 1, errp);
        if (len == QIO_CHANNEL_ERR_BLOCK) {
            WITH_QEMU_LOCK_GUARD(&client->lock) {
                client->read_yielding = true;

                /* Prompt main loop thread to re-run nbd_drained_poll() */
                aio_wait_kick();
            }
            qio_channel_yield(client->ioc, G_IO_IN);
            WITH_QEMU_LOCK_GUARD(&client->lock) {
                client->read_yielding = false;
                if (client->quiescing) {
                    return -EAGAIN;
                }
            }
            continue;
        } else if (len < 0) {
            return -EIO;
        } else if (len == 0) {
            if (partial) {
                error_setg(errp,
                           "Unexpected end-of-file before all bytes were read");
                return -EIO;
            } else {
                return 0;
            }
        }

        partial = true;
        size -= len;
        buffer = (uint8_t *) buffer + len;
    }
    return 1;
}
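
/*
 * Illustrative sketch only (not part of the server): how a caller is
 * expected to consume the 1 / 0 / -errno contract of nbd_read_eof()
 * above. The read_header() name and its fixed-size header are
 * hypothetical, chosen just for this example.
 */
#if 0
static int coroutine_fn read_header(NBDClient *client, uint8_t *hdr,
                                    size_t hdr_len, Error **errp)
{
    int ret = nbd_read_eof(client, hdr, hdr_len, errp);

    if (ret < 0) {          /* I/O error, or -EAGAIN while quiescing */
        return ret;
    }
    if (ret == 0) {         /* clean EOF before any byte: peer went away */
        return -EIO;
    }
    return 0;               /* ret == 1: all hdr_len bytes were read */
}
#endif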

static int coroutine_fn nbd_receive_request(NBDClient *client, NBDRequest *request,
                                            Error **errp)
{
    uint8_t buf[NBD_EXTENDED_REQUEST_SIZE];
    uint32_t magic, expect;
    int ret;
    size_t size = client->mode >= NBD_MODE_EXTENDED ?
        NBD_EXTENDED_REQUEST_SIZE : NBD_REQUEST_SIZE;

    ret = nbd_read_eof(client, buf, size, errp);
    if (ret < 0) {
        return ret;
    }
    if (ret == 0) {
        return -EIO;
    }

    /*
     * Compact request
     *  [ 0 ..  3]   magic   (NBD_REQUEST_MAGIC)
     *  [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, ...)
     *  [ 6 ..  7]   type    (NBD_CMD_READ, ...)
     *  [ 8 .. 15]   cookie
     *  [16 .. 23]   from
     *  [24 .. 27]   len
     * Extended request
     *  [ 0 ..  3]   magic   (NBD_EXTENDED_REQUEST_MAGIC)
     *  [ 4 ..  5]   flags   (NBD_CMD_FLAG_FUA, NBD_CMD_FLAG_PAYLOAD_LEN, ...)
     *  [ 6 ..  7]   type    (NBD_CMD_READ, ...)
     *  [ 8 .. 15]   cookie
     *  [16 .. 23]   from
     *  [24 .. 31]   len
     */

    magic = ldl_be_p(buf);
    request->flags = lduw_be_p(buf + 4);
    request->type = lduw_be_p(buf + 6);
    request->cookie = ldq_be_p(buf + 8);
    request->from = ldq_be_p(buf + 16);
    if (client->mode >= NBD_MODE_EXTENDED) {
        request->len = ldq_be_p(buf + 24);
        expect = NBD_EXTENDED_REQUEST_MAGIC;
    } else {
        request->len = (uint32_t)ldl_be_p(buf + 24); /* widen 32 to 64 bits */
        expect = NBD_REQUEST_MAGIC;
    }

    trace_nbd_receive_request(magic, request->flags, request->type,
                              request->from, request->len);

    if (magic != expect) {
        error_setg(errp, "invalid magic (got 0x%" PRIx32 ", expected 0x%"
                   PRIx32 ")", magic, expect);
        return -EINVAL;
    }
    return 0;
}
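
/*
 * Illustrative sketch only: encoding the compact request layout documented
 * above, as a client would place it on the wire. This is simply the inverse
 * of the decode in nbd_receive_request(); the server never sends requests,
 * so this helper is hypothetical.
 */
#if 0
static void encode_compact_request(uint8_t buf[NBD_REQUEST_SIZE],
                                   const NBDRequest *request)
{
    stl_be_p(buf, NBD_REQUEST_MAGIC);            /* [ 0 ..  3] magic */
    stw_be_p(buf + 4, request->flags);           /* [ 4 ..  5] flags */
    stw_be_p(buf + 6, request->type);            /* [ 6 ..  7] type */
    stq_be_p(buf + 8, request->cookie);          /* [ 8 .. 15] cookie */
    stq_be_p(buf + 16, request->from);           /* [16 .. 23] from */
    stl_be_p(buf + 24, (uint32_t)request->len);  /* [24 .. 27] len */
}
#endif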

#define MAX_NBD_REQUESTS 16

/* Runs in export AioContext and main loop thread */
void nbd_client_get(NBDClient *client)
{
    qatomic_inc(&client->refcount);
}

void nbd_client_put(NBDClient *client)
{
    assert(qemu_in_main_thread());

    if (qatomic_fetch_dec(&client->refcount) == 1) {
        /* The last reference should be dropped by client->close,
         * which is called by client_close.
         */
        assert(client->closing);

        object_unref(OBJECT(client->sioc));
        object_unref(OBJECT(client->ioc));
        if (client->tlscreds) {
            object_unref(OBJECT(client->tlscreds));
        }
        g_free(client->tlsauthz);
        if (client->exp) {
            QTAILQ_REMOVE(&client->exp->clients, client, next);
            blk_exp_unref(&client->exp->common);
        }
        g_free(client->contexts.bitmaps);
        qemu_mutex_destroy(&client->lock);
        g_free(client);
    }
}

/*
 * Tries to release the reference to @client, but only if other references
 * remain. This is an optimization for the common case where we want to avoid
 * the expense of scheduling nbd_client_put() in the main loop thread.
 *
 * Returns true upon success or false if the reference was not released because
 * it is the last reference.
 */
static bool nbd_client_put_nonzero(NBDClient *client)
{
    int old = qatomic_read(&client->refcount);
    int expected;

    do {
        if (old == 1) {
            return false;
        }

        expected = old;
        old = qatomic_cmpxchg(&client->refcount, expected, expected - 1);
    } while (old != expected);

    return true;
}
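
/*
 * Illustrative sketch only: the intended calling pattern for the helper
 * above, mirroring its use in nbd_trip(). The fast path drops a non-final
 * reference from any thread; only the final reference pays for rescheduling
 * onto the main loop, where nbd_client_put() must run.
 */
#if 0
    if (!nbd_client_put_nonzero(client)) {
        /* Last reference: nbd_client_put() asserts qemu_in_main_thread() */
        aio_co_reschedule_self(qemu_get_aio_context());
        nbd_client_put(client);
    }
#endif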

static void client_close(NBDClient *client, bool negotiated)
{
    assert(qemu_in_main_thread());

    WITH_QEMU_LOCK_GUARD(&client->lock) {
        if (client->closing) {
            return;
        }

        client->closing = true;
    }

    /* Force requests to finish. They will drop their own references,
     * then we'll close the socket and free the NBDClient.
     */
    qio_channel_shutdown(client->ioc, QIO_CHANNEL_SHUTDOWN_BOTH,
                         NULL);

    /* Also tell the client, so that they release their reference. */
    if (client->close_fn) {
        client->close_fn(client, negotiated);
    }
}

/* Runs in export AioContext with client->lock held */
static NBDRequestData *nbd_request_get(NBDClient *client)
{
    NBDRequestData *req;

    assert(client->nb_requests <= MAX_NBD_REQUESTS - 1);
    client->nb_requests++;

    req = g_new0(NBDRequestData, 1);
    req->client = client;
    return req;
}

/* Runs in export AioContext with client->lock held */
static void nbd_request_put(NBDRequestData *req)
{
    NBDClient *client = req->client;

    if (req->data) {
        qemu_vfree(req->data);
    }
    g_free(req);

    client->nb_requests--;

    if (client->quiescing && client->nb_requests == 0) {
        aio_wait_kick();
    }

    nbd_client_receive_next_request(client);
}

static void blk_aio_attached(AioContext *ctx, void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    trace_nbd_blk_aio_attached(exp->name, ctx);

    exp->common.ctx = ctx;

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            assert(client->nb_requests == 0);
            assert(client->recv_coroutine == NULL);
            assert(client->send_coroutine == NULL);
        }
    }
}

static void blk_aio_detach(void *opaque)
{
    NBDExport *exp = opaque;

    assert(qemu_in_main_thread());

    trace_nbd_blk_aio_detach(exp->name, exp->common.ctx);

    exp->common.ctx = NULL;
}

static void nbd_drained_begin(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            client->quiescing = true;
        }
    }
}

static void nbd_drained_end(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            client->quiescing = false;
            nbd_client_receive_next_request(client);
        }
    }
}

/* Runs in export AioContext */
static void nbd_wake_read_bh(void *opaque)
{
    NBDClient *client = opaque;
    qio_channel_wake_read(client->ioc);
}

static bool nbd_drained_poll(void *opaque)
{
    NBDExport *exp = opaque;
    NBDClient *client;

    assert(qemu_in_main_thread());

    QTAILQ_FOREACH(client, &exp->clients, next) {
        WITH_QEMU_LOCK_GUARD(&client->lock) {
            if (client->nb_requests != 0) {
                /*
                 * If there's a coroutine waiting for a request on nbd_read_eof()
                 * enter it here so we don't depend on the client to wake it up.
                 *
                 * Schedule a BH in the export AioContext to avoid missing the
                 * wake up due to the race between qio_channel_wake_read() and
                 * qio_channel_yield().
                 */
                if (client->recv_coroutine != NULL && client->read_yielding) {
                    aio_bh_schedule_oneshot(nbd_export_aio_context(client->exp),
                                            nbd_wake_read_bh, client);
                }

                return true;
            }
        }
    }

    return false;
}

static void nbd_eject_notifier(Notifier *n, void *data)
{
    NBDExport *exp = container_of(n, NBDExport, eject_notifier);

    assert(qemu_in_main_thread());

    blk_exp_request_shutdown(&exp->common);
}

void nbd_export_set_on_eject_blk(BlockExport *exp, BlockBackend *blk)
{
    NBDExport *nbd_exp = container_of(exp, NBDExport, common);
    assert(exp->drv == &blk_exp_nbd);
    assert(nbd_exp->eject_notifier_blk == NULL);

    blk_ref(blk);
    nbd_exp->eject_notifier_blk = blk;
    nbd_exp->eject_notifier.notify = nbd_eject_notifier;
    blk_add_remove_bs_notifier(blk, &nbd_exp->eject_notifier);
}

static const BlockDevOps nbd_block_ops = {
    .drained_begin = nbd_drained_begin,
    .drained_end = nbd_drained_end,
    .drained_poll = nbd_drained_poll,
};

static int nbd_export_create(BlockExport *blk_exp, BlockExportOptions *exp_args,
                             Error **errp)
{
    NBDExport *exp = container_of(blk_exp, NBDExport, common);
    BlockExportOptionsNbd *arg = &exp_args->u.nbd;
    const char *name = arg->name ?: exp_args->node_name;
    BlockBackend *blk = blk_exp->blk;
    int64_t size;
    uint64_t perm, shared_perm;
    bool readonly = !exp_args->writable;
    BlockDirtyBitmapOrStrList *bitmaps;
    size_t i;
    int ret;

    GLOBAL_STATE_CODE();
    assert(exp_args->type == BLOCK_EXPORT_TYPE_NBD);

    if (!nbd_server_is_running()) {
        error_setg(errp, "NBD server not running");
        return -EINVAL;
    }

    if (strlen(name) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "export name '%s' too long", name);
        return -EINVAL;
    }

    if (arg->description && strlen(arg->description) > NBD_MAX_STRING_SIZE) {
        error_setg(errp, "description '%s' too long", arg->description);
        return -EINVAL;
    }

    if (nbd_export_find(name)) {
        error_setg(errp, "NBD server already has export named '%s'", name);
        return -EEXIST;
    }

    size = blk_getlength(blk);
    if (size < 0) {
        error_setg_errno(errp, -size,
                         "Failed to determine the NBD export's length");
        return size;
    }

    /* Don't allow resize while the NBD server is running, otherwise we don't
     * care what happens with the node. */
    blk_get_perm(blk, &perm, &shared_perm);
    ret = blk_set_perm(blk, perm, shared_perm & ~BLK_PERM_RESIZE, errp);
    if (ret < 0) {
        return ret;
    }

    QTAILQ_INIT(&exp->clients);
    exp->name = g_strdup(name);
    exp->description = g_strdup(arg->description);
    exp->nbdflags = (NBD_FLAG_HAS_FLAGS | NBD_FLAG_SEND_FLUSH |
                     NBD_FLAG_SEND_FUA | NBD_FLAG_SEND_CACHE);

    if (nbd_server_max_connections() != 1) {
        exp->nbdflags |= NBD_FLAG_CAN_MULTI_CONN;
    }
    if (readonly) {
        exp->nbdflags |= NBD_FLAG_READ_ONLY;
    } else {
        exp->nbdflags |= (NBD_FLAG_SEND_TRIM | NBD_FLAG_SEND_WRITE_ZEROES |
                          NBD_FLAG_SEND_FAST_ZERO);
    }
    exp->size = QEMU_ALIGN_DOWN(size, BDRV_SECTOR_SIZE);

    bdrv_graph_rdlock_main_loop();

    for (bitmaps = arg->bitmaps; bitmaps; bitmaps = bitmaps->next) {
        exp->nr_export_bitmaps++;
    }
    exp->export_bitmaps = g_new0(BdrvDirtyBitmap *, exp->nr_export_bitmaps);
    for (i = 0, bitmaps = arg->bitmaps; bitmaps;
         i++, bitmaps = bitmaps->next)
    {
        const char *bitmap;
        BlockDriverState *bs = blk_bs(blk);
        BdrvDirtyBitmap *bm = NULL;

        switch (bitmaps->value->type) {
        case QTYPE_QSTRING:
            bitmap = bitmaps->value->u.local;
            while (bs) {
                bm = bdrv_find_dirty_bitmap(bs, bitmap);
                if (bm != NULL) {
                    break;
                }

                bs = bdrv_filter_or_cow_bs(bs);
            }

            if (bm == NULL) {
                ret = -ENOENT;
                error_setg(errp, "Bitmap '%s' is not found",
                           bitmaps->value->u.local);
                goto fail;
            }

            if (readonly && bdrv_is_writable(bs) &&
                bdrv_dirty_bitmap_enabled(bm)) {
                ret = -EINVAL;
                error_setg(errp, "Enabled bitmap '%s' incompatible with "
                           "readonly export", bitmap);
                goto fail;
            }
            break;
        case QTYPE_QDICT:
            bitmap = bitmaps->value->u.external.name;
            bm = block_dirty_bitmap_lookup(bitmaps->value->u.external.node,
                                           bitmap, NULL, errp);
            if (!bm) {
                ret = -ENOENT;
                goto fail;
            }
            break;
        default:
            abort();
        }

        assert(bm);

        if (bdrv_dirty_bitmap_check(bm, BDRV_BITMAP_ALLOW_RO, errp)) {
            ret = -EINVAL;
            goto fail;
        }

        exp->export_bitmaps[i] = bm;
        assert(strlen(bitmap) <= BDRV_BITMAP_MAX_NAME_SIZE);
    }

    /* Mark bitmaps busy in a separate loop, to simplify roll-back concerns. */
    for (i = 0; i < exp->nr_export_bitmaps; i++) {
        bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], true);
    }

    exp->allocation_depth = arg->allocation_depth;

    /*
     * We need to inhibit request queuing in the block layer to ensure we can
     * be properly quiesced when entering a drained section, as our coroutines
     * servicing pending requests might enter blk_pread().
     */
    blk_set_disable_request_queuing(blk, true);

    blk_add_aio_context_notifier(blk, blk_aio_attached, blk_aio_detach, exp);

    blk_set_dev_ops(blk, &nbd_block_ops, exp);

    QTAILQ_INSERT_TAIL(&exports, exp, next);

    bdrv_graph_rdunlock_main_loop();

    return 0;

fail:
    bdrv_graph_rdunlock_main_loop();
    g_free(exp->export_bitmaps);
    g_free(exp->name);
    g_free(exp->description);
    return ret;
}

NBDExport *nbd_export_find(const char *name)
{
    NBDExport *exp;
    QTAILQ_FOREACH(exp, &exports, next) {
        if (strcmp(name, exp->name) == 0) {
            return exp;
        }
    }

    return NULL;
}

AioContext *
nbd_export_aio_context(NBDExport *exp)
{
    return exp->common.ctx;
}

static void nbd_export_request_shutdown(BlockExport *blk_exp)
{
    NBDExport *exp = container_of(blk_exp, NBDExport, common);
    NBDClient *client, *next;

    blk_exp_ref(&exp->common);
    /*
     * TODO: Should we expand QMP BlockExportRemoveMode enum to allow a
     * close mode that stops advertising the export to new clients but
     * still permits existing clients to run to completion? Because of
     * that possibility, nbd_export_close() can be called more than
     * once on an export.
     */
    QTAILQ_FOREACH_SAFE(client, &exp->clients, next, next) {
        client_close(client, true);
    }
    if (exp->name) {
        g_free(exp->name);
        exp->name = NULL;
        QTAILQ_REMOVE(&exports, exp, next);
    }
    blk_exp_unref(&exp->common);
}

static void nbd_export_delete(BlockExport *blk_exp)
{
    size_t i;
    NBDExport *exp = container_of(blk_exp, NBDExport, common);

    assert(exp->name == NULL);
    assert(QTAILQ_EMPTY(&exp->clients));

    g_free(exp->description);
    exp->description = NULL;

    if (exp->eject_notifier_blk) {
        notifier_remove(&exp->eject_notifier);
        blk_unref(exp->eject_notifier_blk);
    }
    blk_remove_aio_context_notifier(exp->common.blk, blk_aio_attached,
                                    blk_aio_detach, exp);
    blk_set_disable_request_queuing(exp->common.blk, false);

    for (i = 0; i < exp->nr_export_bitmaps; i++) {
        bdrv_dirty_bitmap_set_busy(exp->export_bitmaps[i], false);
    }
}

const BlockExportDriver blk_exp_nbd = {
    .type               = BLOCK_EXPORT_TYPE_NBD,
    .instance_size      = sizeof(NBDExport),
    .supports_inactive  = true,
    .create             = nbd_export_create,
    .delete             = nbd_export_delete,
    .request_shutdown   = nbd_export_request_shutdown,
};

static int coroutine_fn nbd_co_send_iov(NBDClient *client, struct iovec *iov,
                                        unsigned niov, Error **errp)
{
    int ret;

    g_assert(qemu_in_coroutine());
    qemu_co_mutex_lock(&client->send_lock);
    client->send_coroutine = qemu_coroutine_self();

    ret = qio_channel_writev_all(client->ioc, iov, niov, errp) < 0 ? -EIO : 0;

    client->send_coroutine = NULL;
    qemu_co_mutex_unlock(&client->send_lock);

    return ret;
}

static inline void set_be_simple_reply(NBDSimpleReply *reply, uint64_t error,
                                       uint64_t cookie)
{
    stl_be_p(&reply->magic, NBD_SIMPLE_REPLY_MAGIC);
    stl_be_p(&reply->error, error);
    stq_be_p(&reply->cookie, cookie);
}

static int coroutine_fn nbd_co_send_simple_reply(NBDClient *client,
                                                 NBDRequest *request,
                                                 uint32_t error,
                                                 void *data,
                                                 uint64_t len,
                                                 Error **errp)
{
    NBDSimpleReply reply;
    int nbd_err = system_errno_to_nbd_errno(error);
    struct iovec iov[] = {
        {.iov_base = &reply, .iov_len = sizeof(reply)},
        {.iov_base = data, .iov_len = len}
    };

    assert(!len || !nbd_err);
    assert(len <= NBD_MAX_BUFFER_SIZE);
    assert(client->mode < NBD_MODE_STRUCTURED ||
           (client->mode == NBD_MODE_STRUCTURED &&
            request->type != NBD_CMD_READ));
    trace_nbd_co_send_simple_reply(request->cookie, nbd_err,
                                   nbd_err_lookup(nbd_err), len);
    set_be_simple_reply(&reply, nbd_err, request->cookie);

    return nbd_co_send_iov(client, iov, 2, errp);
}

/*
 * Prepare the header of a reply chunk for network transmission.
 *
 * On input, @iov is partially initialized: iov[0].iov_base must point
 * to an uninitialized NBDReply, while the remaining @niov elements
 * (if any) must be ready for transmission. This function then
 * populates iov[0] for transmission.
 */
static inline void set_be_chunk(NBDClient *client, struct iovec *iov,
                                size_t niov, uint16_t flags, uint16_t type,
                                NBDRequest *request)
{
    size_t i, length = 0;

    for (i = 1; i < niov; i++) {
        length += iov[i].iov_len;
    }

    assert(length <= NBD_MAX_BUFFER_SIZE + sizeof(NBDStructuredReadData));

    if (client->mode >= NBD_MODE_EXTENDED) {
        NBDExtendedReplyChunk *chunk = iov->iov_base;

        iov[0].iov_len = sizeof(*chunk);
        stl_be_p(&chunk->magic, NBD_EXTENDED_REPLY_MAGIC);
        stw_be_p(&chunk->flags, flags);
        stw_be_p(&chunk->type, type);
        stq_be_p(&chunk->cookie, request->cookie);
        stq_be_p(&chunk->offset, request->from);
        stq_be_p(&chunk->length, length);
    } else {
        NBDStructuredReplyChunk *chunk = iov->iov_base;

        iov[0].iov_len = sizeof(*chunk);
        stl_be_p(&chunk->magic, NBD_STRUCTURED_REPLY_MAGIC);
        stw_be_p(&chunk->flags, flags);
        stw_be_p(&chunk->type, type);
        stq_be_p(&chunk->cookie, request->cookie);
        stl_be_p(&chunk->length, length);
    }
}

static int coroutine_fn nbd_co_send_chunk_done(NBDClient *client,
                                               NBDRequest *request,
                                               Error **errp)
{
    NBDReply hdr;
    struct iovec iov[] = {
        {.iov_base = &hdr},
    };

    trace_nbd_co_send_chunk_done(request->cookie);
    set_be_chunk(client, iov, 1, NBD_REPLY_FLAG_DONE,
                 NBD_REPLY_TYPE_NONE, request);
    return nbd_co_send_iov(client, iov, 1, errp);
}

static int coroutine_fn nbd_co_send_chunk_read(NBDClient *client,
                                               NBDRequest *request,
                                               uint64_t offset,
                                               void *data,
                                               uint64_t size,
                                               bool final,
                                               Error **errp)
{
    NBDReply hdr;
    NBDStructuredReadData chunk;
    struct iovec iov[] = {
        {.iov_base = &hdr},
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
        {.iov_base = data, .iov_len = size}
    };

    assert(size && size <= NBD_MAX_BUFFER_SIZE);
    trace_nbd_co_send_chunk_read(request->cookie, offset, data, size);
    set_be_chunk(client, iov, 3, final ? NBD_REPLY_FLAG_DONE : 0,
                 NBD_REPLY_TYPE_OFFSET_DATA, request);
    stq_be_p(&chunk.offset, offset);

    return nbd_co_send_iov(client, iov, 3, errp);
}

static int coroutine_fn nbd_co_send_chunk_error(NBDClient *client,
                                                NBDRequest *request,
                                                uint32_t error,
                                                const char *msg,
                                                Error **errp)
{
    NBDReply hdr;
    NBDStructuredError chunk;
    int nbd_err = system_errno_to_nbd_errno(error);
    struct iovec iov[] = {
        {.iov_base = &hdr},
        {.iov_base = &chunk, .iov_len = sizeof(chunk)},
        {.iov_base = (char *)msg, .iov_len = msg ? strlen(msg) : 0},
    };

    assert(nbd_err);
    trace_nbd_co_send_chunk_error(request->cookie, nbd_err,
                                  nbd_err_lookup(nbd_err), msg ? msg : "");
    set_be_chunk(client, iov, 3, NBD_REPLY_FLAG_DONE,
                 NBD_REPLY_TYPE_ERROR, request);
    stl_be_p(&chunk.error, nbd_err);
    stw_be_p(&chunk.message_length, iov[2].iov_len);

    return nbd_co_send_iov(client, iov, 3, errp);
}

/* Do a sparse read and send the structured reply to the client.
 * Returns -errno if sending fails. blk_co_block_status_above() failure is
 * reported to the client, at which point this function succeeds.
 */
static int coroutine_fn nbd_co_send_sparse_read(NBDClient *client,
                                                NBDRequest *request,
                                                uint64_t offset,
                                                uint8_t *data,
                                                uint64_t size,
                                                Error **errp)
{
    int ret = 0;
    NBDExport *exp = client->exp;
    size_t progress = 0;

    assert(size <= NBD_MAX_BUFFER_SIZE);
    while (progress < size) {
        int64_t pnum;
        int status = blk_co_block_status_above(exp->common.blk, NULL,
                                               offset + progress,
                                               size - progress, &pnum, NULL,
                                               NULL);
        bool final;

        if (status < 0) {
            char *msg = g_strdup_printf("unable to check for holes: %s",
                                        strerror(-status));

            ret = nbd_co_send_chunk_error(client, request, -status, msg, errp);
            g_free(msg);
            return ret;
        }
        assert(pnum && pnum <= size - progress);
        final = progress + pnum == size;
        if (status & BDRV_BLOCK_ZERO) {
            NBDReply hdr;
            NBDStructuredReadHole chunk;
            struct iovec iov[] = {
                {.iov_base = &hdr},
                {.iov_base = &chunk, .iov_len = sizeof(chunk)},
            };

            trace_nbd_co_send_chunk_read_hole(request->cookie,
                                              offset + progress, pnum);
            set_be_chunk(client, iov, 2,
                         final ? NBD_REPLY_FLAG_DONE : 0,
                         NBD_REPLY_TYPE_OFFSET_HOLE, request);
            stq_be_p(&chunk.offset, offset + progress);
            stl_be_p(&chunk.length, pnum);
            ret = nbd_co_send_iov(client, iov, 2, errp);
        } else {
            ret = blk_co_pread(exp->common.blk, offset + progress, pnum,
                               data + progress, 0);
            if (ret < 0) {
                error_setg_errno(errp, -ret, "reading from file failed");
                break;
            }
            ret = nbd_co_send_chunk_read(client, request, offset + progress,
                                         data + progress, pnum, final, errp);
        }

        if (ret < 0) {
            break;
        }

        progress += pnum;
    }

    return ret;
}

typedef struct NBDExtentArray {
    NBDExtent64 *extents;
    unsigned int nb_alloc;
    unsigned int count;
    uint64_t total_length;
    bool extended;
    bool can_add;
    bool converted_to_be;
} NBDExtentArray;

static NBDExtentArray *nbd_extent_array_new(unsigned int nb_alloc,
                                            NBDMode mode)
{
    NBDExtentArray *ea = g_new0(NBDExtentArray, 1);

    assert(mode >= NBD_MODE_STRUCTURED);
    ea->nb_alloc = nb_alloc;
    ea->extents = g_new(NBDExtent64, nb_alloc);
    ea->extended = mode >= NBD_MODE_EXTENDED;
    ea->can_add = true;

    return ea;
}

static void nbd_extent_array_free(NBDExtentArray *ea)
{
    g_free(ea->extents);
    g_free(ea);
}
G_DEFINE_AUTOPTR_CLEANUP_FUNC(NBDExtentArray, nbd_extent_array_free)

/* Further modifications of the array after conversion are abandoned */
static void nbd_extent_array_convert_to_be(NBDExtentArray *ea)
{
    int i;

    assert(!ea->converted_to_be);
    assert(ea->extended);
    ea->can_add = false;
    ea->converted_to_be = true;

    for (i = 0; i < ea->count; i++) {
        ea->extents[i].length = cpu_to_be64(ea->extents[i].length);
        ea->extents[i].flags = cpu_to_be64(ea->extents[i].flags);
    }
}

/* Further modifications of the array after conversion are abandoned */
static NBDExtent32 *nbd_extent_array_convert_to_narrow(NBDExtentArray *ea)
{
    int i;
    NBDExtent32 *extents = g_new(NBDExtent32, ea->count);

    assert(!ea->converted_to_be);
    assert(!ea->extended);
    ea->can_add = false;
    ea->converted_to_be = true;

    for (i = 0; i < ea->count; i++) {
        assert((ea->extents[i].length | ea->extents[i].flags) <= UINT32_MAX);
        extents[i].length = cpu_to_be32(ea->extents[i].length);
        extents[i].flags = cpu_to_be32(ea->extents[i].flags);
    }

    return extents;
}

/*
 * Add an extent to NBDExtentArray. If the extent cannot be added (no
 * available space), return -1.
 * For safety, when returning -1 for the first time, .can_add is set to false,
 * and further calls to nbd_extent_array_add() will crash.
 * (This avoids the situation where a caller ignores the failure to add one
 * extent, and then adds another extent that squashes into the last array
 * entry, which would report an incorrect range to the client.)
 */
static int nbd_extent_array_add(NBDExtentArray *ea,
                                uint64_t length, uint32_t flags)
{
    assert(ea->can_add);

    if (!length) {
        return 0;
    }
    if (!ea->extended) {
        assert(length <= UINT32_MAX);
    }

    /* Extend previous extent if flags are the same */
    if (ea->count > 0 && flags == ea->extents[ea->count - 1].flags) {
        uint64_t sum = length + ea->extents[ea->count - 1].length;

        /*
         * sum cannot overflow: the block layer bounds image size at
         * 2^63, and ea->extents[].length comes from the block layer.
         */
        assert(sum >= length);
        if (sum <= UINT32_MAX || ea->extended) {
            ea->extents[ea->count - 1].length = sum;
            ea->total_length += length;
            return 0;
        }
    }

    if (ea->count >= ea->nb_alloc) {
        ea->can_add = false;
        return -1;
    }

    ea->total_length += length;
    ea->extents[ea->count] = (NBDExtent64) {.length = length, .flags = flags};
    ea->count++;
    return 0;
}
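
/*
 * Illustrative sketch only: adjacent extents with identical flags coalesce
 * into a single array entry, so callers may feed block-status results to
 * nbd_extent_array_add() verbatim. The numbers below are hypothetical.
 */
#if 0
    g_autoptr(NBDExtentArray) ea = nbd_extent_array_new(8, NBD_MODE_EXTENDED);

    nbd_extent_array_add(ea, 65536, 0);
    nbd_extent_array_add(ea, 65536, 0);              /* merged with previous */
    nbd_extent_array_add(ea, 4096, NBD_STATE_ZERO);  /* new entry, new flags */
    /* Result: ea->count == 2, ea->total_length == 135168 */
#endif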

static int coroutine_fn blockstatus_to_extents(BlockBackend *blk,
                                               uint64_t offset, uint64_t bytes,
                                               NBDExtentArray *ea)
{
    while (bytes) {
        uint32_t flags;
        int64_t num;
        int ret = blk_co_block_status_above(blk, NULL, offset, bytes, &num,
                                            NULL, NULL);

        if (ret < 0) {
            return ret;
        }

        flags = (ret & BDRV_BLOCK_DATA ? 0 : NBD_STATE_HOLE) |
                (ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0);

        if (nbd_extent_array_add(ea, num, flags) < 0) {
            return 0;
        }

        offset += num;
        bytes -= num;
    }

    return 0;
}

static int coroutine_fn blockalloc_to_extents(BlockBackend *blk,
                                              uint64_t offset, uint64_t bytes,
                                              NBDExtentArray *ea)
{
    while (bytes) {
        int64_t num;
        int ret = blk_co_is_allocated_above(blk, NULL, false, offset, bytes,
                                            &num);

        if (ret < 0) {
            return ret;
        }

        if (nbd_extent_array_add(ea, num, ret) < 0) {
            return 0;
        }

        offset += num;
        bytes -= num;
    }

    return 0;
}

/*
 * nbd_co_send_extents
 *
 * @ea is converted to BE by the function
 * @last controls whether NBD_REPLY_FLAG_DONE is sent.
 */
static int coroutine_fn
nbd_co_send_extents(NBDClient *client, NBDRequest *request, NBDExtentArray *ea,
                    bool last, uint32_t context_id, Error **errp)
{
    NBDReply hdr;
    NBDStructuredMeta meta;
    NBDExtendedMeta meta_ext;
    g_autofree NBDExtent32 *extents = NULL;
    uint16_t type;
    struct iovec iov[] = { {.iov_base = &hdr}, {0}, {0} };

    if (client->mode >= NBD_MODE_EXTENDED) {
        type = NBD_REPLY_TYPE_BLOCK_STATUS_EXT;

        iov[1].iov_base = &meta_ext;
        iov[1].iov_len = sizeof(meta_ext);
        stl_be_p(&meta_ext.context_id, context_id);
        stl_be_p(&meta_ext.count, ea->count);

        nbd_extent_array_convert_to_be(ea);
        iov[2].iov_base = ea->extents;
        iov[2].iov_len = ea->count * sizeof(ea->extents[0]);
    } else {
        type = NBD_REPLY_TYPE_BLOCK_STATUS;

        iov[1].iov_base = &meta;
        iov[1].iov_len = sizeof(meta);
        stl_be_p(&meta.context_id, context_id);

        extents = nbd_extent_array_convert_to_narrow(ea);
        iov[2].iov_base = extents;
        iov[2].iov_len = ea->count * sizeof(extents[0]);
    }

    trace_nbd_co_send_extents(request->cookie, ea->count, context_id,
                              ea->total_length, last);
    set_be_chunk(client, iov, 3, last ? NBD_REPLY_FLAG_DONE : 0, type,
                 request);

    return nbd_co_send_iov(client, iov, 3, errp);
}

/* Get block status from the exported device and send it to the client */
static int
coroutine_fn nbd_co_send_block_status(NBDClient *client, NBDRequest *request,
                                      BlockBackend *blk, uint64_t offset,
                                      uint64_t length, bool dont_fragment,
                                      bool last, uint32_t context_id,
                                      Error **errp)
{
    int ret;
    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
    g_autoptr(NBDExtentArray) ea =
        nbd_extent_array_new(nb_extents, client->mode);

    if (context_id == NBD_META_ID_BASE_ALLOCATION) {
        ret = blockstatus_to_extents(blk, offset, length, ea);
    } else {
        ret = blockalloc_to_extents(blk, offset, length, ea);
    }
    if (ret < 0) {
        return nbd_co_send_chunk_error(client, request, -ret,
                                       "can't get block status", errp);
    }

    return nbd_co_send_extents(client, request, ea, last, context_id, errp);
}

/* Populate @es from a dirty bitmap. */
static void bitmap_to_extents(BdrvDirtyBitmap *bitmap,
                              uint64_t offset, uint64_t length,
                              NBDExtentArray *es)
{
    int64_t start, dirty_start, dirty_count;
    int64_t end = offset + length;
    bool full = false;
    int64_t bound = es->extended ? INT64_MAX : INT32_MAX;

    bdrv_dirty_bitmap_lock(bitmap);

    for (start = offset;
         bdrv_dirty_bitmap_next_dirty_area(bitmap, start, end, bound,
                                           &dirty_start, &dirty_count);
         start = dirty_start + dirty_count)
    {
        if ((nbd_extent_array_add(es, dirty_start - start, 0) < 0) ||
            (nbd_extent_array_add(es, dirty_count, NBD_STATE_DIRTY) < 0))
        {
            full = true;
            break;
        }
    }

    if (!full) {
        /* last non-dirty extent, nothing to do if the array is now full */
        (void) nbd_extent_array_add(es, end - start, 0);
    }

    bdrv_dirty_bitmap_unlock(bitmap);
}
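
/*
 * Illustrative example only: for a request covering bytes [0, 10M) where
 * only [2M, 4M) is dirty, the loop above emits three extents:
 *   { length = 2M, flags = 0 }                 clean gap before the area
 *   { length = 2M, flags = NBD_STATE_DIRTY }   the dirty area itself
 *   { length = 6M, flags = 0 }                 trailing clean extent
 * (assuming the array has room; otherwise the tail is truncated as above).
 */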

static int coroutine_fn nbd_co_send_bitmap(NBDClient *client,
                                           NBDRequest *request,
                                           BdrvDirtyBitmap *bitmap,
                                           uint64_t offset,
                                           uint64_t length, bool dont_fragment,
                                           bool last, uint32_t context_id,
                                           Error **errp)
{
    unsigned int nb_extents = dont_fragment ? 1 : NBD_MAX_BLOCK_STATUS_EXTENTS;
    g_autoptr(NBDExtentArray) ea =
        nbd_extent_array_new(nb_extents, client->mode);

    bitmap_to_extents(bitmap, offset, length, ea);
    return nbd_co_send_extents(client, request, ea, last, context_id, errp);
}

/*
 * nbd_co_block_status_payload_read
 * Called when a client wants a subset of negotiated contexts via a
 * BLOCK_STATUS payload. Check the payload for valid length and
 * contents. On success, return 0 with request updated to effective
 * length. If request was invalid but all payload consumed, return 0
 * with request->len and request->contexts->count set to 0 (which will
 * trigger an appropriate NBD_EINVAL response later on). Return
 * negative errno if the payload was not fully consumed.
 */
static int
nbd_co_block_status_payload_read(NBDClient *client, NBDRequest *request,
                                 Error **errp)
{
    uint64_t payload_len = request->len;
    g_autofree char *buf = NULL;
    size_t count, i, nr_bitmaps;
    uint32_t id;

    if (payload_len > NBD_MAX_BUFFER_SIZE) {
        error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
                   request->len, NBD_MAX_BUFFER_SIZE);
        return -EINVAL;
    }

    assert(client->contexts.exp == client->exp);
    nr_bitmaps = client->exp->nr_export_bitmaps;
    request->contexts = g_new0(NBDMetaContexts, 1);
    request->contexts->exp = client->exp;

    if (payload_len % sizeof(uint32_t) ||
        payload_len < sizeof(NBDBlockStatusPayload) ||
        payload_len > (sizeof(NBDBlockStatusPayload) +
                       sizeof(id) * client->contexts.count)) {
        goto skip;
    }

    buf = g_malloc(payload_len);
    if (nbd_read(client->ioc, buf, payload_len,
                 "CMD_BLOCK_STATUS data", errp) < 0) {
        return -EIO;
    }
    trace_nbd_co_receive_request_payload_received(request->cookie,
                                                  payload_len);
    request->contexts->bitmaps = g_new0(bool, nr_bitmaps);
    count = (payload_len - sizeof(NBDBlockStatusPayload)) / sizeof(id);
    payload_len = 0;

    for (i = 0; i < count; i++) {
        id = ldl_be_p(buf + sizeof(NBDBlockStatusPayload) + sizeof(id) * i);
        if (id == NBD_META_ID_BASE_ALLOCATION) {
            if (!client->contexts.base_allocation ||
                request->contexts->base_allocation) {
                goto skip;
            }
            request->contexts->base_allocation = true;
        } else if (id == NBD_META_ID_ALLOCATION_DEPTH) {
            if (!client->contexts.allocation_depth ||
                request->contexts->allocation_depth) {
                goto skip;
            }
            request->contexts->allocation_depth = true;
        } else {
            unsigned idx = id - NBD_META_ID_DIRTY_BITMAP;

            if (idx >= nr_bitmaps || !client->contexts.bitmaps[idx] ||
                request->contexts->bitmaps[idx]) {
                goto skip;
            }
            request->contexts->bitmaps[idx] = true;
        }
    }

    request->len = ldq_be_p(buf);
    request->contexts->count = count;
    return 0;

skip:
    trace_nbd_co_receive_block_status_payload_compliance(request->from,
                                                         request->len);
    request->len = request->contexts->count = 0;
    return nbd_drop(client->ioc, payload_len, errp);
}
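
/*
 * Illustrative sketch only: the client-side encoding that the function
 * above consumes. The payload is a 64-bit effect length followed by the
 * requested 32-bit context ids, all big-endian; this hypothetical helper
 * mirrors the ldq_be_p()/ldl_be_p() decode exactly.
 */
#if 0
static void encode_block_status_payload(uint8_t *buf, uint64_t effect_len,
                                        const uint32_t *ids, size_t nr_ids)
{
    size_t i;

    stq_be_p(buf, effect_len);    /* matches ldq_be_p(buf) in the decoder */
    for (i = 0; i < nr_ids; i++) {
        stl_be_p(buf + sizeof(NBDBlockStatusPayload) + sizeof(ids[0]) * i,
                 ids[i]);
    }
}
#endif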

/* nbd_co_receive_request
 * Collect a client request. Return 0 if request looks valid, -EIO to drop
 * connection right away, -EAGAIN to indicate we were interrupted and the
 * channel should be quiesced, and any other negative value to report an error
 * to the client (although the caller may still need to disconnect after
 * reporting the error).
 */
static int coroutine_fn nbd_co_receive_request(NBDRequestData *req,
                                               NBDRequest *request,
                                               Error **errp)
{
    NBDClient *client = req->client;
    bool extended_with_payload;
    bool check_length = false;
    bool check_rofs = false;
    bool allocate_buffer = false;
    bool payload_okay = false;
    uint64_t payload_len = 0;
    int valid_flags = NBD_CMD_FLAG_FUA;
    int ret;

    g_assert(qemu_in_coroutine());
    ret = nbd_receive_request(client, request, errp);
    if (ret < 0) {
        return ret;
    }

    trace_nbd_co_receive_request_decode_type(request->cookie, request->type,
                                             nbd_cmd_lookup(request->type));
    extended_with_payload = client->mode >= NBD_MODE_EXTENDED &&
        request->flags & NBD_CMD_FLAG_PAYLOAD_LEN;
    if (extended_with_payload) {
        payload_len = request->len;
        check_length = true;
    }

    switch (request->type) {
    case NBD_CMD_DISC:
        /* Special case: we're going to disconnect without a reply,
         * whether or not flags, from, or len are bogus */
        req->complete = true;
        return -EIO;

    case NBD_CMD_READ:
        if (client->mode >= NBD_MODE_STRUCTURED) {
            valid_flags |= NBD_CMD_FLAG_DF;
        }
        check_length = true;
        allocate_buffer = true;
        break;

    case NBD_CMD_WRITE:
        if (client->mode >= NBD_MODE_EXTENDED) {
            if (!extended_with_payload) {
                /* The client is noncompliant. Trace it, but proceed. */
                trace_nbd_co_receive_ext_payload_compliance(request->from,
                                                            request->len);
            }
            valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
        }
        payload_okay = true;
        payload_len = request->len;
        check_length = true;
        allocate_buffer = true;
        check_rofs = true;
        break;

    case NBD_CMD_FLUSH:
        break;

    case NBD_CMD_TRIM:
        check_rofs = true;
        break;

    case NBD_CMD_CACHE:
        check_length = true;
        break;

    case NBD_CMD_WRITE_ZEROES:
        valid_flags |= NBD_CMD_FLAG_NO_HOLE | NBD_CMD_FLAG_FAST_ZERO;
        check_rofs = true;
        break;

    case NBD_CMD_BLOCK_STATUS:
        if (extended_with_payload) {
            ret = nbd_co_block_status_payload_read(client, request, errp);
            if (ret < 0) {
                return ret;
            }
            /* payload now consumed */
            check_length = false;
            payload_len = 0;
            valid_flags |= NBD_CMD_FLAG_PAYLOAD_LEN;
        } else {
            request->contexts = &client->contexts;
        }
        valid_flags |= NBD_CMD_FLAG_REQ_ONE;
        break;

    default:
        /* Unrecognized, will fail later */
        ;
    }

    /* Payload and buffer handling. */
    if (!payload_len) {
        req->complete = true;
    }
    if (check_length && request->len > NBD_MAX_BUFFER_SIZE) {
        /* READ, WRITE, CACHE */
        error_setg(errp, "len (%" PRIu64 ") is larger than max len (%u)",
                   request->len, NBD_MAX_BUFFER_SIZE);
        return -EINVAL;
    }
    if (payload_len && !payload_okay) {
        /*
         * For now, we don't support payloads on other commands; but
         * we can keep the connection alive by ignoring the payload.
         * We will fail the command later with NBD_EINVAL for the use
         * of an unsupported flag (and not for access beyond bounds).
         */
        assert(request->type != NBD_CMD_WRITE);
        request->len = 0;
    }
    if (allocate_buffer) {
        /* READ, WRITE */
        req->data = blk_try_blockalign(client->exp->common.blk,
                                       request->len);
        if (req->data == NULL) {
            error_setg(errp, "No memory");
            return -ENOMEM;
        }
    }
    if (payload_len) {
        if (payload_okay) {
            /* WRITE */
            assert(req->data);
            ret = nbd_read(client->ioc, req->data, payload_len,
                           "CMD_WRITE data", errp);
        } else {
            ret = nbd_drop(client->ioc, payload_len, errp);
        }
        if (ret < 0) {
            return -EIO;
        }
        req->complete = true;
        trace_nbd_co_receive_request_payload_received(request->cookie,
                                                      payload_len);
    }

    /* Sanity checks. */
    if (client->exp->nbdflags & NBD_FLAG_READ_ONLY && check_rofs) {
        /* WRITE, TRIM, WRITE_ZEROES */
        error_setg(errp, "Export is read-only");
        return -EROFS;
    }
    if (request->from > client->exp->size ||
        request->len > client->exp->size - request->from) {
        error_setg(errp, "operation past EOF; From: %" PRIu64 ", Len: %" PRIu64
                   ", Size: %" PRIu64, request->from, request->len,
                   client->exp->size);
        return (request->type == NBD_CMD_WRITE ||
                request->type == NBD_CMD_WRITE_ZEROES) ? -ENOSPC : -EINVAL;
    }
    if (client->check_align && !QEMU_IS_ALIGNED(request->from | request->len,
                                                client->check_align)) {
        /*
         * The block layer gracefully handles unaligned requests, but
         * it's still worth tracing client non-compliance
         */
        trace_nbd_co_receive_align_compliance(nbd_cmd_lookup(request->type),
                                              request->from,
                                              request->len,
                                              client->check_align);
    }
    if (request->flags & ~valid_flags) {
        error_setg(errp, "unsupported flags for command %s (got 0x%x)",
                   nbd_cmd_lookup(request->type), request->flags);
        return -EINVAL;
    }

    return 0;
}

/* Send simple reply without a payload, or a structured error
 * @error_msg is ignored if @ret >= 0
 * Returns 0 if connection is still live, -errno on failure to talk to client
 */
static coroutine_fn int nbd_send_generic_reply(NBDClient *client,
                                               NBDRequest *request,
                                               int ret,
                                               const char *error_msg,
                                               Error **errp)
{
    if (client->mode >= NBD_MODE_STRUCTURED && ret < 0) {
        return nbd_co_send_chunk_error(client, request, -ret, error_msg, errp);
    } else if (client->mode >= NBD_MODE_EXTENDED) {
        return nbd_co_send_chunk_done(client, request, errp);
    } else {
        return nbd_co_send_simple_reply(client, request, ret < 0 ? -ret : 0,
                                        NULL, 0, errp);
    }
}

/* Handle NBD_CMD_READ request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply. */
static coroutine_fn int nbd_do_cmd_read(NBDClient *client, NBDRequest *request,
                                        uint8_t *data, Error **errp)
{
    int ret;
    NBDExport *exp = client->exp;

    assert(request->type == NBD_CMD_READ);
    assert(request->len <= NBD_MAX_BUFFER_SIZE);

    /* XXX: NBD Protocol only documents use of FUA with WRITE */
    if (request->flags & NBD_CMD_FLAG_FUA) {
        ret = blk_co_flush(exp->common.blk);
        if (ret < 0) {
            return nbd_send_generic_reply(client, request, ret,
                                          "flush failed", errp);
        }
    }

    if (client->mode >= NBD_MODE_STRUCTURED &&
        !(request->flags & NBD_CMD_FLAG_DF) && request->len)
    {
        return nbd_co_send_sparse_read(client, request, request->from,
                                       data, request->len, errp);
    }

    ret = blk_co_pread(exp->common.blk, request->from, request->len, data, 0);
    if (ret < 0) {
        return nbd_send_generic_reply(client, request, ret,
                                      "reading from file failed", errp);
    }

    if (client->mode >= NBD_MODE_STRUCTURED) {
        if (request->len) {
            return nbd_co_send_chunk_read(client, request, request->from, data,
                                          request->len, true, errp);
        } else {
            return nbd_co_send_chunk_done(client, request, errp);
        }
    } else {
        return nbd_co_send_simple_reply(client, request, 0,
                                        data, request->len, errp);
    }
}

/*
 * nbd_do_cmd_cache
 *
 * Handle NBD_CMD_CACHE request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply.
 */
static coroutine_fn int nbd_do_cmd_cache(NBDClient *client, NBDRequest *request,
                                         Error **errp)
{
    int ret;
    NBDExport *exp = client->exp;

    assert(request->type == NBD_CMD_CACHE);
    assert(request->len <= NBD_MAX_BUFFER_SIZE);

    ret = blk_co_preadv(exp->common.blk, request->from, request->len,
                        NULL, BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);

    return nbd_send_generic_reply(client, request, ret,
                                  "caching data failed", errp);
}

/* Handle NBD request.
 * Return -errno if sending fails. Other errors are reported directly to the
 * client as an error reply. */
static coroutine_fn int nbd_handle_request(NBDClient *client,
                                           NBDRequest *request,
                                           uint8_t *data, Error **errp)
{
    int ret;
    int flags;
    NBDExport *exp = client->exp;
    char *msg;
    size_t i;
    bool inactive;

    WITH_GRAPH_RDLOCK_GUARD() {
        inactive = bdrv_is_inactive(blk_bs(exp->common.blk));
        if (inactive) {
            switch (request->type) {
            case NBD_CMD_READ:
                /* These commands are allowed on inactive nodes */
                break;
            default:
                /* Return an error for the rest */
                return nbd_send_generic_reply(client, request, -EPERM,
                                              "export is inactive", errp);
            }
        }
    }

    switch (request->type) {
    case NBD_CMD_CACHE:
        return nbd_do_cmd_cache(client, request, errp);

    case NBD_CMD_READ:
        return nbd_do_cmd_read(client, request, data, errp);

    case NBD_CMD_WRITE:
        flags = 0;
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        assert(request->len <= NBD_MAX_BUFFER_SIZE);
        ret = blk_co_pwrite(exp->common.blk, request->from, request->len, data,
                            flags);
        return nbd_send_generic_reply(client, request, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_WRITE_ZEROES:
        flags = 0;
        if (request->flags & NBD_CMD_FLAG_FUA) {
            flags |= BDRV_REQ_FUA;
        }
        if (!(request->flags & NBD_CMD_FLAG_NO_HOLE)) {
            flags |= BDRV_REQ_MAY_UNMAP;
        }
        if (request->flags & NBD_CMD_FLAG_FAST_ZERO) {
            flags |= BDRV_REQ_NO_FALLBACK;
        }
        ret = blk_co_pwrite_zeroes(exp->common.blk, request->from, request->len,
                                   flags);
        return nbd_send_generic_reply(client, request, ret,
                                      "writing to file failed", errp);

    case NBD_CMD_DISC:
        /* unreachable, thanks to special case in nbd_co_receive_request() */
        abort();

    case NBD_CMD_FLUSH:
        ret = blk_co_flush(exp->common.blk);
        return nbd_send_generic_reply(client, request, ret,
                                      "flush failed", errp);

    case NBD_CMD_TRIM:
        ret = blk_co_pdiscard(exp->common.blk, request->from, request->len);
        if (ret >= 0 && request->flags & NBD_CMD_FLAG_FUA) {
            ret = blk_co_flush(exp->common.blk);
        }
        return nbd_send_generic_reply(client, request, ret,
                                      "discard failed", errp);

    case NBD_CMD_BLOCK_STATUS:
        assert(request->contexts);
        assert(client->mode >= NBD_MODE_EXTENDED ||
               request->len <= UINT32_MAX);
        if (request->contexts->count) {
            bool dont_fragment = request->flags & NBD_CMD_FLAG_REQ_ONE;
            int contexts_remaining = request->contexts->count;

            if (!request->len) {
                return nbd_send_generic_reply(client, request, -EINVAL,
                                              "need non-zero length", errp);
            }
            if (request->contexts->base_allocation) {
                ret = nbd_co_send_block_status(client, request,
                                               exp->common.blk,
                                               request->from,
                                               request->len, dont_fragment,
                                               !--contexts_remaining,
                                               NBD_META_ID_BASE_ALLOCATION,
                                               errp);
                if (ret < 0) {
                    return ret;
                }
            }

            if (request->contexts->allocation_depth) {
                ret = nbd_co_send_block_status(client, request,
                                               exp->common.blk,
                                               request->from, request->len,
                                               dont_fragment,
                                               !--contexts_remaining,
                                               NBD_META_ID_ALLOCATION_DEPTH,
                                               errp);
                if (ret < 0) {
                    return ret;
                }
            }

            assert(request->contexts->exp == client->exp);
            for (i = 0; i < client->exp->nr_export_bitmaps; i++) {
                if (!request->contexts->bitmaps[i]) {
                    continue;
                }
                ret = nbd_co_send_bitmap(client, request,
                                         client->exp->export_bitmaps[i],
                                         request->from, request->len,
                                         dont_fragment, !--contexts_remaining,
                                         NBD_META_ID_DIRTY_BITMAP + i, errp);
                if (ret < 0) {
                    return ret;
                }
            }

            assert(!contexts_remaining);

            return 0;
        } else if (client->contexts.count) {
            return nbd_send_generic_reply(client, request, -EINVAL,
                                          "CMD_BLOCK_STATUS payload not valid",
                                          errp);
        } else {
            return nbd_send_generic_reply(client, request, -EINVAL,
                                          "CMD_BLOCK_STATUS not negotiated",
                                          errp);
        }

    default:
        msg = g_strdup_printf("invalid request type (%" PRIu32 ") received",
                              request->type);
        ret = nbd_send_generic_reply(client, request, -EINVAL, msg,
                                     errp);
        g_free(msg);
        return ret;
    }
}

/* Owns a reference to the NBDClient passed as opaque. */
static coroutine_fn void nbd_trip(void *opaque)
{
    NBDRequestData *req = opaque;
    NBDClient *client = req->client;
    NBDRequest request = { 0 };    /* GCC thinks it can be used uninitialized */
    int ret;
    Error *local_err = NULL;

    /*
     * Note that nbd_client_put() and client_close() must be called from the
     * main loop thread. Use aio_co_reschedule_self() to switch AioContext
     * before calling these functions.
     */

    trace_nbd_trip();

    qemu_mutex_lock(&client->lock);

    if (client->closing) {
        goto done;
    }

    if (client->quiescing) {
        /*
         * We're switching between AIO contexts. Don't attempt to receive a new
         * request and kick the main context which may be waiting for us.
         */
        client->recv_coroutine = NULL;
        aio_wait_kick();
        goto done;
    }

    /*
     * nbd_co_receive_request() returns -EAGAIN when nbd_drained_begin() has
     * set client->quiescing but by the time we get back nbd_drained_end() may
     * have already cleared client->quiescing. In that case we try again
     * because nothing else will spawn an nbd_trip() coroutine until we set
     * client->recv_coroutine = NULL further down.
     */
    do {
        assert(client->recv_coroutine == qemu_coroutine_self());
        qemu_mutex_unlock(&client->lock);
        ret = nbd_co_receive_request(req, &request, &local_err);
        qemu_mutex_lock(&client->lock);
    } while (ret == -EAGAIN && !client->quiescing);

    client->recv_coroutine = NULL;

    if (client->closing) {
        /*
         * The client may be closed when we are blocked in
         * nbd_co_receive_request()
         */
        goto done;
    }

    if (ret == -EAGAIN) {
        goto done;
    }

    nbd_client_receive_next_request(client);

    if (ret == -EIO) {
        goto disconnect;
    }

    qemu_mutex_unlock(&client->lock);
    qio_channel_set_cork(client->ioc, true);

    if (ret < 0) {
        /* It wasn't -EIO, so, according to nbd_co_receive_request()
         * semantics, we should return the error to the client. */
        Error *export_err = local_err;

        local_err = NULL;
        ret = nbd_send_generic_reply(client, &request, -EINVAL,
                                     error_get_pretty(export_err), &local_err);
        error_free(export_err);
    } else {
        ret = nbd_handle_request(client, &request, req->data, &local_err);
    }
    if (request.contexts && request.contexts != &client->contexts) {
        assert(request.type == NBD_CMD_BLOCK_STATUS);
        g_free(request.contexts->bitmaps);
        g_free(request.contexts);
    }

    qio_channel_set_cork(client->ioc, false);
    qemu_mutex_lock(&client->lock);

    if (ret < 0) {
        error_prepend(&local_err, "Failed to send reply: ");
        goto disconnect;
    }

    /*
     * We must disconnect after NBD_CMD_WRITE or BLOCK_STATUS with
     * payload if we did not read the payload.
     */
    if (!req->complete) {
        error_setg(&local_err, "Request handling failed in intermediate state");
        goto disconnect;
    }

done:
    nbd_request_put(req);
    qemu_mutex_unlock(&client->lock);

    if (!nbd_client_put_nonzero(client)) {
        aio_co_reschedule_self(qemu_get_aio_context());
        nbd_client_put(client);
    }
    return;

disconnect:
    if (local_err) {
        error_reportf_err(local_err, "Disconnect client, due to: ");
    }

    nbd_request_put(req);
    qemu_mutex_unlock(&client->lock);

    aio_co_reschedule_self(qemu_get_aio_context());
    client_close(client, true);
    nbd_client_put(client);
}

/*
 * Runs in export AioContext and main loop thread. Caller must hold
 * client->lock.
 */
static void nbd_client_receive_next_request(NBDClient *client)
{
    NBDRequestData *req;

    if (!client->recv_coroutine && client->nb_requests < MAX_NBD_REQUESTS &&
        !client->quiescing) {
        nbd_client_get(client);
        req = nbd_request_get(client);
        client->recv_coroutine = qemu_coroutine_create(nbd_trip, req);
        aio_co_schedule(client->exp->common.ctx, client->recv_coroutine);
    }
}

static void nbd_handshake_timer_cb(void *opaque)
{
    QIOChannel *ioc = opaque;

    trace_nbd_handshake_timer_cb();
    qio_channel_shutdown(ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
}

static coroutine_fn void nbd_co_client_start(void *opaque)
{
    NBDClient *client = opaque;
    Error *local_err = NULL;
    QEMUTimer *handshake_timer = NULL;

    qemu_co_mutex_init(&client->send_lock);

    /*
     * Create a timer to bound the time spent in negotiation. If the
     * timer expires, it is likely nbd_negotiate will fail because the
     * socket was shutdown.
     */
    if (client->handshake_max_secs > 0) {
        handshake_timer = aio_timer_new(qemu_get_aio_context(),
                                        QEMU_CLOCK_REALTIME,
                                        SCALE_NS,
                                        nbd_handshake_timer_cb,
                                        client->sioc);
        timer_mod(handshake_timer,
                  qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
                  client->handshake_max_secs * NANOSECONDS_PER_SECOND);
    }

    if (nbd_negotiate(client, &local_err)) {
        if (local_err) {
            error_report_err(local_err);
        }
        timer_free(handshake_timer);
        client_close(client, false);
        return;
    }

    timer_free(handshake_timer);
    WITH_QEMU_LOCK_GUARD(&client->lock) {
        nbd_client_receive_next_request(client);
    }
}

/*
 * Create a new client listener using the given channel @sioc and @owner.
 * Begin servicing it in a coroutine. When the connection closes, call
 * @close_fn with an indication of whether the client completed negotiation
 * within @handshake_max_secs seconds (0 for unbounded).
 */
void nbd_client_new(QIOChannelSocket *sioc,
                    uint32_t handshake_max_secs,
                    QCryptoTLSCreds *tlscreds,
                    const char *tlsauthz,
                    void (*close_fn)(NBDClient *, bool),
                    void *owner)
{
    NBDClient *client;
    Coroutine *co;

    client = g_new0(NBDClient, 1);
    qemu_mutex_init(&client->lock);
    client->refcount = 1;
    client->tlscreds = tlscreds;
    if (tlscreds) {
        object_ref(OBJECT(client->tlscreds));
    }
    client->tlsauthz = g_strdup(tlsauthz);
    client->handshake_max_secs = handshake_max_secs;
    client->sioc = sioc;
    qio_channel_set_delay(QIO_CHANNEL(sioc), false);
    object_ref(OBJECT(client->sioc));
    client->ioc = QIO_CHANNEL(sioc);
    object_ref(OBJECT(client->ioc));
    client->close_fn = close_fn;
    client->owner = owner;

    co = qemu_coroutine_create(nbd_co_client_start, client);
    qemu_coroutine_enter(co);
}

void *
nbd_client_owner(NBDClient *client)
{
    return client->owner;
}