/*
 * Dirty page rate limit implementation code
 *
 * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
 *
 * Authors:
 *  Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or later.
 * See the COPYING file in the top-level directory.
 */
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "qapi/qapi-commands-migration.h"
#include "qobject/qdict.h"
#include "qapi/error.h"
#include "system/dirtyrate.h"
#include "system/dirtylimit.h"
#include "monitor/hmp.h"
#include "monitor/monitor.h"
#include "exec/memory.h"
#include "exec/target_page.h"
#include "hw/boards.h"
#include "system/kvm.h"
#include "trace.h"
#include "migration/misc.h"

/*
 * Dirtylimit stops working if the dirty page rate error
 * value is less than DIRTYLIMIT_TOLERANCE_RANGE.
 */
#define DIRTYLIMIT_TOLERANCE_RANGE  25  /* MB/s */

/*
 * Adjust the vcpu sleep time linearly if the dirty page rate
 * error percentage is over DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
 * otherwise, adjust the sleep time by a fixed step.
 */
#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT     50

/*
 * Max vcpu sleep time percentage during a cycle
 * composed of dirty ring full and sleep time.
 */
#define DIRTYLIMIT_THROTTLE_PCT_MAX 99

struct {
    VcpuStat stat;
    bool running;
    QemuThread thread;
} *vcpu_dirty_rate_stat;

typedef struct VcpuDirtyLimitState {
    int cpu_index;
    bool enabled;
    /*
     * Quota dirty page rate, unit is MB/s;
     * zero if not enabled.
     */
    uint64_t quota;
} VcpuDirtyLimitState;

struct {
    VcpuDirtyLimitState *states;
    /* Max number of cpus configured by the user */
    int max_cpus;
    /* Number of vcpus under dirtylimit */
    int limited_nvcpu;
} *dirtylimit_state;

/* protects dirtylimit_state */
static QemuMutex dirtylimit_mutex;

/* the dirtylimit thread quits if dirtylimit_quit is true */
static bool dirtylimit_quit;

static void vcpu_dirty_rate_stat_collect(void)
{
    VcpuStat stat;
    int i = 0;
    int64_t period = DIRTYLIMIT_CALC_TIME_MS;

    if (migrate_dirty_limit() && migration_is_running()) {
        period = migrate_vcpu_dirty_limit_period();
    }

    /* calculate vcpu dirtyrate */
    vcpu_calculate_dirtyrate(period,
                             &stat,
                             GLOBAL_DIRTY_LIMIT,
                             false);

    for (i = 0; i < stat.nvcpu; i++) {
        vcpu_dirty_rate_stat->stat.rates[i].id = i;
        vcpu_dirty_rate_stat->stat.rates[i].dirty_rate =
            stat.rates[i].dirty_rate;
    }

    g_free(stat.rates);
}

static void *vcpu_dirty_rate_stat_thread(void *opaque)
{
    rcu_register_thread();

    /* start log sync */
    global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);

    while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
        vcpu_dirty_rate_stat_collect();
        if (dirtylimit_in_service()) {
            dirtylimit_process();
        }
    }

    /* stop log sync */
    global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);

    rcu_unregister_thread();
    return NULL;
}

int64_t vcpu_dirty_rate_get(int cpu_index)
{
    DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates;
    return qatomic_read_i64(&rates[cpu_index].dirty_rate);
}

void vcpu_dirty_rate_stat_start(void)
{
    if (qatomic_read(&vcpu_dirty_rate_stat->running)) {
        return;
    }

    qatomic_set(&vcpu_dirty_rate_stat->running, 1);
    qemu_thread_create(&vcpu_dirty_rate_stat->thread,
                       "dirtyrate-stat",
                       vcpu_dirty_rate_stat_thread,
                       NULL,
                       QEMU_THREAD_JOINABLE);
}

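/*
 * The stat thread may be holding or waiting for the BQL and the
 * dirtylimit lock (via vcpu_dirty_rate_stat_collect() and
 * dirtylimit_process()), so both are dropped before joining it to
 * avoid a deadlock, then re-acquired afterwards.
 */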
void vcpu_dirty_rate_stat_stop(void)
{
    qatomic_set(&vcpu_dirty_rate_stat->running, 0);
    dirtylimit_state_unlock();
    bql_unlock();
    qemu_thread_join(&vcpu_dirty_rate_stat->thread);
    bql_lock();
    dirtylimit_state_lock();
}

void vcpu_dirty_rate_stat_initialize(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int max_cpus = ms->smp.max_cpus;

    vcpu_dirty_rate_stat =
        g_malloc0(sizeof(*vcpu_dirty_rate_stat));

    vcpu_dirty_rate_stat->stat.nvcpu = max_cpus;
    vcpu_dirty_rate_stat->stat.rates =
        g_new0(DirtyRateVcpu, max_cpus);

    vcpu_dirty_rate_stat->running = false;
}

void vcpu_dirty_rate_stat_finalize(void)
{
    g_free(vcpu_dirty_rate_stat->stat.rates);
    vcpu_dirty_rate_stat->stat.rates = NULL;

    g_free(vcpu_dirty_rate_stat);
    vcpu_dirty_rate_stat = NULL;
}

void dirtylimit_state_lock(void)
{
    qemu_mutex_lock(&dirtylimit_mutex);
}

void dirtylimit_state_unlock(void)
{
    qemu_mutex_unlock(&dirtylimit_mutex);
}

static void
__attribute__((__constructor__)) dirtylimit_mutex_init(void)
{
    qemu_mutex_init(&dirtylimit_mutex);
}

static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
{
    return &dirtylimit_state->states[cpu_index];
}

void dirtylimit_state_initialize(void)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int max_cpus = ms->smp.max_cpus;
    int i;

    dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));

    dirtylimit_state->states =
        g_new0(VcpuDirtyLimitState, max_cpus);

    for (i = 0; i < max_cpus; i++) {
        dirtylimit_state->states[i].cpu_index = i;
    }

    dirtylimit_state->max_cpus = max_cpus;
    trace_dirtylimit_state_initialize(max_cpus);
}

void dirtylimit_state_finalize(void)
{
    g_free(dirtylimit_state->states);
    dirtylimit_state->states = NULL;

    g_free(dirtylimit_state);
    dirtylimit_state = NULL;

    trace_dirtylimit_state_finalize();
}

bool dirtylimit_in_service(void)
{
    return !!dirtylimit_state;
}

bool dirtylimit_vcpu_index_valid(int cpu_index)
{
    MachineState *ms = MACHINE(qdev_get_machine());

    return cpu_index >= 0 && cpu_index < ms->smp.max_cpus;
}

static uint64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
{
    static uint64_t max_dirtyrate;
    uint64_t dirty_ring_size_MiB;

    dirty_ring_size_MiB = qemu_target_pages_to_MiB(kvm_dirty_ring_size());

    if (max_dirtyrate < dirtyrate) {
        max_dirtyrate = dirtyrate;
    }

    return dirty_ring_size_MiB * 1000000 / max_dirtyrate;
}

static inline bool dirtylimit_done(uint64_t quota,
                                   uint64_t current)
{
    uint64_t min, max;

    min = MIN(quota, current);
    max = MAX(quota, current);

    return (max - min) <= DIRTYLIMIT_TOLERANCE_RANGE;
}

static inline bool
dirtylimit_need_linear_adjustment(uint64_t quota,
                                  uint64_t current)
{
    uint64_t min, max;

    min = MIN(quota, current);
    max = MAX(quota, current);

    return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
}

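/*
 * Derivation of the linear adjustment step used below: to cut the
 * dirty rate from 'current' to 'quota', the vcpu must sleep for a
 * fraction sleep_pct = (current - quota) / current of each cycle.
 * A cycle is one ring-full interval plus the sleep itself, so solving
 * sleep / (ring_full_time + sleep) = sleep_pct / 100 for the sleep
 * time gives ring_full_time * sleep_pct / (100 - sleep_pct).
 */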
static void dirtylimit_set_throttle(CPUState *cpu,
                                    uint64_t quota,
                                    uint64_t current)
{
    int64_t ring_full_time_us = 0;
    uint64_t sleep_pct = 0;
    uint64_t throttle_us = 0;

    if (current == 0) {
        cpu->throttle_us_per_full = 0;
        return;
    }

    ring_full_time_us = dirtylimit_dirty_ring_full_time(current);

    if (dirtylimit_need_linear_adjustment(quota, current)) {
        if (quota < current) {
            sleep_pct = (current - quota) * 100 / current;
            throttle_us =
                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
            cpu->throttle_us_per_full += throttle_us;
        } else {
            sleep_pct = (quota - current) * 100 / quota;
            throttle_us =
                ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
            cpu->throttle_us_per_full -= throttle_us;
        }

        trace_dirtylimit_throttle_pct(cpu->cpu_index,
                                      sleep_pct,
                                      throttle_us);
    } else {
        if (quota < current) {
            cpu->throttle_us_per_full += ring_full_time_us / 10;
        } else {
            cpu->throttle_us_per_full -= ring_full_time_us / 10;
        }
    }

    /*
     * TODO: with a large kvm_dirty_ring_size (e.g. 65536, or other
     *       scenarios), the current dirty page rate may never reach
     *       the quota; should we stop increasing the sleep time?
     */
    cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
        ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);

    cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
}

static void dirtylimit_adjust_throttle(CPUState *cpu)
{
    uint64_t quota = 0;
    uint64_t current = 0;
    int cpu_index = cpu->cpu_index;

    quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
    current = vcpu_dirty_rate_get(cpu_index);

    if (!dirtylimit_done(quota, current)) {
        dirtylimit_set_throttle(cpu, quota, current);
    }
}

void dirtylimit_process(void)
{
    CPUState *cpu;

    if (!qatomic_read(&dirtylimit_quit)) {
        dirtylimit_state_lock();

        if (!dirtylimit_in_service()) {
            dirtylimit_state_unlock();
            return;
        }

        CPU_FOREACH(cpu) {
            if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
                continue;
            }
            dirtylimit_adjust_throttle(cpu);
        }
        dirtylimit_state_unlock();
    }
}

void dirtylimit_change(bool start)
{
    if (start) {
        qatomic_set(&dirtylimit_quit, 0);
    } else {
        qatomic_set(&dirtylimit_quit, 1);
    }
}

void dirtylimit_set_vcpu(int cpu_index,
                         uint64_t quota,
                         bool enable)
{
    trace_dirtylimit_set_vcpu(cpu_index, quota);

    if (enable) {
        dirtylimit_state->states[cpu_index].quota = quota;
        if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
            dirtylimit_state->limited_nvcpu++;
        }
    } else {
        dirtylimit_state->states[cpu_index].quota = 0;
        if (dirtylimit_state->states[cpu_index].enabled) {
            dirtylimit_state->limited_nvcpu--;
        }
    }

    dirtylimit_state->states[cpu_index].enabled = enable;
}

void dirtylimit_set_all(uint64_t quota,
                        bool enable)
{
    MachineState *ms = MACHINE(qdev_get_machine());
    int max_cpus = ms->smp.max_cpus;
    int i;

    for (i = 0; i < max_cpus; i++) {
        dirtylimit_set_vcpu(i, quota, enable);
    }
}

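/*
 * Called from the vcpu thread on its dirty-ring-full path; sleeps for
 * throttle_us_per_full microseconds.  The state lock is dropped before
 * sleeping so the throttle period does not block other threads that
 * contend for it.
 */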
void dirtylimit_vcpu_execute(CPUState *cpu)
{
    if (cpu->throttle_us_per_full) {
        dirtylimit_state_lock();

        if (dirtylimit_in_service() &&
            dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
            dirtylimit_state_unlock();

            trace_dirtylimit_vcpu_execute(cpu->cpu_index,
                                          cpu->throttle_us_per_full);
            g_usleep(cpu->throttle_us_per_full);
            return;
        }

        dirtylimit_state_unlock();
    }
}

static void dirtylimit_init(void)
{
    dirtylimit_state_initialize();
    dirtylimit_change(true);
    vcpu_dirty_rate_stat_initialize();
    vcpu_dirty_rate_stat_start();
}

static void dirtylimit_cleanup(void)
{
    vcpu_dirty_rate_stat_stop();
    vcpu_dirty_rate_stat_finalize();
    dirtylimit_change(false);
    dirtylimit_state_finalize();
}

/*
 * Setting or cancelling the dirty page rate limit is not allowed
 * while migration is running with the dirty-limit capability enabled.
 */
static bool dirtylimit_is_allowed(void)
{
    if (migration_is_running() &&
        !migration_thread_is_self() &&
        migrate_dirty_limit() &&
        dirtylimit_in_service()) {
        return false;
    }
    return true;
}

void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index,
                                 int64_t cpu_index,
                                 Error **errp)
{
    if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
        return;
    }

    if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
        error_setg(errp, "incorrect cpu index specified");
        return;
    }

    if (!dirtylimit_is_allowed()) {
        error_setg(errp, "can't cancel dirty page rate limit while"
                   " migration is running");
        return;
    }

    if (!dirtylimit_in_service()) {
        return;
    }

    dirtylimit_state_lock();

    if (has_cpu_index) {
        dirtylimit_set_vcpu(cpu_index, 0, false);
    } else {
        dirtylimit_set_all(0, false);
    }

    if (!dirtylimit_state->limited_nvcpu) {
        dirtylimit_cleanup();
    }

    dirtylimit_state_unlock();
}

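/*
 * Example QMP usage (a sketch; see qapi/migration.json for the schema):
 *   -> { "execute": "cancel-vcpu-dirty-limit",
 *        "arguments": { "cpu-index": 1 } }
 *   <- { "return": {} }
 */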
void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
{
    int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
    Error *err = NULL;

    qmp_cancel_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, &err);
    if (err) {
        hmp_handle_error(mon, err);
        return;
    }

    monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
                   "dirty limit for virtual CPU]\n");
}

void qmp_set_vcpu_dirty_limit(bool has_cpu_index,
                              int64_t cpu_index,
                              uint64_t dirty_rate,
                              Error **errp)
{
    if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
        error_setg(errp, "dirty page limit feature requires KVM with"
                   " accelerator property 'dirty-ring-size' set");
        return;
    }

    if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
        error_setg(errp, "incorrect cpu index specified");
        return;
    }

    if (!dirtylimit_is_allowed()) {
        error_setg(errp, "can't set dirty page rate limit while"
                   " migration is running");
        return;
    }

    if (!dirty_rate) {
        qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp);
        return;
    }

    dirtylimit_state_lock();

    if (!dirtylimit_in_service()) {
        dirtylimit_init();
    }

    if (has_cpu_index) {
        dirtylimit_set_vcpu(cpu_index, dirty_rate, true);
    } else {
        dirtylimit_set_all(dirty_rate, true);
    }

    dirtylimit_state_unlock();
}

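/*
 * Example QMP usage (a sketch; see qapi/migration.json for the schema):
 *   -> { "execute": "set-vcpu-dirty-limit",
 *        "arguments": { "cpu-index": 1, "dirty-rate": 200 } }
 *   <- { "return": {} }
 */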
void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
{
    int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate");
    int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
    Error *err = NULL;

    if (dirty_rate < 0) {
        error_setg(&err, "invalid dirty page limit %" PRId64, dirty_rate);
        goto out;
    }

    qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err);

out:
    hmp_handle_error(mon, err);
}

/* Return the max throttle time of each virtual CPU */
uint64_t dirtylimit_throttle_time_per_round(void)
{
    CPUState *cpu;
    int64_t max = 0;

    CPU_FOREACH(cpu) {
        if (cpu->throttle_us_per_full > max) {
            max = cpu->throttle_us_per_full;
        }
    }

    return max;
}

/*
 * Estimate the average dirty ring full time of each virtual CPU.
 * Return 0 if the guest doesn't dirty memory.
 */
uint64_t dirtylimit_ring_full_time(void)
{
    CPUState *cpu;
    uint64_t curr_rate = 0;
    int nvcpus = 0;

    CPU_FOREACH(cpu) {
        if (cpu->running) {
            nvcpus++;
            curr_rate += vcpu_dirty_rate_get(cpu->cpu_index);
        }
    }

    if (!curr_rate || !nvcpus) {
        return 0;
    }

    return dirtylimit_dirty_ring_full_time(curr_rate / nvcpus);
}

static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
{
    DirtyLimitInfo *info = NULL;

    info = g_malloc0(sizeof(*info));
    info->cpu_index = cpu_index;
    info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota;
    info->current_rate = vcpu_dirty_rate_get(cpu_index);

    return info;
}

static struct DirtyLimitInfoList *dirtylimit_query_all(void)
{
    int i, index;
    DirtyLimitInfo *info = NULL;
    DirtyLimitInfoList *head = NULL, **tail = &head;

    dirtylimit_state_lock();

    if (!dirtylimit_in_service()) {
        dirtylimit_state_unlock();
        return NULL;
    }

    for (i = 0; i < dirtylimit_state->max_cpus; i++) {
        index = dirtylimit_state->states[i].cpu_index;
        if (dirtylimit_vcpu_get_state(index)->enabled) {
            info = dirtylimit_query_vcpu(index);
            QAPI_LIST_APPEND(tail, info);
        }
    }

    dirtylimit_state_unlock();

    return head;
}

struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp)
{
    return dirtylimit_query_all();
}

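/*
 * Example QMP usage (a sketch; the rates shown are illustrative):
 *   -> { "execute": "query-vcpu-dirty-limit" }
 *   <- { "return": [
 *          { "cpu-index": 1, "limit-rate": 200, "current-rate": 68 } ] }
 */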
void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
{
    DirtyLimitInfoList *info;
    g_autoptr(DirtyLimitInfoList) head = NULL;
    Error *err = NULL;

    if (!dirtylimit_in_service()) {
        monitor_printf(mon, "Dirty page limit not enabled!\n");
        return;
    }

    head = qmp_query_vcpu_dirty_limit(&err);
    if (err) {
        hmp_handle_error(mon, err);
        return;
    }

    for (info = head; info != NULL; info = info->next) {
        monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s),"
                       " current rate %"PRIi64 " (MB/s)\n",
                       info->value->cpu_index,
                       info->value->limit_rate,
                       info->value->current_rate);
    }
}