 .gitlab-ci.d/custom-runners/ubuntu-20.04-s390x.yml |  12
 .travis.yml                                        |   6
 accel/kvm/kvm-all.c                                |  46
 accel/stubs/kvm-stub.c                             |   5
 cpus-common.c                                      |   8
 hmp-commands-info.hx                               |  13
 hmp-commands.hx                                    |  32
 include/exec/cpu-common.h                          |   1
 include/exec/memory.h                              |   5
 include/hw/core/cpu.h                              |   6
 include/monitor/hmp.h                              |   3
 include/sysemu/dirtylimit.h                        |  37
 include/sysemu/dirtyrate.h                         |  28
 include/sysemu/kvm.h                               |   2
 io/channel-socket.c                                |   8
 migration/channel.c                                |   9
 migration/dirtyrate.c                              | 227
 migration/dirtyrate.h                              |   7
 migration/migration.c                              | 152
 migration/migration.h                              |  44
 migration/multifd-zlib.c                           |  38
 migration/multifd.c                                |   6
 migration/multifd.h                                |  66
 migration/postcopy-ram.c                           | 186
 migration/postcopy-ram.h                           |  11
 migration/qemu-file.c                              |  31
 migration/qemu-file.h                              |   1
 migration/ram.c                                    | 331
 migration/ram.h                                    |   6
 migration/savevm.c                                 |  46
 migration/socket.c                                 |  22
 migration/socket.h                                 |   1
 migration/tls.c                                    |   9
 migration/tls.h                                    |   4
 migration/trace-events                             |  15
 monitor/hmp-cmds.c                                 |   5
 qapi/migration.json                                |  94
 softmmu/dirtylimit.c                               | 601
 softmmu/meson.build                                |   1
 softmmu/trace-events                               |   7
 tests/qtest/migration-helpers.c                    |  22
 tests/qtest/migration-helpers.h                    |   2
 tests/qtest/migration-test.c                       | 543
 tests/qtest/qmp-cmd-test.c                         |   2
 44 files changed, 2400 insertions(+), 301 deletions(-)
diff --git a/.gitlab-ci.d/custom-runners/ubuntu-20.04-s390x.yml b/.gitlab-ci.d/custom-runners/ubuntu-20.04-s390x.yml
index 9f1fe9e7dc..03e74c97db 100644
--- a/.gitlab-ci.d/custom-runners/ubuntu-20.04-s390x.yml
+++ b/.gitlab-ci.d/custom-runners/ubuntu-20.04-s390x.yml
@@ -8,8 +8,6 @@ ubuntu-20.04-s390x-all-linux-static:
tags:
- ubuntu_20.04
- s390x
- variables:
- DFLTCC: 0
rules:
- if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/'
- if: "$S390X_RUNNER_AVAILABLE"
@@ -29,8 +27,6 @@ ubuntu-20.04-s390x-all:
tags:
- ubuntu_20.04
- s390x
- variables:
- DFLTCC: 0
timeout: 75m
rules:
- if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/'
@@ -48,8 +44,6 @@ ubuntu-20.04-s390x-alldbg:
tags:
- ubuntu_20.04
- s390x
- variables:
- DFLTCC: 0
rules:
- if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/'
when: manual
@@ -71,8 +65,6 @@ ubuntu-20.04-s390x-clang:
tags:
- ubuntu_20.04
- s390x
- variables:
- DFLTCC: 0
rules:
- if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/'
when: manual
@@ -93,8 +85,6 @@ ubuntu-20.04-s390x-tci:
tags:
- ubuntu_20.04
- s390x
- variables:
- DFLTCC: 0
rules:
- if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/'
when: manual
@@ -114,8 +104,6 @@ ubuntu-20.04-s390x-notcg:
tags:
- ubuntu_20.04
- s390x
- variables:
- DFLTCC: 0
rules:
- if: '$CI_PROJECT_NAMESPACE == "qemu-project" && $CI_COMMIT_BRANCH =~ /^staging/'
when: manual
diff --git a/.travis.yml b/.travis.yml
index 4fdc9a6785..fb3baabca9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -218,7 +218,6 @@ jobs:
- TEST_CMD="make check check-tcg V=1"
- CONFIG="--disable-containers --target-list=${MAIN_SOFTMMU_TARGETS},s390x-linux-user"
- UNRELIABLE=true
- - DFLTCC=0
script:
- BUILD_RC=0 && make -j${JOBS} || BUILD_RC=$?
- |
@@ -258,7 +257,7 @@ jobs:
env:
- CONFIG="--disable-containers --audio-drv-list=sdl --disable-user
--target-list-exclude=${MAIN_SOFTMMU_TARGETS}"
- - DFLTCC=0
+
- name: "[s390x] GCC (user)"
arch: s390x
dist: focal
@@ -270,7 +269,7 @@ jobs:
- ninja-build
env:
- CONFIG="--disable-containers --disable-system"
- - DFLTCC=0
+
- name: "[s390x] Clang (disable-tcg)"
arch: s390x
dist: focal
@@ -304,4 +303,3 @@ jobs:
- CONFIG="--disable-containers --disable-tcg --enable-kvm
--disable-tools --host-cc=clang --cxx=clang++"
- UNRELIABLE=true
- - DFLTCC=0
diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index ed8b6b896e..3187656570 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -45,6 +45,7 @@
#include "qemu/guest-random.h"
#include "sysemu/hw_accel.h"
#include "kvm-cpus.h"
+#include "sysemu/dirtylimit.h"
#include "hw/boards.h"
#include "monitor/stats.h"
@@ -477,6 +478,7 @@ int kvm_init_vcpu(CPUState *cpu, Error **errp)
cpu->kvm_state = s;
cpu->vcpu_dirty = true;
cpu->dirty_pages = 0;
+ cpu->throttle_us_per_full = 0;
mmap_size = kvm_ioctl(s, KVM_GET_VCPU_MMAP_SIZE, 0);
if (mmap_size < 0) {
@@ -757,17 +759,20 @@ static uint32_t kvm_dirty_ring_reap_one(KVMState *s, CPUState *cpu)
}
/* Must be with slots_lock held */
-static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
+static uint64_t kvm_dirty_ring_reap_locked(KVMState *s, CPUState *cpu)
{
int ret;
- CPUState *cpu;
uint64_t total = 0;
int64_t stamp;
stamp = get_clock();
- CPU_FOREACH(cpu) {
- total += kvm_dirty_ring_reap_one(s, cpu);
+ if (cpu) {
+ total = kvm_dirty_ring_reap_one(s, cpu);
+ } else {
+ CPU_FOREACH(cpu) {
+ total += kvm_dirty_ring_reap_one(s, cpu);
+ }
}
if (total) {
@@ -788,7 +793,7 @@ static uint64_t kvm_dirty_ring_reap_locked(KVMState *s)
* Currently for simplicity, we must hold BQL before calling this. We can
* consider to drop the BQL if we're clear with all the race conditions.
*/
-static uint64_t kvm_dirty_ring_reap(KVMState *s)
+static uint64_t kvm_dirty_ring_reap(KVMState *s, CPUState *cpu)
{
uint64_t total;
@@ -808,7 +813,7 @@ static uint64_t kvm_dirty_ring_reap(KVMState *s)
* reset below.
*/
kvm_slots_lock();
- total = kvm_dirty_ring_reap_locked(s);
+ total = kvm_dirty_ring_reap_locked(s, cpu);
kvm_slots_unlock();
return total;
@@ -855,7 +860,7 @@ static void kvm_dirty_ring_flush(void)
* vcpus out in a synchronous way.
*/
kvm_cpu_synchronize_kick_all();
- kvm_dirty_ring_reap(kvm_state);
+ kvm_dirty_ring_reap(kvm_state, NULL);
trace_kvm_dirty_ring_flush(1);
}
@@ -1399,7 +1404,7 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
* Not easy. Let's cross the fingers until it's fixed.
*/
if (kvm_state->kvm_dirty_ring_size) {
- kvm_dirty_ring_reap_locked(kvm_state);
+ kvm_dirty_ring_reap_locked(kvm_state, NULL);
} else {
kvm_slot_get_dirty_log(kvm_state, mem);
}
@@ -1467,11 +1472,16 @@ static void *kvm_dirty_ring_reaper_thread(void *data)
*/
sleep(1);
+ /* keep sleeping so that the reaper does not interfere with dirtylimit */
+ if (dirtylimit_in_service()) {
+ continue;
+ }
+
trace_kvm_dirty_ring_reaper("wakeup");
r->reaper_state = KVM_DIRTY_RING_REAPER_REAPING;
qemu_mutex_lock_iothread();
- kvm_dirty_ring_reap(s);
+ kvm_dirty_ring_reap(s, NULL);
qemu_mutex_unlock_iothread();
r->reaper_iteration++;
@@ -2315,6 +2325,11 @@ static void query_stats_cb(StatsResultList **result, StatsTarget target,
strList *names, strList *targets, Error **errp);
static void query_stats_schemas_cb(StatsSchemaList **result, Error **errp);
+uint32_t kvm_dirty_ring_size(void)
+{
+ return kvm_state->kvm_dirty_ring_size;
+}
+
static int kvm_init(MachineState *ms)
{
MachineClass *mc = MACHINE_GET_CLASS(ms);
@@ -2967,8 +2982,19 @@ int kvm_cpu_exec(CPUState *cpu)
*/
trace_kvm_dirty_ring_full(cpu->cpu_index);
qemu_mutex_lock_iothread();
- kvm_dirty_ring_reap(kvm_state);
+ /*
+ * We throttle the vCPU by making it sleep once it exits the kernel
+ * due to a full dirty ring. In the dirtylimit scenario, reaping
+ * all vCPUs after a single vCPU's dirty ring gets full would skip
+ * that sleep, so reap only the vCPU whose ring is full.
+ */
+ if (dirtylimit_in_service()) {
+ kvm_dirty_ring_reap(kvm_state, cpu);
+ } else {
+ kvm_dirty_ring_reap(kvm_state, NULL);
+ }
qemu_mutex_unlock_iothread();
+ dirtylimit_vcpu_execute(cpu);
ret = 0;
break;
case KVM_EXIT_SYSTEM_EVENT:
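
The hunk above wires dirty-limit throttling into the dirty-ring-full exit:
with the limit in service, only the ring-full vCPU is reaped, and the vCPU
then sleeps before re-entering the guest. A minimal sketch of what
dirtylimit_vcpu_execute() plausibly does (softmmu/dirtylimit.c is listed in
the diffstat but not shown here, so this body is an assumption):

    void dirtylimit_vcpu_execute(CPUState *cpu)
    {
        /* Assumed logic: sleep only while a limit is actively enforced */
        if (dirtylimit_in_service() &&
            dirtylimit_vcpu_index_valid(cpu->cpu_index) &&
            cpu->throttle_us_per_full) {
            usleep(cpu->throttle_us_per_full);
        }
    }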
diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c
index 3345882d85..2ac5f9c036 100644
--- a/accel/stubs/kvm-stub.c
+++ b/accel/stubs/kvm-stub.c
@@ -148,3 +148,8 @@ bool kvm_dirty_ring_enabled(void)
{
return false;
}
+
+uint32_t kvm_dirty_ring_size(void)
+{
+ return 0;
+}
diff --git a/cpus-common.c b/cpus-common.c
index db459b41ce..793364dc0e 100644
--- a/cpus-common.c
+++ b/cpus-common.c
@@ -73,6 +73,12 @@ static int cpu_get_free_index(void)
}
CPUTailQ cpus = QTAILQ_HEAD_INITIALIZER(cpus);
+static unsigned int cpu_list_generation_id;
+
+unsigned int cpu_list_generation_id_get(void)
+{
+ return cpu_list_generation_id;
+}
void cpu_list_add(CPUState *cpu)
{
@@ -84,6 +90,7 @@ void cpu_list_add(CPUState *cpu)
assert(!cpu_index_auto_assigned);
}
QTAILQ_INSERT_TAIL_RCU(&cpus, cpu, node);
+ cpu_list_generation_id++;
}
void cpu_list_remove(CPUState *cpu)
@@ -96,6 +103,7 @@ void cpu_list_remove(CPUState *cpu)
QTAILQ_REMOVE_RCU(&cpus, cpu, node);
cpu->cpu_index = UNASSIGNED_CPU_INDEX;
+ cpu_list_generation_id++;
}
CPUState *qemu_get_cpu(int index)
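
The generation id above lets a caller detect vCPU hotplug/unplug between two
passes over the CPU list. The migration/dirtyrate.c hunk later in this diff
uses it exactly this way; condensed, the pattern is:

    unsigned int gen_id;

retry:
    cpu_list_lock();
    gen_id = cpu_list_generation_id_get();
    /* first pass: record per-vCPU start counters */
    cpu_list_unlock();

    /* ... wait out the measurement window ... */

    cpu_list_lock();
    if (gen_id != cpu_list_generation_id_get()) {
        /* CPU list changed under us: drop partial results and restart */
        cpu_list_unlock();
        goto retry;
    }
    /* second pass: record end counters, now safe against hotplug */
    cpu_list_unlock();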
diff --git a/hmp-commands-info.hx b/hmp-commands-info.hx
index 3ffa24bd67..188d9ece3b 100644
--- a/hmp-commands-info.hx
+++ b/hmp-commands-info.hx
@@ -865,6 +865,19 @@ SRST
Display the vcpu dirty rate information.
ERST
+ {
+ .name = "vcpu_dirty_limit",
+ .args_type = "",
+ .params = "",
+ .help = "show dirty page limit information of all vCPU",
+ .cmd = hmp_info_vcpu_dirty_limit,
+ },
+
+SRST
+ ``info vcpu_dirty_limit``
+ Display the vcpu dirty page limit information.
+ERST
+
#if defined(TARGET_I386)
{
.name = "sgx",
diff --git a/hmp-commands.hx b/hmp-commands.hx
index c9d465735a..182e639d14 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -1768,3 +1768,35 @@ ERST
"\n\t\t\t -b to specify dirty bitmap as method of calculation)",
.cmd = hmp_calc_dirty_rate,
},
+
+SRST
+``set_vcpu_dirty_limit``
+ Set the dirty page rate limit on virtual CPUs. The dirty limit status of
+ all virtual CPUs can be observed with the ``info vcpu_dirty_limit``
+ command.
+ERST
+
+ {
+ .name = "set_vcpu_dirty_limit",
+ .args_type = "dirty_rate:l,cpu_index:l?",
+ .params = "dirty_rate [cpu_index]",
+ .help = "set dirty page rate limit, use cpu_index to set limit"
+ "\n\t\t\t\t\t on a specified virtual cpu",
+ .cmd = hmp_set_vcpu_dirty_limit,
+ },
+
+SRST
+``cancel_vcpu_dirty_limit``
+ Cancel the dirty page rate limit on virtual CPUs. The dirty limit status of
+ all virtual CPUs can be observed with the ``info vcpu_dirty_limit``
+ command.
+ERST
+
+ {
+ .name = "cancel_vcpu_dirty_limit",
+ .args_type = "cpu_index:l?",
+ .params = "[cpu_index]",
+ .help = "cancel dirty page rate limit, use cpu_index to cancel"
+ "\n\t\t\t\t\t limit on a specified virtual cpu",
+ .cmd = hmp_cancel_vcpu_dirty_limit,
+ },
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 5968551a05..2281be4e10 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -35,6 +35,7 @@ extern intptr_t qemu_host_page_mask;
void qemu_init_cpu_list(void);
void cpu_list_lock(void);
void cpu_list_unlock(void);
+unsigned int cpu_list_generation_id_get(void);
void tcg_flush_softmmu_tlb(CPUState *cs);
diff --git a/include/exec/memory.h b/include/exec/memory.h
index a6a0f4d8ad..bfb1de8eea 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -69,7 +69,10 @@ static inline void fuzz_dma_read_cb(size_t addr,
/* Dirty tracking enabled because measuring dirty rate */
#define GLOBAL_DIRTY_DIRTY_RATE (1U << 1)
-#define GLOBAL_DIRTY_MASK (0x3)
+/* Dirty tracking enabled because dirty limit */
+#define GLOBAL_DIRTY_LIMIT (1U << 2)
+
+#define GLOBAL_DIRTY_MASK (0x7)
extern unsigned int global_dirty_tracking;
diff --git a/include/hw/core/cpu.h b/include/hw/core/cpu.h
index 996f94059f..500503da13 100644
--- a/include/hw/core/cpu.h
+++ b/include/hw/core/cpu.h
@@ -418,6 +418,12 @@ struct CPUState {
*/
bool throttle_thread_scheduled;
+ /*
+ * Sleep throttle_us_per_full microseconds once the dirty ring is full,
+ * if the dirty page rate limit is enabled.
+ */
+ int64_t throttle_us_per_full;
+
bool ignore_memory_transaction_failures;
/* Used for user-only emulation of prctl(PR_SET_UNALIGN). */
diff --git a/include/monitor/hmp.h b/include/monitor/hmp.h
index 2e89a97bd6..a618eb1e4e 100644
--- a/include/monitor/hmp.h
+++ b/include/monitor/hmp.h
@@ -131,6 +131,9 @@ void hmp_replay_delete_break(Monitor *mon, const QDict *qdict);
void hmp_replay_seek(Monitor *mon, const QDict *qdict);
void hmp_info_dirty_rate(Monitor *mon, const QDict *qdict);
void hmp_calc_dirty_rate(Monitor *mon, const QDict *qdict);
+void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
+void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
+void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict);
void hmp_human_readable_text_helper(Monitor *mon,
HumanReadableText *(*qmp_handler)(Error **));
void hmp_info_stats(Monitor *mon, const QDict *qdict);
diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
new file mode 100644
index 0000000000..8d2c1f3a6b
--- /dev/null
+++ b/include/sysemu/dirtylimit.h
@@ -0,0 +1,37 @@
+/*
+ * Dirty page rate limit common functions
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ * Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#ifndef QEMU_DIRTYLIMIT_H
+#define QEMU_DIRTYLIMIT_H
+
+#define DIRTYLIMIT_CALC_TIME_MS 1000 /* 1000ms */
+
+int64_t vcpu_dirty_rate_get(int cpu_index);
+void vcpu_dirty_rate_stat_start(void);
+void vcpu_dirty_rate_stat_stop(void);
+void vcpu_dirty_rate_stat_initialize(void);
+void vcpu_dirty_rate_stat_finalize(void);
+
+void dirtylimit_state_lock(void);
+void dirtylimit_state_unlock(void);
+void dirtylimit_state_initialize(void);
+void dirtylimit_state_finalize(void);
+bool dirtylimit_in_service(void);
+bool dirtylimit_vcpu_index_valid(int cpu_index);
+void dirtylimit_process(void);
+void dirtylimit_change(bool start);
+void dirtylimit_set_vcpu(int cpu_index,
+ uint64_t quota,
+ bool enable);
+void dirtylimit_set_all(uint64_t quota,
+ bool enable);
+void dirtylimit_vcpu_execute(CPUState *cpu);
+#endif
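
A minimal sketch of how a caller (e.g. a QMP/HMP handler) might drive this
API; the function names come from the header above, while the call ordering
and the quota unit (MB/s) are assumptions:

    void example_set_limit(int cpu_index, uint64_t quota)
    {
        if (!dirtylimit_in_service()) {
            vcpu_dirty_rate_stat_initialize();
            vcpu_dirty_rate_stat_start();   /* sample per-vCPU dirty rates */
            dirtylimit_state_initialize();
        }

        dirtylimit_state_lock();
        if (cpu_index < 0) {
            dirtylimit_set_all(quota, true);            /* every vCPU */
        } else if (dirtylimit_vcpu_index_valid(cpu_index)) {
            dirtylimit_set_vcpu(cpu_index, quota, true);
        }
        dirtylimit_state_unlock();
    }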
diff --git a/include/sysemu/dirtyrate.h b/include/sysemu/dirtyrate.h
new file mode 100644
index 0000000000..4d3b9a4902
--- /dev/null
+++ b/include/sysemu/dirtyrate.h
@@ -0,0 +1,28 @@
+/*
+ * dirty page rate helper functions
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ * Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef QEMU_DIRTYRATE_H
+#define QEMU_DIRTYRATE_H
+
+typedef struct VcpuStat {
+ int nvcpu; /* number of vcpu */
+ DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
+} VcpuStat;
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot);
+
+void global_dirty_log_change(unsigned int flag,
+ bool start);
+#endif
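
A sketch of a one-shot per-vCPU measurement using the two helpers declared
above (the flag value comes from the include/exec/memory.h hunk earlier in
this diff; the surrounding code is illustrative):

    VcpuStat stat = { 0 };
    int64_t duration;
    int i;

    /* start dirty tracking, measure for ~1s, then stop (one_shot=true) */
    global_dirty_log_change(GLOBAL_DIRTY_DIRTY_RATE, true);
    duration = vcpu_calculate_dirtyrate(1000, &stat,
                                        GLOBAL_DIRTY_DIRTY_RATE, true);

    for (i = 0; i < stat.nvcpu; i++) {
        /* stat.rates[i].dirty_rate is in MB/s over 'duration' ms */
    }
    g_free(stat.rates);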
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index a783c78868..efd6dee818 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -582,4 +582,6 @@ bool kvm_cpu_check_are_resettable(void);
bool kvm_arch_cpu_check_are_resettable(void);
bool kvm_dirty_ring_enabled(void);
+
+uint32_t kvm_dirty_ring_size(void);
#endif
diff --git a/io/channel-socket.c b/io/channel-socket.c
index 4466bb1cd4..74a936cc1f 100644
--- a/io/channel-socket.c
+++ b/io/channel-socket.c
@@ -716,12 +716,18 @@ static int qio_channel_socket_flush(QIOChannel *ioc,
struct cmsghdr *cm;
char control[CMSG_SPACE(sizeof(*serr))];
int received;
- int ret = 1;
+ int ret;
+
+ if (sioc->zero_copy_queued == sioc->zero_copy_sent) {
+ return 0;
+ }
msg.msg_control = control;
msg.msg_controllen = sizeof(control);
memset(control, 0, sizeof(control));
+ ret = 1;
+
while (sioc->zero_copy_sent < sioc->zero_copy_queued) {
received = recvmsg(sioc->fd, &msg, MSG_ERRQUEUE);
if (received < 0) {
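
The early return above keeps qio_channel_flush() from blocking in recvmsg()
when no zero-copy writes are in flight. Judging by the migration/multifd.c
hunk later in this diff, a return value of 1 signals that the kernel fell
back to copying at least once; a caller-side sketch under that assumption:

    Error *err = NULL;
    int ret = qio_channel_flush(ioc, &err);  /* drain MSG_ERRQUEUE events */

    if (ret < 0) {
        /* hard error, err is set */
    } else if (ret == 1) {
        /* zero copy silently degraded to an ordinary copy */
        dirty_sync_missed_zero_copy();
    }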
diff --git a/migration/channel.c b/migration/channel.c
index 90087d8986..1b0815039f 100644
--- a/migration/channel.c
+++ b/migration/channel.c
@@ -38,9 +38,7 @@ void migration_channel_process_incoming(QIOChannel *ioc)
trace_migration_set_incoming_channel(
ioc, object_get_typename(OBJECT(ioc)));
- if (migrate_use_tls() &&
- !object_dynamic_cast(OBJECT(ioc),
- TYPE_QIO_CHANNEL_TLS)) {
+ if (migrate_channel_requires_tls_upgrade(ioc)) {
migration_tls_channel_process_incoming(s, ioc, &local_err);
} else {
migration_ioc_register_yank(ioc);
@@ -70,10 +68,7 @@ void migration_channel_connect(MigrationState *s,
ioc, object_get_typename(OBJECT(ioc)), hostname, error);
if (!error) {
- if (s->parameters.tls_creds &&
- *s->parameters.tls_creds &&
- !object_dynamic_cast(OBJECT(ioc),
- TYPE_QIO_CHANNEL_TLS)) {
+ if (migrate_channel_requires_tls_upgrade(ioc)) {
migration_tls_channel_connect(s, ioc, hostname, &error);
if (!error) {
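
The two open-coded checks removed above hint at what the new helper folds
together; a sketch of what migrate_channel_requires_tls_upgrade() plausibly
looks like (migration/tls.c is listed in the diffstat but not shown here):

    bool migrate_channel_requires_tls_upgrade(QIOChannel *ioc)
    {
        if (!migrate_use_tls()) {
            return false;
        }
        /* No upgrade needed if the channel is already a TLS channel */
        return !object_dynamic_cast(OBJECT(ioc), TYPE_QIO_CHANNEL_TLS);
    }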
diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
index aace12a787..795fab5c37 100644
--- a/migration/dirtyrate.c
+++ b/migration/dirtyrate.c
@@ -46,7 +46,7 @@ static struct DirtyRateStat DirtyStat;
static DirtyRateMeasureMode dirtyrate_mode =
DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING;
-static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
+static int64_t dirty_stat_wait(int64_t msec, int64_t initial_time)
{
int64_t current_time;
@@ -60,6 +60,132 @@ static int64_t set_sample_page_period(int64_t msec, int64_t initial_time)
return msec;
}
+static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
+ CPUState *cpu, bool start)
+{
+ if (start) {
+ dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
+ } else {
+ dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
+ }
+}
+
+static int64_t do_calculate_dirtyrate(DirtyPageRecord dirty_pages,
+ int64_t calc_time_ms)
+{
+ uint64_t memory_size_MB;
+ uint64_t increased_dirty_pages =
+ dirty_pages.end_pages - dirty_pages.start_pages;
+
+ memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
+
+ return memory_size_MB * 1000 / calc_time_ms;
+}
+
+void global_dirty_log_change(unsigned int flag, bool start)
+{
+ qemu_mutex_lock_iothread();
+ if (start) {
+ memory_global_dirty_log_start(flag);
+ } else {
+ memory_global_dirty_log_stop(flag);
+ }
+ qemu_mutex_unlock_iothread();
+}
+
+/*
+ * global_dirty_log_sync
+ * 1. sync dirty log from kvm
+ * 2. stop dirty tracking if needed.
+ */
+static void global_dirty_log_sync(unsigned int flag, bool one_shot)
+{
+ qemu_mutex_lock_iothread();
+ memory_global_dirty_log_sync();
+ if (one_shot) {
+ memory_global_dirty_log_stop(flag);
+ }
+ qemu_mutex_unlock_iothread();
+}
+
+static DirtyPageRecord *vcpu_dirty_stat_alloc(VcpuStat *stat)
+{
+ CPUState *cpu;
+ DirtyPageRecord *records;
+ int nvcpu = 0;
+
+ CPU_FOREACH(cpu) {
+ nvcpu++;
+ }
+
+ stat->nvcpu = nvcpu;
+ stat->rates = g_malloc0(sizeof(DirtyRateVcpu) * nvcpu);
+
+ records = g_malloc0(sizeof(DirtyPageRecord) * nvcpu);
+
+ return records;
+}
+
+static void vcpu_dirty_stat_collect(VcpuStat *stat,
+ DirtyPageRecord *records,
+ bool start)
+{
+ CPUState *cpu;
+
+ CPU_FOREACH(cpu) {
+ record_dirtypages(records, cpu, start);
+ }
+}
+
+int64_t vcpu_calculate_dirtyrate(int64_t calc_time_ms,
+ VcpuStat *stat,
+ unsigned int flag,
+ bool one_shot)
+{
+ DirtyPageRecord *records;
+ int64_t init_time_ms;
+ int64_t duration;
+ int64_t dirtyrate;
+ int i = 0;
+ unsigned int gen_id;
+
+retry:
+ init_time_ms = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+
+ cpu_list_lock();
+ gen_id = cpu_list_generation_id_get();
+ records = vcpu_dirty_stat_alloc(stat);
+ vcpu_dirty_stat_collect(stat, records, true);
+ cpu_list_unlock();
+
+ duration = dirty_stat_wait(calc_time_ms, init_time_ms);
+
+ global_dirty_log_sync(flag, one_shot);
+
+ cpu_list_lock();
+ if (gen_id != cpu_list_generation_id_get()) {
+ g_free(records);
+ g_free(stat->rates);
+ cpu_list_unlock();
+ goto retry;
+ }
+ vcpu_dirty_stat_collect(stat, records, false);
+ cpu_list_unlock();
+
+ for (i = 0; i < stat->nvcpu; i++) {
+ dirtyrate = do_calculate_dirtyrate(records[i], duration);
+
+ stat->rates[i].id = i;
+ stat->rates[i].dirty_rate = dirtyrate;
+
+ trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
+ }
+
+ g_free(records);
+
+ return duration;
+}
+
static bool is_sample_period_valid(int64_t sec)
{
if (sec < MIN_FETCH_DIRTYRATE_TIME_SEC ||
@@ -396,44 +522,6 @@ static bool compare_page_hash_info(struct RamblockDirtyInfo *info,
return true;
}
-static inline void record_dirtypages(DirtyPageRecord *dirty_pages,
- CPUState *cpu, bool start)
-{
- if (start) {
- dirty_pages[cpu->cpu_index].start_pages = cpu->dirty_pages;
- } else {
- dirty_pages[cpu->cpu_index].end_pages = cpu->dirty_pages;
- }
-}
-
-static void dirtyrate_global_dirty_log_start(void)
-{
- qemu_mutex_lock_iothread();
- memory_global_dirty_log_start(GLOBAL_DIRTY_DIRTY_RATE);
- qemu_mutex_unlock_iothread();
-}
-
-static void dirtyrate_global_dirty_log_stop(void)
-{
- qemu_mutex_lock_iothread();
- memory_global_dirty_log_sync();
- memory_global_dirty_log_stop(GLOBAL_DIRTY_DIRTY_RATE);
- qemu_mutex_unlock_iothread();
-}
-
-static int64_t do_calculate_dirtyrate_vcpu(DirtyPageRecord dirty_pages)
-{
- uint64_t memory_size_MB;
- int64_t time_s;
- uint64_t increased_dirty_pages =
- dirty_pages.end_pages - dirty_pages.start_pages;
-
- memory_size_MB = (increased_dirty_pages * TARGET_PAGE_SIZE) >> 20;
- time_s = DirtyStat.calc_time;
-
- return memory_size_MB / time_s;
-}
-
static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
bool start)
{
@@ -444,11 +532,6 @@ static inline void record_dirtypages_bitmap(DirtyPageRecord *dirty_pages,
}
}
-static void do_calculate_dirtyrate_bitmap(DirtyPageRecord dirty_pages)
-{
- DirtyStat.dirty_rate = do_calculate_dirtyrate_vcpu(dirty_pages);
-}
-
static inline void dirtyrate_manual_reset_protect(void)
{
RAMBlock *block = NULL;
@@ -492,71 +575,49 @@ static void calculate_dirtyrate_dirty_bitmap(struct DirtyRateConfig config)
DirtyStat.start_time = start_time / 1000;
msec = config.sample_period_seconds * 1000;
- msec = set_sample_page_period(msec, start_time);
+ msec = dirty_stat_wait(msec, start_time);
DirtyStat.calc_time = msec / 1000;
/*
- * dirtyrate_global_dirty_log_stop do two things.
+ * The call below does two things:
* 1. fetch dirty bitmap from kvm
* 2. stop dirty tracking
*/
- dirtyrate_global_dirty_log_stop();
+ global_dirty_log_sync(GLOBAL_DIRTY_DIRTY_RATE, true);
record_dirtypages_bitmap(&dirty_pages, false);
- do_calculate_dirtyrate_bitmap(dirty_pages);
+ DirtyStat.dirty_rate = do_calculate_dirtyrate(dirty_pages, msec);
}
static void calculate_dirtyrate_dirty_ring(struct DirtyRateConfig config)
{
- CPUState *cpu;
- int64_t msec = 0;
- int64_t start_time;
+ int64_t duration;
uint64_t dirtyrate = 0;
uint64_t dirtyrate_sum = 0;
- DirtyPageRecord *dirty_pages;
- int nvcpu = 0;
int i = 0;
- CPU_FOREACH(cpu) {
- nvcpu++;
- }
-
- dirty_pages = malloc(sizeof(*dirty_pages) * nvcpu);
-
- DirtyStat.dirty_ring.nvcpu = nvcpu;
- DirtyStat.dirty_ring.rates = malloc(sizeof(DirtyRateVcpu) * nvcpu);
-
- dirtyrate_global_dirty_log_start();
-
- CPU_FOREACH(cpu) {
- record_dirtypages(dirty_pages, cpu, true);
- }
-
- start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
- DirtyStat.start_time = start_time / 1000;
+ /* start log sync */
+ global_dirty_log_change(GLOBAL_DIRTY_DIRTY_RATE, true);
- msec = config.sample_period_seconds * 1000;
- msec = set_sample_page_period(msec, start_time);
- DirtyStat.calc_time = msec / 1000;
+ DirtyStat.start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) / 1000;
- dirtyrate_global_dirty_log_stop();
+ /* calculate vcpu dirtyrate */
+ duration = vcpu_calculate_dirtyrate(config.sample_period_seconds * 1000,
+ &DirtyStat.dirty_ring,
+ GLOBAL_DIRTY_DIRTY_RATE,
+ true);
- CPU_FOREACH(cpu) {
- record_dirtypages(dirty_pages, cpu, false);
- }
+ DirtyStat.calc_time = duration / 1000;
+ /* calculate vm dirtyrate */
for (i = 0; i < DirtyStat.dirty_ring.nvcpu; i++) {
- dirtyrate = do_calculate_dirtyrate_vcpu(dirty_pages[i]);
- trace_dirtyrate_do_calculate_vcpu(i, dirtyrate);
-
- DirtyStat.dirty_ring.rates[i].id = i;
+ dirtyrate = DirtyStat.dirty_ring.rates[i].dirty_rate;
DirtyStat.dirty_ring.rates[i].dirty_rate = dirtyrate;
dirtyrate_sum += dirtyrate;
}
DirtyStat.dirty_rate = dirtyrate_sum;
- free(dirty_pages);
}
static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
@@ -574,7 +635,7 @@ static void calculate_dirtyrate_sample_vm(struct DirtyRateConfig config)
rcu_read_unlock();
msec = config.sample_period_seconds * 1000;
- msec = set_sample_page_period(msec, initial_time);
+ msec = dirty_stat_wait(msec, initial_time);
DirtyStat.start_time = initial_time / 1000;
DirtyStat.calc_time = msec / 1000;
diff --git a/migration/dirtyrate.h b/migration/dirtyrate.h
index 69d4c5b865..594a5c0bb6 100644
--- a/migration/dirtyrate.h
+++ b/migration/dirtyrate.h
@@ -13,6 +13,8 @@
#ifndef QEMU_MIGRATION_DIRTYRATE_H
#define QEMU_MIGRATION_DIRTYRATE_H
+#include "sysemu/dirtyrate.h"
+
/*
* Sample 512 pages per GB as default.
*/
@@ -65,11 +67,6 @@ typedef struct SampleVMStat {
uint64_t total_block_mem_MB; /* size of total sampled pages in MB */
} SampleVMStat;
-typedef struct VcpuStat {
- int nvcpu; /* number of vcpu */
- DirtyRateVcpu *rates; /* array of dirty rate for each vcpu */
-} VcpuStat;
-
/*
* Store calculation statistics for each measure.
*/
diff --git a/migration/migration.c b/migration/migration.c
index 78f5057373..e03f698a3c 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -48,6 +48,7 @@
#include "trace.h"
#include "exec/target_page.h"
#include "io/channel-buffer.h"
+#include "io/channel-tls.h"
#include "migration/colo.h"
#include "hw/boards.h"
#include "hw/qdev-properties.h"
@@ -215,9 +216,11 @@ void migration_object_init(void)
current_incoming->postcopy_remote_fds =
g_array_new(FALSE, TRUE, sizeof(struct PostCopyFD));
qemu_mutex_init(&current_incoming->rp_mutex);
+ qemu_mutex_init(&current_incoming->postcopy_prio_thread_mutex);
qemu_event_init(&current_incoming->main_thread_load_event, false);
qemu_sem_init(&current_incoming->postcopy_pause_sem_dst, 0);
qemu_sem_init(&current_incoming->postcopy_pause_sem_fault, 0);
+ qemu_sem_init(&current_incoming->postcopy_pause_sem_fast_load, 0);
qemu_mutex_init(&current_incoming->page_request_mutex);
current_incoming->page_requested = g_tree_new(page_request_addr_cmp);
@@ -321,6 +324,12 @@ void migration_incoming_state_destroy(void)
mis->page_requested = NULL;
}
+ if (mis->postcopy_qemufile_dst) {
+ migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
+ qemu_fclose(mis->postcopy_qemufile_dst);
+ mis->postcopy_qemufile_dst = NULL;
+ }
+
yank_unregister_instance(MIGRATION_YANK_INSTANCE);
}
@@ -691,9 +700,9 @@ static bool postcopy_try_recover(void)
/*
* Here, we only wake up the main loading thread (while the
- * fault thread will still be waiting), so that we can receive
+ * rest threads will still be waiting), so that we can receive
* commands from source now, and answer it if needed. The
- * fault thread will be woken up afterwards until we are sure
+ * rest threads will be woken up afterwards until we are sure
* that source is ready to reply to page requests.
*/
qemu_sem_post(&mis->postcopy_pause_sem_dst);
@@ -714,15 +723,21 @@ void migration_fd_process_incoming(QEMUFile *f, Error **errp)
migration_incoming_process();
}
+static bool migration_needs_multiple_sockets(void)
+{
+ return migrate_use_multifd() || migrate_postcopy_preempt();
+}
+
void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
{
MigrationIncomingState *mis = migration_incoming_get_current();
Error *local_err = NULL;
bool start_migration;
+ QEMUFile *f;
if (!mis->from_src_file) {
/* The first connection (multifd may have multiple) */
- QEMUFile *f = qemu_file_new_input(ioc);
+ f = qemu_file_new_input(ioc);
if (!migration_incoming_setup(f, errp)) {
return;
@@ -730,13 +745,19 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
/*
* Common migration only needs one channel, so we can start
- * right now. Multifd needs more than one channel, we wait.
+ * right now. Some features need more than one channel, so we wait.
*/
- start_migration = !migrate_use_multifd();
+ start_migration = !migration_needs_multiple_sockets();
} else {
/* Multiple connections */
- assert(migrate_use_multifd());
- start_migration = multifd_recv_new_channel(ioc, &local_err);
+ assert(migration_needs_multiple_sockets());
+ if (migrate_use_multifd()) {
+ start_migration = multifd_recv_new_channel(ioc, &local_err);
+ } else {
+ assert(migrate_postcopy_preempt());
+ f = qemu_file_new_input(ioc);
+ start_migration = postcopy_preempt_new_channel(mis, f);
+ }
if (local_err) {
error_propagate(errp, local_err);
return;
@@ -761,11 +782,20 @@ void migration_ioc_process_incoming(QIOChannel *ioc, Error **errp)
bool migration_has_all_channels(void)
{
MigrationIncomingState *mis = migration_incoming_get_current();
- bool all_channels;
- all_channels = multifd_recv_all_channels_created();
+ if (!mis->from_src_file) {
+ return false;
+ }
+
+ if (migrate_use_multifd()) {
+ return multifd_recv_all_channels_created();
+ }
+
+ if (migrate_postcopy_preempt()) {
+ return mis->postcopy_qemufile_dst != NULL;
+ }
- return all_channels && mis->from_src_file != NULL;
+ return true;
}
/*
@@ -1027,6 +1057,8 @@ static void populate_ram_info(MigrationInfo *info, MigrationState *s)
info->ram->normal_bytes = ram_counters.normal * page_size;
info->ram->mbps = s->mbps;
info->ram->dirty_sync_count = ram_counters.dirty_sync_count;
+ info->ram->dirty_sync_missed_zero_copy =
+ ram_counters.dirty_sync_missed_zero_copy;
info->ram->postcopy_requests = ram_counters.postcopy_requests;
info->ram->page_size = page_size;
info->ram->multifd_bytes = ram_counters.multifd_bytes;
@@ -1274,7 +1306,9 @@ static bool migrate_caps_check(bool *cap_list,
#ifdef CONFIG_LINUX
if (cap_list[MIGRATION_CAPABILITY_ZERO_COPY_SEND] &&
(!cap_list[MIGRATION_CAPABILITY_MULTIFD] ||
- migrate_use_compression() ||
+ cap_list[MIGRATION_CAPABILITY_COMPRESS] ||
+ cap_list[MIGRATION_CAPABILITY_XBZRLE] ||
+ migrate_multifd_compression() ||
migrate_use_tls())) {
error_setg(errp,
"Zero copy only available for non-compressed non-TLS multifd migration");
@@ -1297,6 +1331,13 @@ static bool migrate_caps_check(bool *cap_list,
return false;
}
+ if (cap_list[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT]) {
+ if (!cap_list[MIGRATION_CAPABILITY_POSTCOPY_RAM]) {
+ error_setg(errp, "Postcopy preempt requires postcopy-ram");
+ return false;
+ }
+ }
+
return true;
}
@@ -1511,6 +1552,17 @@ static bool migrate_params_check(MigrationParameters *params, Error **errp)
error_prepend(errp, "Invalid mapping given for block-bitmap-mapping: ");
return false;
}
+
+#ifdef CONFIG_LINUX
+ if (migrate_use_zero_copy_send() &&
+ ((params->has_multifd_compression && params->multifd_compression) ||
+ (params->has_tls_creds && params->tls_creds && *params->tls_creds))) {
+ error_setg(errp,
+ "Zero copy only available for non-compressed non-TLS multifd migration");
+ return false;
+ }
+#endif
+
return true;
}
@@ -1867,6 +1919,12 @@ static void migrate_fd_cleanup(MigrationState *s)
qemu_fclose(tmp);
}
+ if (s->postcopy_qemufile_src) {
+ migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
+ qemu_fclose(s->postcopy_qemufile_src);
+ s->postcopy_qemufile_src = NULL;
+ }
+
assert(!migration_is_active(s));
if (s->state == MIGRATION_STATUS_CANCELLING) {
@@ -2663,6 +2721,15 @@ bool migrate_background_snapshot(void)
return s->enabled_capabilities[MIGRATION_CAPABILITY_BACKGROUND_SNAPSHOT];
}
+bool migrate_postcopy_preempt(void)
+{
+ MigrationState *s;
+
+ s = migrate_get_current();
+
+ return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_PREEMPT];
+}
+
/* migration thread support */
/*
* Something bad happened to the RP stream, mark an error
@@ -3002,6 +3069,12 @@ static int postcopy_start(MigrationState *ms)
int64_t bandwidth = migrate_max_postcopy_bandwidth();
bool restart_block = false;
int cur_state = MIGRATION_STATUS_ACTIVE;
+
+ if (postcopy_preempt_wait_channel(ms)) {
+ migrate_set_state(&ms->state, ms->state, MIGRATION_STATUS_FAILED);
+ return -1;
+ }
+
if (!migrate_pause_before_switchover()) {
migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE,
MIGRATION_STATUS_POSTCOPY_ACTIVE);
@@ -3141,6 +3214,8 @@ static int postcopy_start(MigrationState *ms)
MIGRATION_STATUS_FAILED);
}
+ trace_postcopy_preempt_enabled(migrate_postcopy_preempt());
+
return ret;
fail_closefb:
@@ -3253,6 +3328,11 @@ static void migration_completion(MigrationState *s)
qemu_savevm_state_complete_postcopy(s->to_dst_file);
qemu_mutex_unlock_iothread();
+ /* Shutdown the postcopy fast path thread */
+ if (migrate_postcopy_preempt()) {
+ postcopy_preempt_shutdown_file(s);
+ }
+
trace_migration_completion_postcopy_end_after_complete();
} else {
goto fail;
@@ -3447,6 +3527,18 @@ static MigThrError postcopy_pause(MigrationState *s)
qemu_file_shutdown(file);
qemu_fclose(file);
+ /*
+ * Do the same to postcopy fast path socket too if there is. No
+ * locking needed because no racer as long as we do this before setting
+ * status to paused.
+ */
+ if (s->postcopy_qemufile_src) {
+ migration_ioc_unregister_yank_from_file(s->postcopy_qemufile_src);
+ qemu_file_shutdown(s->postcopy_qemufile_src);
+ qemu_fclose(s->postcopy_qemufile_src);
+ s->postcopy_qemufile_src = NULL;
+ }
+
migrate_set_state(&s->state, s->state,
MIGRATION_STATUS_POSTCOPY_PAUSED);
@@ -3464,6 +3556,14 @@ static MigThrError postcopy_pause(MigrationState *s)
if (s->state == MIGRATION_STATUS_POSTCOPY_RECOVER) {
/* Woken up by a recover procedure. Give it a shot */
+ if (postcopy_preempt_wait_channel(s)) {
+ /*
+ * Preempt enabled, and creating the new channel failed; loop
+ * back to wait for another recovery.
+ */
+ continue;
+ }
+
/*
* Firstly, let's wake up the return path now, with a new
* return path channel.
@@ -3502,8 +3602,13 @@ static MigThrError migration_detect_error(MigrationState *s)
return MIG_THR_ERR_FATAL;
}
- /* Try to detect any file errors */
- ret = qemu_file_get_error_obj(s->to_dst_file, &local_error);
+ /*
+ * Try to detect any file errors. Note that postcopy_qemufile_src will
+ * be NULL when postcopy preempt is not enabled.
+ */
+ ret = qemu_file_get_error_obj_any(s->to_dst_file,
+ s->postcopy_qemufile_src,
+ &local_error);
if (!ret) {
/* Everything is fine */
assert(!local_error);
@@ -4141,6 +4246,15 @@ void migrate_fd_connect(MigrationState *s, Error *error_in)
}
}
+ /* This needs to be done before resuming a postcopy */
+ if (postcopy_preempt_setup(s, &local_err)) {
+ error_report_err(local_err);
+ migrate_set_state(&s->state, MIGRATION_STATUS_SETUP,
+ MIGRATION_STATUS_FAILED);
+ migrate_fd_cleanup(s);
+ return;
+ }
+
if (resume) {
/* Wakeup the main migration thread to do the recovery */
migrate_set_state(&s->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
@@ -4265,6 +4379,11 @@ static Property migration_properties[] = {
DEFINE_PROP_SIZE("announce-step", MigrationState,
parameters.announce_step,
DEFAULT_MIGRATE_ANNOUNCE_STEP),
+ DEFINE_PROP_BOOL("x-postcopy-preempt-break-huge", MigrationState,
+ postcopy_preempt_break_huge, true),
+ DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds),
+ DEFINE_PROP_STRING("tls-hostname", MigrationState, parameters.tls_hostname),
+ DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz),
/* Migration capabilities */
DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -4274,6 +4393,8 @@ static Property migration_properties[] = {
DEFINE_PROP_MIG_CAP("x-compress", MIGRATION_CAPABILITY_COMPRESS),
DEFINE_PROP_MIG_CAP("x-events", MIGRATION_CAPABILITY_EVENTS),
DEFINE_PROP_MIG_CAP("x-postcopy-ram", MIGRATION_CAPABILITY_POSTCOPY_RAM),
+ DEFINE_PROP_MIG_CAP("x-postcopy-preempt",
+ MIGRATION_CAPABILITY_POSTCOPY_PREEMPT),
DEFINE_PROP_MIG_CAP("x-colo", MIGRATION_CAPABILITY_X_COLO),
DEFINE_PROP_MIG_CAP("x-release-ram", MIGRATION_CAPABILITY_RELEASE_RAM),
DEFINE_PROP_MIG_CAP("x-block", MIGRATION_CAPABILITY_BLOCK),
@@ -4300,18 +4421,16 @@ static void migration_class_init(ObjectClass *klass, void *data)
static void migration_instance_finalize(Object *obj)
{
MigrationState *ms = MIGRATION_OBJ(obj);
- MigrationParameters *params = &ms->parameters;
qemu_mutex_destroy(&ms->error_mutex);
qemu_mutex_destroy(&ms->qemu_file_lock);
- g_free(params->tls_hostname);
- g_free(params->tls_creds);
qemu_sem_destroy(&ms->wait_unplug_sem);
qemu_sem_destroy(&ms->rate_limit_sem);
qemu_sem_destroy(&ms->pause_sem);
qemu_sem_destroy(&ms->postcopy_pause_sem);
qemu_sem_destroy(&ms->postcopy_pause_rp_sem);
qemu_sem_destroy(&ms->rp_state.rp_sem);
+ qemu_sem_destroy(&ms->postcopy_qemufile_src_sem);
error_free(ms->error);
}
@@ -4358,6 +4477,7 @@ static void migration_instance_init(Object *obj)
qemu_sem_init(&ms->rp_state.rp_sem, 0);
qemu_sem_init(&ms->rate_limit_sem, 0);
qemu_sem_init(&ms->wait_unplug_sem, 0);
+ qemu_sem_init(&ms->postcopy_qemufile_src_sem, 0);
qemu_mutex_init(&ms->qemu_file_lock);
}
diff --git a/migration/migration.h b/migration/migration.h
index 485d58b95f..cdad8aceaa 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -23,6 +23,7 @@
#include "io/channel-buffer.h"
#include "net/announce.h"
#include "qom/object.h"
+#include "postcopy-ram.h"
struct PostcopyBlocktimeContext;
@@ -67,7 +68,7 @@ typedef struct {
struct MigrationIncomingState {
QEMUFile *from_src_file;
/* Previously received RAM's RAMBlock pointer */
- RAMBlock *last_recv_block;
+ RAMBlock *last_recv_block[RAM_CHANNEL_MAX];
/* A hook to allow cleanup at the end of incoming migration */
void *transport_data;
void (*transport_cleanup)(void *data);
@@ -112,6 +113,23 @@ struct MigrationIncomingState {
* enabled.
*/
unsigned int postcopy_channels;
+ /* QEMUFile for postcopy only; it'll be handled by a separate thread */
+ QEMUFile *postcopy_qemufile_dst;
+ /* Postcopy priority thread is used to receive postcopy requested pages */
+ QemuThread postcopy_prio_thread;
+ bool postcopy_prio_thread_created;
+ /*
+ * Used to sync between the ram load main thread and the fast ram load
+ * thread. It protects postcopy_qemufile_dst, which is the postcopy
+ * fast channel.
+ *
+ * The ram fast load thread will take it mostly for the whole lifecycle
+ * because it needs to continuously read data from the channel, and
+ * it'll only release this mutex if postcopy is interrupted, so that
+ * the ram load main thread will take this mutex over and properly
+ * release the broken channel.
+ */
+ QemuMutex postcopy_prio_thread_mutex;
/*
* An array of temp host huge pages to be used, one for each postcopy
* channel.
@@ -141,6 +159,13 @@ struct MigrationIncomingState {
/* notify PAUSED postcopy incoming migrations to try to continue */
QemuSemaphore postcopy_pause_sem_dst;
QemuSemaphore postcopy_pause_sem_fault;
+ /*
+ * This semaphore is used to allow the ram fast load thread (only when
+ * postcopy preempt is enabled) to sleep when a network interruption
+ * is detected. When the recovery is done, the main load
+ * thread will kick the fast ram load thread using this semaphore.
+ */
+ QemuSemaphore postcopy_pause_sem_fast_load;
/* List of listening socket addresses */
SocketAddressList *socket_address_list;
@@ -192,6 +217,15 @@ struct MigrationState {
QEMUBH *cleanup_bh;
/* Protected by qemu_file_lock */
QEMUFile *to_dst_file;
+ /* Postcopy specific transfer channel */
+ QEMUFile *postcopy_qemufile_src;
+ /*
+ * It is posted when the preempt channel is established. Note: this is
+ * used for both the start or recover of a postcopy migration. We'll
+ * post to this sem every time a new preempt channel is created in the
+ * main thread, and we keep post() and wait() in pair.
+ */
+ QemuSemaphore postcopy_qemufile_src_sem;
QIOChannelBuffer *bioc;
/*
* Protects to_dst_file/from_dst_file pointers. We need to make sure we
@@ -306,6 +340,13 @@ struct MigrationState {
bool send_configuration;
/* Whether we send section footer during migration */
bool send_section_footer;
+ /*
+ * Whether we allow breaking huge-page transfers when postcopy preempt is
+ * enabled. When disabled, we won't interrupt precopy in the middle of
+ * sending a host huge page, which is the old behavior of vanilla postcopy.
+ * NOTE: this parameter is ignored if postcopy preempt is not enabled.
+ */
+ bool postcopy_preempt_break_huge;
/* Needed by postcopy-pause state */
QemuSemaphore postcopy_pause_sem;
@@ -400,6 +441,7 @@ int migrate_decompress_threads(void);
bool migrate_use_events(void);
bool migrate_postcopy_blocktime(void);
bool migrate_background_snapshot(void);
+bool migrate_postcopy_preempt(void);
/* Sending on the return path - generic and then for each message type */
void migrate_send_rp_shut(MigrationIncomingState *mis,
diff --git a/migration/multifd-zlib.c b/migration/multifd-zlib.c
index 3a7ae44485..18213a9513 100644
--- a/migration/multifd-zlib.c
+++ b/migration/multifd-zlib.c
@@ -27,6 +27,8 @@ struct zlib_data {
uint8_t *zbuff;
/* size of compressed buffer */
uint32_t zbuff_len;
+ /* uncompressed buffer of size qemu_target_page_size() */
+ uint8_t *buf;
};
/* Multifd zlib compression */
@@ -45,26 +47,38 @@ static int zlib_send_setup(MultiFDSendParams *p, Error **errp)
{
struct zlib_data *z = g_new0(struct zlib_data, 1);
z_stream *zs = &z->zs;
+ const char *err_msg;
zs->zalloc = Z_NULL;
zs->zfree = Z_NULL;
zs->opaque = Z_NULL;
if (deflateInit(zs, migrate_multifd_zlib_level()) != Z_OK) {
- g_free(z);
- error_setg(errp, "multifd %u: deflate init failed", p->id);
- return -1;
+ err_msg = "deflate init failed";
+ goto err_free_z;
}
/* This is the maximum size of the compressed buffer */
z->zbuff_len = compressBound(MULTIFD_PACKET_SIZE);
z->zbuff = g_try_malloc(z->zbuff_len);
if (!z->zbuff) {
- deflateEnd(&z->zs);
- g_free(z);
- error_setg(errp, "multifd %u: out of memory for zbuff", p->id);
- return -1;
+ err_msg = "out of memory for zbuff";
+ goto err_deflate_end;
+ }
+ z->buf = g_try_malloc(qemu_target_page_size());
+ if (!z->buf) {
+ err_msg = "out of memory for buf";
+ goto err_free_zbuff;
}
p->data = z;
return 0;
+
+err_free_zbuff:
+ g_free(z->zbuff);
+err_deflate_end:
+ deflateEnd(&z->zs);
+err_free_z:
+ g_free(z);
+ error_setg(errp, "multifd %u: %s", p->id, err_msg);
+ return -1;
}
/**
@@ -82,6 +96,8 @@ static void zlib_send_cleanup(MultiFDSendParams *p, Error **errp)
deflateEnd(&z->zs);
g_free(z->zbuff);
z->zbuff = NULL;
+ g_free(z->buf);
+ z->buf = NULL;
g_free(p->data);
p->data = NULL;
}
@@ -114,8 +130,14 @@ static int zlib_send_prepare(MultiFDSendParams *p, Error **errp)
flush = Z_SYNC_FLUSH;
}
+ /*
+ * Since the VM might be running, the page may be changing concurrently
+ * with compression. zlib does not guarantee that this is safe,
+ * therefore copy the page before calling deflate().
+ */
+ memcpy(z->buf, p->pages->block->host + p->normal[i], page_size);
zs->avail_in = page_size;
- zs->next_in = p->pages->block->host + p->normal[i];
+ zs->next_in = z->buf;
zs->avail_out = available;
zs->next_out = z->zbuff + out_size;
diff --git a/migration/multifd.c b/migration/multifd.c
index 684c014c86..586ddc9d65 100644
--- a/migration/multifd.c
+++ b/migration/multifd.c
@@ -624,6 +624,8 @@ int multifd_send_sync_main(QEMUFile *f)
if (ret < 0) {
error_report_err(err);
return -1;
+ } else if (ret == 1) {
+ dirty_sync_missed_zero_copy();
}
}
}
@@ -831,9 +833,7 @@ static bool multifd_channel_connect(MultiFDSendParams *p,
migrate_get_current()->hostname, error);
if (!error) {
- if (migrate_use_tls() &&
- !object_dynamic_cast(OBJECT(ioc),
- TYPE_QIO_CHANNEL_TLS)) {
+ if (migrate_channel_requires_tls_upgrade(ioc)) {
multifd_tls_channel_connect(p, ioc, &error);
if (!error) {
/*
diff --git a/migration/multifd.h b/migration/multifd.h
index 4d8d89e5e5..519f498643 100644
--- a/migration/multifd.h
+++ b/migration/multifd.h
@@ -65,7 +65,9 @@ typedef struct {
} MultiFDPages_t;
typedef struct {
- /* this fields are not changed once the thread is created */
+ /* Fields are only written at creating/deletion time */
+ /* No lock required for them, they are read only */
+
/* channel number */
uint8_t id;
/* channel thread name */
@@ -74,39 +76,47 @@ typedef struct {
QemuThread thread;
/* communication channel */
QIOChannel *c;
+ /* is the yank function registered */
+ bool registered_yank;
+ /* packet allocated len */
+ uint32_t packet_len;
+ /* multifd flags for sending ram */
+ int write_flags;
+
/* sem where to wait for more work */
QemuSemaphore sem;
+ /* syncs main thread and channels */
+ QemuSemaphore sem_sync;
+
/* this mutex protects the following parameters */
QemuMutex mutex;
/* is this channel thread running */
bool running;
/* should this thread finish */
bool quit;
- /* is the yank function registered */
- bool registered_yank;
+ /* multifd flags for each packet */
+ uint32_t flags;
+ /* global number of generated multifd packets */
+ uint64_t packet_num;
/* thread has work to do */
int pending_job;
- /* array of pages to sent */
+ /* array of pages to send.
+ * The owner of 'pages' depends on the 'pending_job' value:
+ * pending_job == 0 -> migration_thread can use it.
+ * pending_job != 0 -> multifd_channel can use it.
+ */
MultiFDPages_t *pages;
- /* packet allocated len */
- uint32_t packet_len;
+
+ /* thread local variables. No locking required */
+
/* pointer to the packet */
MultiFDPacket_t *packet;
- /* multifd flags for sending ram */
- int write_flags;
- /* multifd flags for each packet */
- uint32_t flags;
/* size of the next packet that contains pages */
uint32_t next_packet_size;
- /* global number of generated multifd packets */
- uint64_t packet_num;
- /* thread local variables */
/* packets sent through this channel */
uint64_t num_packets;
/* non zero pages sent through this channel */
uint64_t total_normal_pages;
- /* syncs main thread and channels */
- QemuSemaphore sem_sync;
/* buffers to send */
struct iovec *iov;
/* number of iovs used */
@@ -120,7 +130,9 @@ typedef struct {
} MultiFDSendParams;
typedef struct {
- /* this fields are not changed once the thread is created */
+ /* Fields are only written at creating/deletion time */
+ /* No lock required for them, they are read only */
+
/* channel number */
uint8_t id;
/* channel thread name */
@@ -129,31 +141,35 @@ typedef struct {
QemuThread thread;
/* communication channel */
QIOChannel *c;
+ /* packet allocated len */
+ uint32_t packet_len;
+
+ /* syncs main thread and channels */
+ QemuSemaphore sem_sync;
+
/* this mutex protects the following parameters */
QemuMutex mutex;
/* is this channel thread running */
bool running;
/* should this thread finish */
bool quit;
- /* ramblock host address */
- uint8_t *host;
- /* packet allocated len */
- uint32_t packet_len;
- /* pointer to the packet */
- MultiFDPacket_t *packet;
/* multifd flags for each packet */
uint32_t flags;
/* global number of generated multifd packets */
uint64_t packet_num;
- /* thread local variables */
+
+ /* thread local variables. No locking required */
+
+ /* pointer to the packet */
+ MultiFDPacket_t *packet;
/* size of the next packet that contains pages */
uint32_t next_packet_size;
/* packets sent through this channel */
uint64_t num_packets;
+ /* ramblock host address */
+ uint8_t *host;
/* non zero pages recv through this channel */
uint64_t total_normal_pages;
- /* syncs main thread and channels */
- QemuSemaphore sem_sync;
/* buffers to recv */
struct iovec *iov;
/* Pages that are not zero */
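
The reshuffled fields group what each lock protects, and the new comment on
'pages' encodes a handoff protocol. A condensed sketch of the producer side
implied by that comment (an illustration, not the multifd.c implementation):

    qemu_mutex_lock(&p->mutex);
    if (!p->pending_job) {
        /* pending_job == 0: the migration thread owns p->pages */
        p->pages = next_pages;
        p->pending_job++;          /* ownership moves to the channel thread */
        qemu_sem_post(&p->sem);    /* wake the channel thread */
    }
    qemu_mutex_unlock(&p->mutex);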
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index a66dd536d9..b9a37ef255 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -33,6 +33,10 @@
#include "trace.h"
#include "hw/boards.h"
#include "exec/ramblock.h"
+#include "socket.h"
+#include "qemu-file.h"
+#include "yank_functions.h"
+#include "tls.h"
/* Arbitrary limit on size of each discard command,
* keeps them around ~200 bytes
@@ -567,6 +571,11 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
{
trace_postcopy_ram_incoming_cleanup_entry();
+ if (mis->postcopy_prio_thread_created) {
+ qemu_thread_join(&mis->postcopy_prio_thread);
+ mis->postcopy_prio_thread_created = false;
+ }
+
if (mis->have_fault_thread) {
Error *local_err = NULL;
@@ -1102,8 +1111,13 @@ static int postcopy_temp_pages_setup(MigrationIncomingState *mis)
int err, i, channels;
void *temp_page;
- /* TODO: will be boosted when enable postcopy preemption */
- mis->postcopy_channels = 1;
+ if (migrate_postcopy_preempt()) {
+ /* If preemption enabled, need extra channel for urgent requests */
+ mis->postcopy_channels = RAM_CHANNEL_MAX;
+ } else {
+ /* Both precopy/postcopy on the same channel */
+ mis->postcopy_channels = 1;
+ }
channels = mis->postcopy_channels;
mis->postcopy_tmp_pages = g_malloc0_n(sizeof(PostcopyTmpPage), channels);
@@ -1170,7 +1184,7 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
return -1;
}
- postcopy_thread_create(mis, &mis->fault_thread, "postcopy/fault",
+ postcopy_thread_create(mis, &mis->fault_thread, "fault-default",
postcopy_ram_fault_thread, QEMU_THREAD_JOINABLE);
mis->have_fault_thread = true;
@@ -1185,6 +1199,16 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
return -1;
}
+ if (migrate_postcopy_preempt()) {
+ /*
+ * This thread needs to be created after the temp pages because
+ * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
+ */
+ postcopy_thread_create(mis, &mis->postcopy_prio_thread, "fault-fast",
+ postcopy_preempt_thread, QEMU_THREAD_JOINABLE);
+ mis->postcopy_prio_thread_created = true;
+ }
+
trace_postcopy_ram_enable_notify();
return 0;
@@ -1514,3 +1538,159 @@ void postcopy_unregister_shared_ufd(struct PostCopyFD *pcfd)
}
}
}
+
+bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file)
+{
+ /*
+ * The new loading channel has its own threads, so it needs to use
+ * blocking I/O too. That is already the default; just be explicit.
+ */
+ qemu_file_set_blocking(file, true);
+ mis->postcopy_qemufile_dst = file;
+ trace_postcopy_preempt_new_channel();
+
+ /* Start the migration immediately */
+ return true;
+}
+
+/*
+ * Set up the postcopy preempt channel with the IOC. If ERROR is specified,
+ * record the error instead. This helper will free ERROR if specified.
+ */
+static void
+postcopy_preempt_send_channel_done(MigrationState *s,
+ QIOChannel *ioc, Error *local_err)
+{
+ if (local_err) {
+ migrate_set_error(s, local_err);
+ error_free(local_err);
+ } else {
+ migration_ioc_register_yank(ioc);
+ s->postcopy_qemufile_src = qemu_file_new_output(ioc);
+ trace_postcopy_preempt_new_channel();
+ }
+
+ /*
+ * Kick the waiter in all cases. The waiter should check upon
+ * postcopy_qemufile_src to know whether it failed or not.
+ */
+ qemu_sem_post(&s->postcopy_qemufile_src_sem);
+}
+
+static void
+postcopy_preempt_tls_handshake(QIOTask *task, gpointer opaque)
+{
+ g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
+ MigrationState *s = opaque;
+ Error *local_err = NULL;
+
+ qio_task_propagate_error(task, &local_err);
+ postcopy_preempt_send_channel_done(s, ioc, local_err);
+}
+
+static void
+postcopy_preempt_send_channel_new(QIOTask *task, gpointer opaque)
+{
+ g_autoptr(QIOChannel) ioc = QIO_CHANNEL(qio_task_get_source(task));
+ MigrationState *s = opaque;
+ QIOChannelTLS *tioc;
+ Error *local_err = NULL;
+
+ if (qio_task_propagate_error(task, &local_err)) {
+ goto out;
+ }
+
+ if (migrate_channel_requires_tls_upgrade(ioc)) {
+ tioc = migration_tls_client_create(s, ioc, s->hostname, &local_err);
+ if (!tioc) {
+ goto out;
+ }
+ trace_postcopy_preempt_tls_handshake();
+ qio_channel_set_name(QIO_CHANNEL(tioc), "migration-tls-preempt");
+ qio_channel_tls_handshake(tioc, postcopy_preempt_tls_handshake,
+ s, NULL, NULL);
+ /* The rest of the channel setup continues after the TLS handshake */
+ return;
+ }
+
+out:
+ /* This handles both good and error cases */
+ postcopy_preempt_send_channel_done(s, ioc, local_err);
+}
+
+/* Returns 0 if channel established, -1 for error. */
+int postcopy_preempt_wait_channel(MigrationState *s)
+{
+ /* If preempt not enabled, no need to wait */
+ if (!migrate_postcopy_preempt()) {
+ return 0;
+ }
+
+ /*
+ * We need the postcopy preempt channel to be established before
+ * starting doing anything.
+ */
+ qemu_sem_wait(&s->postcopy_qemufile_src_sem);
+
+ return s->postcopy_qemufile_src ? 0 : -1;
+}
+
+int postcopy_preempt_setup(MigrationState *s, Error **errp)
+{
+ if (!migrate_postcopy_preempt()) {
+ return 0;
+ }
+
+ if (!migrate_multi_channels_is_allowed()) {
+ error_setg(errp, "Postcopy preempt is not supported as current "
+ "migration stream does not support multi-channels.");
+ return -1;
+ }
+
+ /* Kick an async task to connect */
+ socket_send_channel_create(postcopy_preempt_send_channel_new, s);
+
+ return 0;
+}
+
+static void postcopy_pause_ram_fast_load(MigrationIncomingState *mis)
+{
+ trace_postcopy_pause_fast_load();
+ qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
+ qemu_sem_wait(&mis->postcopy_pause_sem_fast_load);
+ qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
+ trace_postcopy_pause_fast_load_continued();
+}
+
+void *postcopy_preempt_thread(void *opaque)
+{
+ MigrationIncomingState *mis = opaque;
+ int ret;
+
+ trace_postcopy_preempt_thread_entry();
+
+ rcu_register_thread();
+
+ qemu_sem_post(&mis->thread_sync_sem);
+
+ /* Sending RAM_SAVE_FLAG_EOS to terminate this thread */
+ qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
+ while (1) {
+ ret = ram_load_postcopy(mis->postcopy_qemufile_dst,
+ RAM_CHANNEL_POSTCOPY);
+ /* If error happened, go into recovery routine */
+ if (ret) {
+ postcopy_pause_ram_fast_load(mis);
+ } else {
+ /* We're done */
+ break;
+ }
+ }
+ qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
+
+ rcu_unregister_thread();
+
+ trace_postcopy_preempt_thread_exit();
+
+ return NULL;
+}
diff --git a/migration/postcopy-ram.h b/migration/postcopy-ram.h
index 07684c0e1d..6147bf7d1d 100644
--- a/migration/postcopy-ram.h
+++ b/migration/postcopy-ram.h
@@ -183,4 +183,15 @@ int postcopy_wake_shared(struct PostCopyFD *pcfd, uint64_t client_addr,
int postcopy_request_shared_page(struct PostCopyFD *pcfd, RAMBlock *rb,
uint64_t client_addr, uint64_t offset);
+/* Hard-code channels for now for postcopy preemption */
+enum PostcopyChannels {
+ RAM_CHANNEL_PRECOPY = 0,
+ RAM_CHANNEL_POSTCOPY = 1,
+ RAM_CHANNEL_MAX,
+};
+
+bool postcopy_preempt_new_channel(MigrationIncomingState *mis, QEMUFile *file);
+int postcopy_preempt_setup(MigrationState *s, Error **errp);
+int postcopy_preempt_wait_channel(MigrationState *s);
+
#endif
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index 1e80d496b7..4f400c2e52 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -161,6 +161,33 @@ int qemu_file_get_error_obj(QEMUFile *f, Error **errp)
}
/*
+ * Get last error for either stream f1 or f2 with optional Error*.
+ * The error returned (non-zero) can be either from f1 or f2.
+ *
+ * If either QEMUFile is NULL, the check on that file is skipped.
+ *
+ * When neither file has an error, zero is returned.
+ */
+int qemu_file_get_error_obj_any(QEMUFile *f1, QEMUFile *f2, Error **errp)
+{
+ int ret = 0;
+
+ if (f1) {
+ ret = qemu_file_get_error_obj(f1, errp);
+ /* If there's already error detected, return */
+ if (ret) {
+ return ret;
+ }
+ }
+
+ if (f2) {
+ ret = qemu_file_get_error_obj(f2, errp);
+ }
+
+ return ret;
+}
+
+/*
* Set the last error for stream f with optional Error*
*/
void qemu_file_set_error_obj(QEMUFile *f, int ret, Error *err)
@@ -384,10 +411,8 @@ static ssize_t qemu_fill_buffer(QEMUFile *f)
f->total_transferred += len;
} else if (len == 0) {
qemu_file_set_error_obj(f, -EIO, local_error);
- } else if (len != -EAGAIN) {
- qemu_file_set_error_obj(f, len, local_error);
} else {
- error_free(local_error);
+ qemu_file_set_error_obj(f, len, local_error);
}
return len;
diff --git a/migration/qemu-file.h b/migration/qemu-file.h
index 96e72d8bd8..fa13d04d78 100644
--- a/migration/qemu-file.h
+++ b/migration/qemu-file.h
@@ -141,6 +141,7 @@ void qemu_file_acct_rate_limit(QEMUFile *f, int64_t len);
void qemu_file_set_rate_limit(QEMUFile *f, int64_t new_rate);
int64_t qemu_file_get_rate_limit(QEMUFile *f);
int qemu_file_get_error_obj(QEMUFile *f, Error **errp);
+int qemu_file_get_error_obj_any(QEMUFile *f1, QEMUFile *f2, Error **errp);
void qemu_file_set_error_obj(QEMUFile *f, int ret, Error *err);
void qemu_file_set_error(QEMUFile *f, int ret);
int qemu_file_shutdown(QEMUFile *f);
diff --git a/migration/ram.c b/migration/ram.c
index 01f9cc1d72..b94669ba5d 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -296,6 +296,20 @@ struct RAMSrcPageRequest {
QSIMPLEQ_ENTRY(RAMSrcPageRequest) next_req;
};
+typedef struct {
+ /*
+ * Cached ramblock/offset values if preempted. They're only meaningful if
+ * preempted==true below.
+ */
+ RAMBlock *ram_block;
+ unsigned long ram_page;
+ /*
+ * Whether a postcopy preemption just happened. Will be reset after
+ * precopy recovered to background migration.
+ */
+ bool preempted;
+} PostcopyPreemptState;
+
/* State of RAM for migration */
struct RAMState {
/* QEMUFile used for this migration */
@@ -350,6 +364,14 @@ struct RAMState {
/* Queue of outstanding page requests from the destination */
QemuMutex src_page_req_mutex;
QSIMPLEQ_HEAD(, RAMSrcPageRequest) src_page_requests;
+
+    /* Postcopy preemption information */
+ PostcopyPreemptState postcopy_preempt_state;
+ /*
+ * Current channel we're using on src VM. Only valid if postcopy-preempt
+ * is enabled.
+ */
+ unsigned int postcopy_channel;
};
typedef struct RAMState RAMState;
@@ -357,6 +379,11 @@ static RAMState *ram_state;
static NotifierWithReturnList precopy_notifier_list;
+static void postcopy_preempt_reset(RAMState *rs)
+{
+ memset(&rs->postcopy_preempt_state, 0, sizeof(PostcopyPreemptState));
+}
+
/* Whether postcopy has queued requests? */
static bool postcopy_has_request(RAMState *rs)
{
@@ -407,6 +434,11 @@ static void ram_transferred_add(uint64_t bytes)
ram_counters.transferred += bytes;
}
+void dirty_sync_missed_zero_copy(void)
+{
+ ram_counters.dirty_sync_missed_zero_copy++;
+}
+
/* used by the search for pages to send */
struct PageSearchStatus {
/* Current block being searched */
@@ -415,8 +447,28 @@ struct PageSearchStatus {
unsigned long page;
/* Set once we wrap around */
bool complete_round;
- /* Whether current page is explicitly requested by postcopy */
+ /*
+ * [POSTCOPY-ONLY] Whether current page is explicitly requested by
+ * postcopy. When set, the request is "urgent" because the dest QEMU
+ * threads are waiting for us.
+ */
bool postcopy_requested;
+ /*
+     * [POSTCOPY-ONLY] The target channel to use to send the current page.
+     *
+     * Note: this may _not_ match the value of postcopy_requested above.
+     * Imagine the case where the postcopy request is exactly the page
+     * that we're in the middle of sending during precopy.  In that case
+     * we'll have postcopy_requested set to true but the target channel
+     * will be the precopy channel (so that we don't get a split brain on
+     * that specific page, since the precopy channel already contains
+     * part of that page's data).
+     *
+     * Apart from that specific case, postcopy_target_channel should
+     * always track postcopy_requested, because by default we send
+     * postcopy pages via the postcopy preempt channel.
+ */
+ bool postcopy_target_channel;
};
typedef struct PageSearchStatus PageSearchStatus;
@@ -468,6 +520,9 @@ static QemuCond decomp_done_cond;
static bool do_compress_ram_page(QEMUFile *f, z_stream *stream, RAMBlock *block,
ram_addr_t offset, uint8_t *source_buf);
+static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
+ bool postcopy_requested);
+
static void *do_data_compress(void *opaque)
{
CompressParam *param = opaque;
@@ -1489,8 +1544,12 @@ retry:
*/
static bool find_dirty_block(RAMState *rs, PageSearchStatus *pss, bool *again)
{
- /* This is not a postcopy requested page */
+ /*
+     * This is not a postcopy requested page; mark it "not urgent" and use
+     * the precopy channel to send it.
+ */
pss->postcopy_requested = false;
+ pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
pss->page = migration_bitmap_find_dirty(rs, pss->block, pss->page);
if (pss->complete_round && pss->block == rs->last_seen_block &&
@@ -1947,6 +2006,55 @@ void ram_write_tracking_stop(void)
}
#endif /* defined(__linux__) */
+/*
+ * Check whether two addresses/offsets within the ramblock fall onto the same
+ * host huge page. Returns true if so, false otherwise.
+ */
+static bool offset_on_same_huge_page(RAMBlock *rb, uint64_t addr1,
+ uint64_t addr2)
+{
+ size_t page_size = qemu_ram_pagesize(rb);
+
+ addr1 = ROUND_DOWN(addr1, page_size);
+ addr2 = ROUND_DOWN(addr2, page_size);
+
+ return addr1 == addr2;
+}
+
+/*
+ * Does a previously preempted precopy huge page contain the currently
+ * requested page?  Returns true if so, false otherwise.
+ *
+ * This should happen very rarely, because it means the page we were sending
+ * during background migration is exactly the page some vcpu faulted on on
+ * the destination node.  When it happens, we don't need to do much beyond
+ * dropping the request: we know it will be serviced right after we restore
+ * the precopy stream.  It slightly affects the order in which postcopy
+ * requests are serviced (as if we moved the current request to the end of
+ * the queue), but that shouldn't be a big deal.  The most important thing
+ * is that we must _never_ send a partially sent huge page on the POSTCOPY
+ * channel again, otherwise that huge page gets "split brain" across two
+ * channels (PRECOPY, POSTCOPY).
+ */
+static bool postcopy_preempted_contains(RAMState *rs, RAMBlock *block,
+ ram_addr_t offset)
+{
+ PostcopyPreemptState *state = &rs->postcopy_preempt_state;
+
+ /* No preemption at all? */
+ if (!state->preempted) {
+ return false;
+ }
+
+ /* Not even the same ramblock? */
+ if (state->ram_block != block) {
+ return false;
+ }
+
+ return offset_on_same_huge_page(block, offset,
+ state->ram_page << TARGET_PAGE_BITS);
+}
+
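
A standalone illustration of the rounding check above, assuming a 2 MiB
host huge page (the ROUND_DOWN macro mirrors QEMU's for power-of-two
alignments; the addresses are made up):

    #include <assert.h>

    #define HPAGE_SIZE (2UL << 20)              /* assume 2 MiB huge pages */
    #define ROUND_DOWN(x, a) ((x) & ~((a) - 1)) /* power-of-two align only */

    int main(void)
    {
        /* 0x200123 and 0x3ff000 both round down to 0x200000: same page */
        assert(ROUND_DOWN(0x200123UL, HPAGE_SIZE) ==
               ROUND_DOWN(0x3ff000UL, HPAGE_SIZE));
        /* 0x400000 starts the next huge page: different page */
        assert(ROUND_DOWN(0x200123UL, HPAGE_SIZE) !=
               ROUND_DOWN(0x400000UL, HPAGE_SIZE));
        return 0;
    }
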
/**
* get_queued_page: unqueue a page from the postcopy requests
*
@@ -1964,7 +2072,20 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
block = unqueue_page(rs, &offset);
- if (!block) {
+ if (block) {
+ /* See comment above postcopy_preempted_contains() */
+ if (postcopy_preempted_contains(rs, block, offset)) {
+ trace_postcopy_preempt_hit(block->idstr, offset);
+ /*
+ * If what we preempted previously was exactly what we're
+ * requesting right now, restore the preempted precopy
+ * immediately, boosting its priority as it's requested by
+ * postcopy.
+ */
+ postcopy_preempt_restore(rs, pss, true);
+ return true;
+ }
+ } else {
/*
* Poll write faults too if background snapshot is enabled; that's
* when we have vcpus got blocked by the write protected pages.
@@ -1986,7 +2107,9 @@ static bool get_queued_page(RAMState *rs, PageSearchStatus *pss)
* really rare.
*/
pss->complete_round = false;
+        /* Mark it an urgent request, and use the POSTCOPY channel */
pss->postcopy_requested = true;
+ pss->postcopy_target_channel = RAM_CHANNEL_POSTCOPY;
}
return !!block;
@@ -2180,6 +2303,129 @@ static int ram_save_target_page(RAMState *rs, PageSearchStatus *pss)
return ram_save_page(rs, pss);
}
+static bool postcopy_needs_preempt(RAMState *rs, PageSearchStatus *pss)
+{
+ MigrationState *ms = migrate_get_current();
+
+    /* Postcopy preemption not enabled?  Then never preempt. */
+ if (!migrate_postcopy_preempt()) {
+ return false;
+ }
+
+    /* If the user explicitly disabled breaking up huge pages, skip */
+ if (!ms->postcopy_preempt_break_huge) {
+ return false;
+ }
+
+    /* If the ramblock we're sending only has small pages, never bother. */
+ if (qemu_ram_pagesize(pss->block) == TARGET_PAGE_SIZE) {
+ return false;
+ }
+
+ /* Not in postcopy at all? */
+ if (!migration_in_postcopy()) {
+ return false;
+ }
+
+ /*
+ * If we're already handling a postcopy request, don't preempt as this page
+ * has got the same high priority.
+ */
+ if (pss->postcopy_requested) {
+ return false;
+ }
+
+    /* If there are pending postcopy requests, go preempt */
+ return postcopy_has_request(rs);
+}
+
+/* Preempt precopy: cache the current page so precopy can resume later */
+static void postcopy_do_preempt(RAMState *rs, PageSearchStatus *pss)
+{
+ PostcopyPreemptState *p_state = &rs->postcopy_preempt_state;
+
+ trace_postcopy_preempt_triggered(pss->block->idstr, pss->page);
+
+ /*
+     * Time to preempt precopy. Cache the current PSS in the preempt state so
+     * that we can recover to it after handling the postcopy pages. We need to
+     * do so because the dest VM will have part of the precopy huge page kept
+     * in its tmp huge page cache; better to finish it when we can.
+ */
+ p_state->ram_block = pss->block;
+ p_state->ram_page = pss->page;
+ p_state->preempted = true;
+}
+
+/* Whether we're preempted by a postcopy request during sending a huge page */
+static bool postcopy_preempt_triggered(RAMState *rs)
+{
+ return rs->postcopy_preempt_state.preempted;
+}
+
+static void postcopy_preempt_restore(RAMState *rs, PageSearchStatus *pss,
+ bool postcopy_requested)
+{
+ PostcopyPreemptState *state = &rs->postcopy_preempt_state;
+
+ assert(state->preempted);
+
+ pss->block = state->ram_block;
+ pss->page = state->ram_page;
+
+    /* Record whether this restore is for a postcopy request */
+ pss->postcopy_requested = postcopy_requested;
+ /*
+ * When restoring a preempted page, the old data resides in PRECOPY
+ * slow channel, even if postcopy_requested is set. So always use
+ * PRECOPY channel here.
+ */
+ pss->postcopy_target_channel = RAM_CHANNEL_PRECOPY;
+
+ trace_postcopy_preempt_restored(pss->block->idstr, pss->page);
+
+ /* Reset preempt state, most importantly, set preempted==false */
+ postcopy_preempt_reset(rs);
+}
+
+static void postcopy_preempt_choose_channel(RAMState *rs, PageSearchStatus *pss)
+{
+ MigrationState *s = migrate_get_current();
+ unsigned int channel = pss->postcopy_target_channel;
+ QEMUFile *next;
+
+ if (channel != rs->postcopy_channel) {
+ if (channel == RAM_CHANNEL_PRECOPY) {
+ next = s->to_dst_file;
+ } else {
+ next = s->postcopy_qemufile_src;
+ }
+ /* Update and cache the current channel */
+ rs->f = next;
+ rs->postcopy_channel = channel;
+
+ /*
+         * If the channel switched, reset last_sent_block, since the old
+         * sent block may not be on the same channel.
+ */
+ rs->last_sent_block = NULL;
+
+ trace_postcopy_preempt_switch_channel(channel);
+ }
+
+ trace_postcopy_preempt_send_host_page(pss->block->idstr, pss->page);
+}
+
+/* Code elsewhere relies on rs->f pointing to the default channel; reset it */
+static void postcopy_preempt_reset_channel(RAMState *rs)
+{
+ if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+ rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
+ rs->f = migrate_get_current()->to_dst_file;
+ trace_postcopy_preempt_reset_channel();
+ }
+}
+
/**
* ram_save_host_page: save a whole host page
*
@@ -2211,7 +2457,16 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
return 0;
}
+ if (migrate_postcopy_preempt() && migration_in_postcopy()) {
+ postcopy_preempt_choose_channel(rs, pss);
+ }
+
do {
+ if (postcopy_needs_preempt(rs, pss)) {
+ postcopy_do_preempt(rs, pss);
+ break;
+ }
+
/* Check the pages is dirty and if it is send it */
if (migration_bitmap_clear_dirty(rs, pss->block, pss->page)) {
tmppages = ram_save_target_page(rs, pss);
@@ -2235,6 +2490,19 @@ static int ram_save_host_page(RAMState *rs, PageSearchStatus *pss)
/* The offset we leave with is the min boundary of host page and block */
pss->page = MIN(pss->page, hostpage_boundary);
+ /*
+     * In postcopy preempt mode, flush the data as soon as possible for
+     * postcopy requests: we've just sent a whole huge page, so the dst
+     * node should already have enough resources to atomically fill in
+     * the current missing page.
+     *
+     * More importantly, when using a separate postcopy channel, we must
+     * flush explicitly, or data won't go out until the buffer is full.
+ */
+ if (migrate_postcopy_preempt() && pss->postcopy_requested) {
+ qemu_fflush(rs->f);
+ }
+
res = ram_save_release_protection(rs, pss, start_page);
return (res < 0 ? res : pages);
}
@@ -2276,8 +2544,17 @@ static int ram_find_and_save_block(RAMState *rs)
found = get_queued_page(rs, &pss);
if (!found) {
- /* priority queue empty, so just search for something dirty */
- found = find_dirty_block(rs, &pss, &again);
+ /*
+ * Recover previous precopy ramblock/offset if postcopy has
+ * preempted precopy. Otherwise find the next dirty bit.
+ */
+ if (postcopy_preempt_triggered(rs)) {
+ postcopy_preempt_restore(rs, &pss, false);
+ found = true;
+ } else {
+ /* priority queue empty, so just search for something dirty */
+ found = find_dirty_block(rs, &pss, &again);
+ }
}
if (found) {
@@ -2405,6 +2682,8 @@ static void ram_state_reset(RAMState *rs)
rs->last_page = 0;
rs->last_version = ram_list.version;
rs->xbzrle_enabled = false;
+ postcopy_preempt_reset(rs);
+ rs->postcopy_channel = RAM_CHANNEL_PRECOPY;
}
#define MAX_WAIT 50 /* ms, half buffered_file limit */
@@ -3048,6 +3327,8 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
}
qemu_mutex_unlock(&rs->bitmap_mutex);
+ postcopy_preempt_reset_channel(rs);
+
/*
* Must occur before EOS (or any QEMUFile operation)
* because of RDMA protocol.
@@ -3125,6 +3406,8 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
return ret;
}
+ postcopy_preempt_reset_channel(rs);
+
ret = multifd_send_sync_main(rs->f);
if (ret < 0) {
return ret;
@@ -3209,11 +3492,13 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
* @mis: the migration incoming state pointer
* @f: QEMUFile where to read the data from
* @flags: Page flags (mostly to see if it's a continuation of previous block)
+ * @channel: the channel we're using
*/
static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
- QEMUFile *f, int flags)
+ QEMUFile *f, int flags,
+ int channel)
{
- RAMBlock *block = mis->last_recv_block;
+ RAMBlock *block = mis->last_recv_block[channel];
char id[256];
uint8_t len;
@@ -3240,7 +3525,7 @@ static inline RAMBlock *ram_block_from_stream(MigrationIncomingState *mis,
return NULL;
}
- mis->last_recv_block = block;
+ mis->last_recv_block[channel] = block;
return block;
}
@@ -3659,15 +3944,15 @@ int ram_postcopy_incoming_init(MigrationIncomingState *mis)
* rcu_read_lock is taken prior to this being called.
*
* @f: QEMUFile where to send the data
+ * @channel: the channel to use for loading
*/
-int ram_load_postcopy(QEMUFile *f)
+int ram_load_postcopy(QEMUFile *f, int channel)
{
int flags = 0, ret = 0;
bool place_needed = false;
bool matches_target_page_size = false;
MigrationIncomingState *mis = migration_incoming_get_current();
- /* Currently we only use channel 0. TODO: use all the channels */
- PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[0];
+ PostcopyTmpPage *tmp_page = &mis->postcopy_tmp_pages[channel];
while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
ram_addr_t addr;
@@ -3691,10 +3976,10 @@ int ram_load_postcopy(QEMUFile *f)
flags = addr & ~TARGET_PAGE_MASK;
addr &= TARGET_PAGE_MASK;
- trace_ram_load_postcopy_loop((uint64_t)addr, flags);
+ trace_ram_load_postcopy_loop(channel, (uint64_t)addr, flags);
if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
RAM_SAVE_FLAG_COMPRESS_PAGE)) {
- block = ram_block_from_stream(mis, f, flags);
+ block = ram_block_from_stream(mis, f, flags, channel);
if (!block) {
ret = -EINVAL;
break;
@@ -3732,10 +4017,10 @@ int ram_load_postcopy(QEMUFile *f)
} else if (tmp_page->host_addr !=
host_page_from_ram_block_offset(block, addr)) {
/* not the 1st TP within the HP */
- error_report("Non-same host page detected. "
+ error_report("Non-same host page detected on channel %d: "
"Target host page %p, received host page %p "
"(rb %s offset 0x"RAM_ADDR_FMT" target_pages %d)",
- tmp_page->host_addr,
+ channel, tmp_page->host_addr,
host_page_from_ram_block_offset(block, addr),
block->idstr, addr, tmp_page->target_pages);
ret = -EINVAL;
@@ -3945,7 +4230,8 @@ static int ram_load_precopy(QEMUFile *f)
if (flags & (RAM_SAVE_FLAG_ZERO | RAM_SAVE_FLAG_PAGE |
RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
- RAMBlock *block = ram_block_from_stream(mis, f, flags);
+ RAMBlock *block = ram_block_from_stream(mis, f, flags,
+ RAM_CHANNEL_PRECOPY);
host = host_from_ram_block_offset(block, addr);
/*
@@ -4122,7 +4408,12 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
*/
WITH_RCU_READ_LOCK_GUARD() {
if (postcopy_running) {
- ret = ram_load_postcopy(f);
+ /*
+ * Note! Here RAM_CHANNEL_PRECOPY is the precopy channel of
+             * postcopy migration; we have another RAM_CHANNEL_POSTCOPY to
+ * service fast page faults.
+ */
+ ret = ram_load_postcopy(f, RAM_CHANNEL_PRECOPY);
} else {
ret = ram_load_precopy(f);
}
@@ -4284,6 +4575,12 @@ static int ram_resume_prepare(MigrationState *s, void *opaque)
return 0;
}
+void postcopy_preempt_shutdown_file(MigrationState *s)
+{
+ qemu_put_be64(s->postcopy_qemufile_src, RAM_SAVE_FLAG_EOS);
+ qemu_fflush(s->postcopy_qemufile_src);
+}
+
static SaveVMHandlers savevm_ram_handlers = {
.save_setup = ram_save_setup,
.save_live_iterate = ram_save_iterate,
diff --git a/migration/ram.h b/migration/ram.h
index ded0a3a086..c7af65ac74 100644
--- a/migration/ram.h
+++ b/migration/ram.h
@@ -61,7 +61,7 @@ void ram_postcopy_send_discard_bitmap(MigrationState *ms);
/* For incoming postcopy discard */
int ram_discard_range(const char *block_name, uint64_t start, size_t length);
int ram_postcopy_incoming_init(MigrationIncomingState *mis);
-int ram_load_postcopy(QEMUFile *f);
+int ram_load_postcopy(QEMUFile *f, int channel);
void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
@@ -73,6 +73,8 @@ int64_t ramblock_recv_bitmap_send(QEMUFile *file,
const char *block_name);
int ram_dirty_bitmap_reload(MigrationState *s, RAMBlock *rb);
bool ramblock_page_is_discarded(RAMBlock *rb, ram_addr_t start);
+void postcopy_preempt_shutdown_file(MigrationState *s);
+void *postcopy_preempt_thread(void *opaque);
/* ram cache */
int colo_init_ram_cache(void);
@@ -87,4 +89,6 @@ void ram_write_tracking_prepare(void);
int ram_write_tracking_start(void);
void ram_write_tracking_stop(void);
+void dirty_sync_missed_zero_copy(void);
+
#endif
diff --git a/migration/savevm.c b/migration/savevm.c
index e8a1b96fcd..48e85c052c 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -2117,6 +2117,13 @@ static int loadvm_postcopy_handle_resume(MigrationIncomingState *mis)
*/
qemu_sem_post(&mis->postcopy_pause_sem_fault);
+ if (migrate_postcopy_preempt()) {
+        /* The channel should already be set up again; make sure of it */
+ assert(mis->postcopy_qemufile_dst);
+ /* Kick the fast ram load thread too */
+ qemu_sem_post(&mis->postcopy_pause_sem_fast_load);
+ }
+
return 0;
}
@@ -2540,16 +2547,6 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis)
{
int i;
- /*
- * If network is interrupted, any temp page we received will be useless
- * because we didn't mark them as "received" in receivedmap. After a
- * proper recovery later (which will sync src dirty bitmap with receivedmap
- * on dest) these cached small pages will be resent again.
- */
- for (i = 0; i < mis->postcopy_channels; i++) {
- postcopy_temp_page_reset(&mis->postcopy_tmp_pages[i]);
- }
-
trace_postcopy_pause_incoming();
assert(migrate_postcopy_ram());
@@ -2572,12 +2569,37 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis)
mis->to_src_file = NULL;
qemu_mutex_unlock(&mis->rp_mutex);
+ /*
+     * NOTE: this must happen before resetting the PostcopyTmpPages below,
+     * otherwise it's racy to reset those fields while the fast load thread
+     * may still be accessing them in parallel.
+ */
+ if (mis->postcopy_qemufile_dst) {
+ qemu_file_shutdown(mis->postcopy_qemufile_dst);
+ /* Take the mutex to make sure the fast ram load thread halted */
+ qemu_mutex_lock(&mis->postcopy_prio_thread_mutex);
+ migration_ioc_unregister_yank_from_file(mis->postcopy_qemufile_dst);
+ qemu_fclose(mis->postcopy_qemufile_dst);
+ mis->postcopy_qemufile_dst = NULL;
+ qemu_mutex_unlock(&mis->postcopy_prio_thread_mutex);
+ }
+
migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
MIGRATION_STATUS_POSTCOPY_PAUSED);
/* Notify the fault thread for the invalidated file handle */
postcopy_fault_thread_notify(mis);
+ /*
+ * If network is interrupted, any temp page we received will be useless
+ * because we didn't mark them as "received" in receivedmap. After a
+ * proper recovery later (which will sync src dirty bitmap with receivedmap
+ * on dest) these cached small pages will be resent again.
+ */
+ for (i = 0; i < mis->postcopy_channels; i++) {
+ postcopy_temp_page_reset(&mis->postcopy_tmp_pages[i]);
+ }
+
error_report("Detected IO failure for postcopy. "
"Migration paused.");
@@ -2599,8 +2621,8 @@ retry:
while (true) {
section_type = qemu_get_byte(f);
- if (qemu_file_get_error(f)) {
- ret = qemu_file_get_error(f);
+ ret = qemu_file_get_error_obj_any(f, mis->postcopy_qemufile_dst, NULL);
+ if (ret) {
break;
}
diff --git a/migration/socket.c b/migration/socket.c
index 4fd5e85f50..e6fdf3c5e1 100644
--- a/migration/socket.c
+++ b/migration/socket.c
@@ -26,7 +26,7 @@
#include "io/channel-socket.h"
#include "io/net-listener.h"
#include "trace.h"
-
+#include "postcopy-ram.h"
struct SocketOutgoingArgs {
SocketAddress *saddr;
@@ -39,6 +39,24 @@ void socket_send_channel_create(QIOTaskFunc f, void *data)
f, data, NULL, NULL);
}
+QIOChannel *socket_send_channel_create_sync(Error **errp)
+{
+ QIOChannelSocket *sioc = qio_channel_socket_new();
+
+ if (!outgoing_args.saddr) {
+ object_unref(OBJECT(sioc));
+ error_setg(errp, "Initial sock address not set!");
+ return NULL;
+ }
+
+ if (qio_channel_socket_connect_sync(sioc, outgoing_args.saddr, errp) < 0) {
+ object_unref(OBJECT(sioc));
+ return NULL;
+ }
+
+ return QIO_CHANNEL(sioc);
+}
+
int socket_send_channel_destroy(QIOChannel *send)
{
/* Remove channel */
@@ -166,6 +184,8 @@ socket_start_incoming_migration_internal(SocketAddress *saddr,
if (migrate_use_multifd()) {
num = migrate_multifd_channels();
+ } else if (migrate_postcopy_preempt()) {
+ num = RAM_CHANNEL_MAX;
}
if (qio_net_listener_open_sync(listener, saddr, num, errp) < 0) {
diff --git a/migration/socket.h b/migration/socket.h
index 891dbccceb..dc54df4e6c 100644
--- a/migration/socket.h
+++ b/migration/socket.h
@@ -21,6 +21,7 @@
#include "io/task.h"
void socket_send_channel_create(QIOTaskFunc f, void *data);
+QIOChannel *socket_send_channel_create_sync(Error **errp);
int socket_send_channel_destroy(QIOChannel *send);
void socket_start_incoming_migration(const char *str, Error **errp);
diff --git a/migration/tls.c b/migration/tls.c
index 32c384a8b6..73e8c9d3c2 100644
--- a/migration/tls.c
+++ b/migration/tls.c
@@ -166,3 +166,12 @@ void migration_tls_channel_connect(MigrationState *s,
NULL,
NULL);
}
+
+bool migrate_channel_requires_tls_upgrade(QIOChannel *ioc)
+{
+ if (!migrate_use_tls()) {
+ return false;
+ }
+
+ return !object_dynamic_cast(OBJECT(ioc), TYPE_QIO_CHANNEL_TLS);
+}
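
A hedged sketch of the intended call pattern when bringing up a new
outgoing channel (the wrapper itself is illustrative; the two
migration_*_channel_connect() functions are existing QEMU APIs from
migration/channel.h and migration/tls.h):

    static void channel_connect_maybe_tls(MigrationState *s, QIOChannel *ioc,
                                          const char *hostname, Error **errp)
    {
        if (migrate_channel_requires_tls_upgrade(ioc)) {
            /* Plain channel while TLS is requested: run the handshake first */
            migration_tls_channel_connect(s, ioc, hostname, errp);
        } else {
            /* Already a TLS channel, or TLS not in use: hand it over as-is */
            migration_channel_connect(s, ioc, hostname, NULL);
        }
    }
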
diff --git a/migration/tls.h b/migration/tls.h
index de4fe2cafd..98e23c9b0e 100644
--- a/migration/tls.h
+++ b/migration/tls.h
@@ -37,4 +37,8 @@ void migration_tls_channel_connect(MigrationState *s,
QIOChannel *ioc,
const char *hostname,
Error **errp);
+
+/* Does the QIO channel require a further TLS handshake? */
+bool migrate_channel_requires_tls_upgrade(QIOChannel *ioc);
+
#endif
diff --git a/migration/trace-events b/migration/trace-events
index 1aec580e92..a34afe7b85 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -91,7 +91,7 @@ migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, unsigned
migration_throttle(void) ""
ram_discard_range(const char *rbname, uint64_t start, size_t len) "%s: start: %" PRIx64 " %zx"
ram_load_loop(const char *rbname, uint64_t addr, int flags, void *host) "%s: addr: 0x%" PRIx64 " flags: 0x%x host: %p"
-ram_load_postcopy_loop(uint64_t addr, int flags) "@%" PRIx64 " %x"
+ram_load_postcopy_loop(int channel, uint64_t addr, int flags) "chan=%d addr=0x%" PRIx64 " flags=0x%x"
ram_postcopy_send_discard_bitmap(void) ""
ram_save_page(const char *rbname, uint64_t offset, void *host) "%s: offset: 0x%" PRIx64 " host: %p"
ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: 0x%zx len: 0x%zx"
@@ -111,6 +111,12 @@ ram_load_complete(int ret, uint64_t seq_iter) "exit_code %d seq iteration %" PRI
ram_write_tracking_ramblock_start(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
ram_write_tracking_ramblock_stop(const char *block_id, size_t page_size, void *addr, size_t length) "%s: page_size: %zu addr: %p length: %zu"
unqueue_page(char *block, uint64_t offset, bool dirty) "ramblock '%s' offset 0x%"PRIx64" dirty %d"
+postcopy_preempt_triggered(char *str, unsigned long page) "during sending ramblock %s offset 0x%lx"
+postcopy_preempt_restored(char *str, unsigned long page) "ramblock %s offset 0x%lx"
+postcopy_preempt_hit(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
+postcopy_preempt_send_host_page(char *str, uint64_t offset) "ramblock %s offset 0x%"PRIx64
+postcopy_preempt_switch_channel(int channel) "%d"
+postcopy_preempt_reset_channel(void) ""
# multifd.c
multifd_new_send_channel_async(uint8_t id) "channel %u"
@@ -176,6 +182,7 @@ migration_thread_low_pending(uint64_t pending) "%" PRIu64
migrate_transferred(uint64_t tranferred, uint64_t time_spent, uint64_t bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %" PRIu64 " max_size %" PRId64
process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
process_incoming_migration_co_postcopy_end_main(void) ""
+postcopy_preempt_enabled(bool value) "%d"
# channel.c
migration_set_incoming_channel(void *ioc, const char *ioctype) "ioc=%p ioctype=%s"
@@ -263,6 +270,8 @@ mark_postcopy_blocktime_begin(uint64_t addr, void *dd, uint32_t time, int cpu, i
mark_postcopy_blocktime_end(uint64_t addr, void *dd, uint32_t time, int affected_cpu) "addr: 0x%" PRIx64 ", dd: %p, time: %u, affected_cpu: %d"
postcopy_pause_fault_thread(void) ""
postcopy_pause_fault_thread_continued(void) ""
+postcopy_pause_fast_load(void) ""
+postcopy_pause_fast_load_continued(void) ""
postcopy_ram_fault_thread_entry(void) ""
postcopy_ram_fault_thread_exit(void) ""
postcopy_ram_fault_thread_fds_core(int baseufd, int quitfd) "ufd: %d quitfd: %d"
@@ -278,6 +287,10 @@ postcopy_request_shared_page(const char *sharer, const char *rb, uint64_t rb_off
postcopy_request_shared_page_present(const char *sharer, const char *rb, uint64_t rb_offset) "%s already %s offset 0x%"PRIx64
postcopy_wake_shared(uint64_t client_addr, const char *rb) "at 0x%"PRIx64" in %s"
postcopy_page_req_del(void *addr, int count) "resolved page req %p total %d"
+postcopy_preempt_tls_handshake(void) ""
+postcopy_preempt_new_channel(void) ""
+postcopy_preempt_thread_entry(void) ""
+postcopy_preempt_thread_exit(void) ""
get_mem_fault_cpu_index(int cpu, uint32_t pid) "cpu: %d, pid: %u"
diff --git a/monitor/hmp-cmds.c b/monitor/hmp-cmds.c
index ca98df0495..a6dc79e0d5 100644
--- a/monitor/hmp-cmds.c
+++ b/monitor/hmp-cmds.c
@@ -307,6 +307,11 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
monitor_printf(mon, "postcopy ram: %" PRIu64 " kbytes\n",
info->ram->postcopy_bytes >> 10);
}
+ if (info->ram->dirty_sync_missed_zero_copy) {
+ monitor_printf(mon,
+ "Zero-copy-send fallbacks happened: %" PRIu64 " times\n",
+ info->ram->dirty_sync_missed_zero_copy);
+ }
}
if (info->has_disk) {
diff --git a/qapi/migration.json b/qapi/migration.json
index 7102e474a6..81185d4311 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -55,6 +55,10 @@
# @postcopy-bytes: The number of bytes sent during the post-copy phase
# (since 7.0).
#
+# @dirty-sync-missed-zero-copy: Number of times dirty RAM synchronization could
+# not avoid copying dirty pages. This is between
+# 0 and @dirty-sync-count * @multifd-channels.
+# (since 7.1)
# Since: 0.14
##
{ 'struct': 'MigrationStats',
@@ -65,7 +69,8 @@
'postcopy-requests' : 'int', 'page-size' : 'int',
'multifd-bytes' : 'uint64', 'pages-per-second' : 'uint64',
'precopy-bytes' : 'uint64', 'downtime-bytes' : 'uint64',
- 'postcopy-bytes' : 'uint64' } }
+ 'postcopy-bytes' : 'uint64',
+ 'dirty-sync-missed-zero-copy' : 'uint64' } }
##
# @XBZRLECacheStats:
@@ -467,6 +472,11 @@
# Requires that QEMU be permitted to use locked memory
# for guest RAM pages.
# (since 7.1)
+# @postcopy-preempt: If enabled, the migration process will allow postcopy
+#                    requests to preempt the precopy stream, so postcopy
+#                    requests will be handled faster. This is a performance
+#                    feature and should not affect the correctness of postcopy
+#                    migration. (since 7.1)
#
# Features:
# @unstable: Members @x-colo and @x-ignore-shared are experimental.
@@ -482,7 +492,7 @@
'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
{ 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
'validate-uuid', 'background-snapshot',
- 'zero-copy-send'] }
+ 'zero-copy-send', 'postcopy-preempt'] }
##
# @MigrationCapabilityStatus:
@@ -1869,6 +1879,86 @@
{ 'command': 'query-dirty-rate', 'returns': 'DirtyRateInfo' }
##
+# @DirtyLimitInfo:
+#
+# Dirty page rate limit information of a virtual CPU.
+#
+# @cpu-index: index of a virtual CPU.
+#
+# @limit-rate: upper limit of dirty page rate (MB/s) for a virtual
+# CPU, 0 means unlimited.
+#
+# @current-rate: current dirty page rate (MB/s) for a virtual CPU.
+#
+# Since: 7.1
+#
+##
+{ 'struct': 'DirtyLimitInfo',
+ 'data': { 'cpu-index': 'int',
+ 'limit-rate': 'uint64',
+ 'current-rate': 'uint64' } }
+
+##
+# @set-vcpu-dirty-limit:
+#
+# Set the upper limit of dirty page rate for virtual CPUs.
+#
+# Requires KVM with accelerator property "dirty-ring-size" set.
+# A virtual CPU's dirty page rate is a measure of its memory load.
+# To observe dirty page rates, use @calc-dirty-rate.
+#
+# @cpu-index: index of a virtual CPU, default is all.
+#
+# @dirty-rate: upper limit of dirty page rate (MB/s) for virtual CPUs.
+#
+# Since: 7.1
+#
+# Example:
+#   {"execute": "set-vcpu-dirty-limit",
+# "arguments": { "dirty-rate": 200,
+# "cpu-index": 1 } }
+#
+##
+{ 'command': 'set-vcpu-dirty-limit',
+ 'data': { '*cpu-index': 'int',
+ 'dirty-rate': 'uint64' } }
+
+##
+# @cancel-vcpu-dirty-limit:
+#
+# Cancel the upper limit of dirty page rate for virtual CPUs.
+#
+# Cancel the dirty page limit for the vCPU which has been set with the
+# set-vcpu-dirty-limit command. Note that this command requires dirty
+# ring support, the same as "set-vcpu-dirty-limit".
+#
+# @cpu-index: index of a virtual CPU, default is all.
+#
+# Since: 7.1
+#
+# Example:
+#   {"execute": "cancel-vcpu-dirty-limit",
+# "arguments": { "cpu-index": 1 } }
+#
+##
+{ 'command': 'cancel-vcpu-dirty-limit',
+ 'data': { '*cpu-index': 'int'} }
+
+##
+# @query-vcpu-dirty-limit:
+#
+# Returns information about virtual CPU dirty page rate limits, if any.
+#
+# Since: 7.1
+#
+# Example:
+# {"execute": "query-vcpu-dirty-limit"}
+#
+##
+{ 'command': 'query-vcpu-dirty-limit',
+ 'returns': [ 'DirtyLimitInfo' ] }
+
+##
# @snapshot-save:
#
# Save a VM snapshot
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
new file mode 100644
index 0000000000..8d98cb7f2c
--- /dev/null
+++ b/softmmu/dirtylimit.c
@@ -0,0 +1,601 @@
+/*
+ * Dirty page rate limit implementation code
+ *
+ * Copyright (c) 2022 CHINA TELECOM CO.,LTD.
+ *
+ * Authors:
+ * Hyman Huang(黄勇) <huangy81@chinatelecom.cn>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "qapi/qapi-commands-migration.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/error.h"
+#include "sysemu/dirtyrate.h"
+#include "sysemu/dirtylimit.h"
+#include "monitor/hmp.h"
+#include "monitor/monitor.h"
+#include "exec/memory.h"
+#include "hw/boards.h"
+#include "sysemu/kvm.h"
+#include "trace.h"
+
+/*
+ * Dirtylimit stops adjusting the throttle once the dirty page rate
+ * error value drops below DIRTYLIMIT_TOLERANCE_RANGE
+ */
+#define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */
+/*
+ * Increase or decrease the vcpu sleep time linearly if the
+ * dirty page rate error percentage is over
+ * DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT.
+ * Otherwise, adjust the sleep time by a fixed amount.
+ */
+#define DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT 50
+/*
+ * Max vcpu sleep time percentage during a cycle
+ * composed of dirty-ring-full time plus sleep time.
+ */
+#define DIRTYLIMIT_THROTTLE_PCT_MAX 99
+
+struct {
+ VcpuStat stat;
+ bool running;
+ QemuThread thread;
+} *vcpu_dirty_rate_stat;
+
+typedef struct VcpuDirtyLimitState {
+ int cpu_index;
+ bool enabled;
+ /*
+     * Quota dirty page rate, in MB/s;
+     * zero if not enabled.
+ */
+ uint64_t quota;
+} VcpuDirtyLimitState;
+
+struct {
+ VcpuDirtyLimitState *states;
+    /* Max number of cpus configured by the user */
+ int max_cpus;
+    /* Number of vcpus under dirtylimit */
+ int limited_nvcpu;
+} *dirtylimit_state;
+
+/* protect dirtylimit_state */
+static QemuMutex dirtylimit_mutex;
+
+/* dirtylimit works only while dirtylimit_quit is false */
+static bool dirtylimit_quit;
+
+static void vcpu_dirty_rate_stat_collect(void)
+{
+ VcpuStat stat;
+ int i = 0;
+
+ /* calculate vcpu dirtyrate */
+ vcpu_calculate_dirtyrate(DIRTYLIMIT_CALC_TIME_MS,
+ &stat,
+ GLOBAL_DIRTY_LIMIT,
+ false);
+
+ for (i = 0; i < stat.nvcpu; i++) {
+ vcpu_dirty_rate_stat->stat.rates[i].id = i;
+ vcpu_dirty_rate_stat->stat.rates[i].dirty_rate =
+ stat.rates[i].dirty_rate;
+ }
+
+    g_free(stat.rates);
+}
+
+static void *vcpu_dirty_rate_stat_thread(void *opaque)
+{
+ rcu_register_thread();
+
+ /* start log sync */
+ global_dirty_log_change(GLOBAL_DIRTY_LIMIT, true);
+
+ while (qatomic_read(&vcpu_dirty_rate_stat->running)) {
+ vcpu_dirty_rate_stat_collect();
+ if (dirtylimit_in_service()) {
+ dirtylimit_process();
+ }
+ }
+
+ /* stop log sync */
+ global_dirty_log_change(GLOBAL_DIRTY_LIMIT, false);
+
+ rcu_unregister_thread();
+ return NULL;
+}
+
+int64_t vcpu_dirty_rate_get(int cpu_index)
+{
+ DirtyRateVcpu *rates = vcpu_dirty_rate_stat->stat.rates;
+ return qatomic_read_i64(&rates[cpu_index].dirty_rate);
+}
+
+void vcpu_dirty_rate_stat_start(void)
+{
+ if (qatomic_read(&vcpu_dirty_rate_stat->running)) {
+ return;
+ }
+
+ qatomic_set(&vcpu_dirty_rate_stat->running, 1);
+ qemu_thread_create(&vcpu_dirty_rate_stat->thread,
+ "dirtyrate-stat",
+ vcpu_dirty_rate_stat_thread,
+ NULL,
+ QEMU_THREAD_JOINABLE);
+}
+
+void vcpu_dirty_rate_stat_stop(void)
+{
+ qatomic_set(&vcpu_dirty_rate_stat->running, 0);
+ dirtylimit_state_unlock();
+ qemu_mutex_unlock_iothread();
+ qemu_thread_join(&vcpu_dirty_rate_stat->thread);
+ qemu_mutex_lock_iothread();
+ dirtylimit_state_lock();
+}
+
+void vcpu_dirty_rate_stat_initialize(void)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int max_cpus = ms->smp.max_cpus;
+
+ vcpu_dirty_rate_stat =
+ g_malloc0(sizeof(*vcpu_dirty_rate_stat));
+
+ vcpu_dirty_rate_stat->stat.nvcpu = max_cpus;
+ vcpu_dirty_rate_stat->stat.rates =
+ g_malloc0(sizeof(DirtyRateVcpu) * max_cpus);
+
+ vcpu_dirty_rate_stat->running = false;
+}
+
+void vcpu_dirty_rate_stat_finalize(void)
+{
+    g_free(vcpu_dirty_rate_stat->stat.rates);
+    vcpu_dirty_rate_stat->stat.rates = NULL;
+
+    g_free(vcpu_dirty_rate_stat);
+ vcpu_dirty_rate_stat = NULL;
+}
+
+void dirtylimit_state_lock(void)
+{
+ qemu_mutex_lock(&dirtylimit_mutex);
+}
+
+void dirtylimit_state_unlock(void)
+{
+ qemu_mutex_unlock(&dirtylimit_mutex);
+}
+
+static void
+__attribute__((__constructor__)) dirtylimit_mutex_init(void)
+{
+ qemu_mutex_init(&dirtylimit_mutex);
+}
+
+static inline VcpuDirtyLimitState *dirtylimit_vcpu_get_state(int cpu_index)
+{
+ return &dirtylimit_state->states[cpu_index];
+}
+
+void dirtylimit_state_initialize(void)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int max_cpus = ms->smp.max_cpus;
+ int i;
+
+ dirtylimit_state = g_malloc0(sizeof(*dirtylimit_state));
+
+ dirtylimit_state->states =
+ g_malloc0(sizeof(VcpuDirtyLimitState) * max_cpus);
+
+ for (i = 0; i < max_cpus; i++) {
+ dirtylimit_state->states[i].cpu_index = i;
+ }
+
+ dirtylimit_state->max_cpus = max_cpus;
+ trace_dirtylimit_state_initialize(max_cpus);
+}
+
+void dirtylimit_state_finalize(void)
+{
+    g_free(dirtylimit_state->states);
+    dirtylimit_state->states = NULL;
+
+    g_free(dirtylimit_state);
+ dirtylimit_state = NULL;
+
+ trace_dirtylimit_state_finalize();
+}
+
+bool dirtylimit_in_service(void)
+{
+ return !!dirtylimit_state;
+}
+
+bool dirtylimit_vcpu_index_valid(int cpu_index)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+
+    return cpu_index >= 0 &&
+           cpu_index < ms->smp.max_cpus;
+}
+
+static inline int64_t dirtylimit_dirty_ring_full_time(uint64_t dirtyrate)
+{
+ static uint64_t max_dirtyrate;
+ uint32_t dirty_ring_size = kvm_dirty_ring_size();
+    uint64_t dirty_ring_size_memory_MB =
+        dirty_ring_size * TARGET_PAGE_SIZE >> 20;
+
+ if (max_dirtyrate < dirtyrate) {
+ max_dirtyrate = dirtyrate;
+ }
+
+    return dirty_ring_size_memory_MB * 1000000 / max_dirtyrate;
+}
+
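
As a worked example of the formula above (all values illustrative: a
4096-entry dirty ring as set by -accel kvm,dirty-ring-size=4096, 4 KiB
target pages, and a peak observed dirty rate of 200 MB/s):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t ring_entries = 4096; /* -accel kvm,dirty-ring-size=4096 */
        uint64_t page_size = 4096;    /* 4 KiB target pages */
        uint64_t ring_mb = ring_entries * page_size >> 20; /* 16 MiB */
        uint64_t max_rate_mbs = 200;  /* highest dirty rate seen so far */

        /* Time for one vcpu to fill its dirty ring, in microseconds */
        printf("ring full in ~%" PRIu64 " us\n",
               ring_mb * 1000000 / max_rate_mbs); /* ~80000 us */
        return 0;
    }
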
+static inline bool dirtylimit_done(uint64_t quota,
+ uint64_t current)
+{
+ uint64_t min, max;
+
+ min = MIN(quota, current);
+ max = MAX(quota, current);
+
+    return (max - min) <= DIRTYLIMIT_TOLERANCE_RANGE;
+}
+
+static inline bool
+dirtylimit_need_linear_adjustment(uint64_t quota,
+ uint64_t current)
+{
+ uint64_t min, max;
+
+ min = MIN(quota, current);
+ max = MAX(quota, current);
+
+ return ((max - min) * 100 / max) > DIRTYLIMIT_LINEAR_ADJUSTMENT_PCT;
+}
+
+static void dirtylimit_set_throttle(CPUState *cpu,
+ uint64_t quota,
+ uint64_t current)
+{
+ int64_t ring_full_time_us = 0;
+ uint64_t sleep_pct = 0;
+ uint64_t throttle_us = 0;
+
+ if (current == 0) {
+ cpu->throttle_us_per_full = 0;
+ return;
+ }
+
+ ring_full_time_us = dirtylimit_dirty_ring_full_time(current);
+
+ if (dirtylimit_need_linear_adjustment(quota, current)) {
+ if (quota < current) {
+ sleep_pct = (current - quota) * 100 / current;
+ throttle_us =
+ ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+ cpu->throttle_us_per_full += throttle_us;
+ } else {
+ sleep_pct = (quota - current) * 100 / quota;
+ throttle_us =
+ ring_full_time_us * sleep_pct / (double)(100 - sleep_pct);
+ cpu->throttle_us_per_full -= throttle_us;
+ }
+
+ trace_dirtylimit_throttle_pct(cpu->cpu_index,
+ sleep_pct,
+ throttle_us);
+ } else {
+ if (quota < current) {
+ cpu->throttle_us_per_full += ring_full_time_us / 10;
+ } else {
+ cpu->throttle_us_per_full -= ring_full_time_us / 10;
+ }
+ }
+
+ /*
+     * TODO: with a large kvm_dirty_ring_size (e.g. 65536, or other scenarios),
+     * the current dirty page rate may never reach the quota; should we stop
+     * increasing the sleep time in that case?
+ */
+ cpu->throttle_us_per_full = MIN(cpu->throttle_us_per_full,
+ ring_full_time_us * DIRTYLIMIT_THROTTLE_PCT_MAX);
+
+ cpu->throttle_us_per_full = MAX(cpu->throttle_us_per_full, 0);
+}
+
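
To make the linear adjustment concrete, suppose the quota is 100 MB/s,
the measured rate is 200 MB/s, and the ring-full time is the ~80000 us
from the previous example (all values illustrative):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t quota = 100, current = 200; /* MB/s */
        int64_t ring_full_us = 80000;        /* see the previous example */

        /* The error is 50% of current, so the linear branch is taken */
        uint64_t sleep_pct = (current - quota) * 100 / current; /* 50 */
        uint64_t throttle_us =
            ring_full_us * sleep_pct / (double)(100 - sleep_pct); /* 80000 */

        /* ~80000 us asleep per ~80000 us of dirtying roughly halves the rate */
        printf("pct=%" PRIu64 " throttle=%" PRIu64 " us\n",
               sleep_pct, throttle_us);
        return 0;
    }
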
+static void dirtylimit_adjust_throttle(CPUState *cpu)
+{
+ uint64_t quota = 0;
+ uint64_t current = 0;
+ int cpu_index = cpu->cpu_index;
+
+ quota = dirtylimit_vcpu_get_state(cpu_index)->quota;
+ current = vcpu_dirty_rate_get(cpu_index);
+
+ if (!dirtylimit_done(quota, current)) {
+ dirtylimit_set_throttle(cpu, quota, current);
+ }
+
+ return;
+}
+
+void dirtylimit_process(void)
+{
+ CPUState *cpu;
+
+ if (!qatomic_read(&dirtylimit_quit)) {
+ dirtylimit_state_lock();
+
+ if (!dirtylimit_in_service()) {
+ dirtylimit_state_unlock();
+ return;
+ }
+
+ CPU_FOREACH(cpu) {
+ if (!dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled) {
+ continue;
+ }
+ dirtylimit_adjust_throttle(cpu);
+ }
+ dirtylimit_state_unlock();
+ }
+}
+
+void dirtylimit_change(bool start)
+{
+ if (start) {
+ qatomic_set(&dirtylimit_quit, 0);
+ } else {
+ qatomic_set(&dirtylimit_quit, 1);
+ }
+}
+
+void dirtylimit_set_vcpu(int cpu_index,
+ uint64_t quota,
+ bool enable)
+{
+ trace_dirtylimit_set_vcpu(cpu_index, quota);
+
+ if (enable) {
+ dirtylimit_state->states[cpu_index].quota = quota;
+ if (!dirtylimit_vcpu_get_state(cpu_index)->enabled) {
+ dirtylimit_state->limited_nvcpu++;
+ }
+ } else {
+ dirtylimit_state->states[cpu_index].quota = 0;
+ if (dirtylimit_state->states[cpu_index].enabled) {
+ dirtylimit_state->limited_nvcpu--;
+ }
+ }
+
+ dirtylimit_state->states[cpu_index].enabled = enable;
+}
+
+void dirtylimit_set_all(uint64_t quota,
+ bool enable)
+{
+ MachineState *ms = MACHINE(qdev_get_machine());
+ int max_cpus = ms->smp.max_cpus;
+ int i;
+
+ for (i = 0; i < max_cpus; i++) {
+ dirtylimit_set_vcpu(i, quota, enable);
+ }
+}
+
+void dirtylimit_vcpu_execute(CPUState *cpu)
+{
+ if (dirtylimit_in_service() &&
+ dirtylimit_vcpu_get_state(cpu->cpu_index)->enabled &&
+ cpu->throttle_us_per_full) {
+ trace_dirtylimit_vcpu_execute(cpu->cpu_index,
+ cpu->throttle_us_per_full);
+ usleep(cpu->throttle_us_per_full);
+ }
+}
+
+static void dirtylimit_init(void)
+{
+ dirtylimit_state_initialize();
+ dirtylimit_change(true);
+ vcpu_dirty_rate_stat_initialize();
+ vcpu_dirty_rate_stat_start();
+}
+
+static void dirtylimit_cleanup(void)
+{
+ vcpu_dirty_rate_stat_stop();
+ vcpu_dirty_rate_stat_finalize();
+ dirtylimit_change(false);
+ dirtylimit_state_finalize();
+}
+
+void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index,
+ int64_t cpu_index,
+ Error **errp)
+{
+ if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
+ return;
+ }
+
+ if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
+ error_setg(errp, "incorrect cpu index specified");
+ return;
+ }
+
+ if (!dirtylimit_in_service()) {
+ return;
+ }
+
+ dirtylimit_state_lock();
+
+ if (has_cpu_index) {
+ dirtylimit_set_vcpu(cpu_index, 0, false);
+ } else {
+ dirtylimit_set_all(0, false);
+ }
+
+ if (!dirtylimit_state->limited_nvcpu) {
+ dirtylimit_cleanup();
+ }
+
+ dirtylimit_state_unlock();
+}
+
+void hmp_cancel_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
+{
+ int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
+ Error *err = NULL;
+
+    qmp_cancel_vcpu_dirty_limit(cpu_index != -1, cpu_index, &err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+
+ monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
+ "dirty limit for virtual CPU]\n");
+}
+
+void qmp_set_vcpu_dirty_limit(bool has_cpu_index,
+ int64_t cpu_index,
+ uint64_t dirty_rate,
+ Error **errp)
+{
+ if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
+ error_setg(errp, "dirty page limit feature requires KVM with"
+ " accelerator property 'dirty-ring-size' set'");
+ return;
+ }
+
+ if (has_cpu_index && !dirtylimit_vcpu_index_valid(cpu_index)) {
+ error_setg(errp, "incorrect cpu index specified");
+ return;
+ }
+
+ if (!dirty_rate) {
+ qmp_cancel_vcpu_dirty_limit(has_cpu_index, cpu_index, errp);
+ return;
+ }
+
+ dirtylimit_state_lock();
+
+ if (!dirtylimit_in_service()) {
+ dirtylimit_init();
+ }
+
+ if (has_cpu_index) {
+ dirtylimit_set_vcpu(cpu_index, dirty_rate, true);
+ } else {
+ dirtylimit_set_all(dirty_rate, true);
+ }
+
+ dirtylimit_state_unlock();
+}
+
+void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
+{
+ int64_t dirty_rate = qdict_get_int(qdict, "dirty_rate");
+ int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
+ Error *err = NULL;
+
+    qmp_set_vcpu_dirty_limit(cpu_index != -1, cpu_index, dirty_rate, &err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+
+ monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
+ "dirty limit for virtual CPU]\n");
+}
+
+static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
+{
+ DirtyLimitInfo *info = NULL;
+
+ info = g_malloc0(sizeof(*info));
+ info->cpu_index = cpu_index;
+ info->limit_rate = dirtylimit_vcpu_get_state(cpu_index)->quota;
+ info->current_rate = vcpu_dirty_rate_get(cpu_index);
+
+ return info;
+}
+
+static struct DirtyLimitInfoList *dirtylimit_query_all(void)
+{
+ int i, index;
+ DirtyLimitInfo *info = NULL;
+ DirtyLimitInfoList *head = NULL, **tail = &head;
+
+ dirtylimit_state_lock();
+
+ if (!dirtylimit_in_service()) {
+ dirtylimit_state_unlock();
+ return NULL;
+ }
+
+ for (i = 0; i < dirtylimit_state->max_cpus; i++) {
+ index = dirtylimit_state->states[i].cpu_index;
+ if (dirtylimit_vcpu_get_state(index)->enabled) {
+ info = dirtylimit_query_vcpu(index);
+ QAPI_LIST_APPEND(tail, info);
+ }
+ }
+
+ dirtylimit_state_unlock();
+
+ return head;
+}
+
+struct DirtyLimitInfoList *qmp_query_vcpu_dirty_limit(Error **errp)
+{
+ if (!dirtylimit_in_service()) {
+ return NULL;
+ }
+
+ return dirtylimit_query_all();
+}
+
+void hmp_info_vcpu_dirty_limit(Monitor *mon, const QDict *qdict)
+{
+ DirtyLimitInfoList *limit, *head, *info = NULL;
+ Error *err = NULL;
+
+ if (!dirtylimit_in_service()) {
+ monitor_printf(mon, "Dirty page limit not enabled!\n");
+ return;
+ }
+
+ info = qmp_query_vcpu_dirty_limit(&err);
+ if (err) {
+ hmp_handle_error(mon, err);
+ return;
+ }
+
+ head = info;
+ for (limit = head; limit != NULL; limit = limit->next) {
+ monitor_printf(mon, "vcpu[%"PRIi64"], limit rate %"PRIi64 " (MB/s),"
+ " current rate %"PRIi64 " (MB/s)\n",
+ limit->value->cpu_index,
+ limit->value->limit_rate,
+ limit->value->current_rate);
+ }
+
+    qapi_free_DirtyLimitInfoList(info);
+}
diff --git a/softmmu/meson.build b/softmmu/meson.build
index 8138248661..3272af1f31 100644
--- a/softmmu/meson.build
+++ b/softmmu/meson.build
@@ -4,6 +4,7 @@ specific_ss.add(when: 'CONFIG_SOFTMMU', if_true: [files(
'memory.c',
'physmem.c',
'qtest.c',
+ 'dirtylimit.c',
)])
specific_ss.add(when: ['CONFIG_SOFTMMU', 'CONFIG_TCG'], if_true: [files(
diff --git a/softmmu/trace-events b/softmmu/trace-events
index 9c88887b3c..22606dc27b 100644
--- a/softmmu/trace-events
+++ b/softmmu/trace-events
@@ -31,3 +31,10 @@ runstate_set(int current_state, const char *current_state_str, int new_state, co
system_wakeup_request(int reason) "reason=%d"
qemu_system_shutdown_request(int reason) "reason=%d"
qemu_system_powerdown_request(void) ""
+
+# dirtylimit.c
+dirtylimit_state_initialize(int max_cpus) "dirtylimit state initialize: max cpus %d"
+dirtylimit_state_finalize(void) ""
+dirtylimit_throttle_pct(int cpu_index, uint64_t pct, int64_t time_us) "CPU[%d] throttle percent: %" PRIu64 ", throttle adjust time %"PRIi64 " us"
+dirtylimit_set_vcpu(int cpu_index, uint64_t quota) "CPU[%d] set dirty page rate limit %"PRIu64
+dirtylimit_vcpu_execute(int cpu_index, int64_t sleep_time_us) "CPU[%d] sleep %"PRIi64 " us"
diff --git a/tests/qtest/migration-helpers.c b/tests/qtest/migration-helpers.c
index e81e831c85..c6fbeb3974 100644
--- a/tests/qtest/migration-helpers.c
+++ b/tests/qtest/migration-helpers.c
@@ -84,6 +84,28 @@ QDict *wait_command(QTestState *who, const char *command, ...)
}
/*
+ * Execute a QMP command, assert success, and return its "return" object
+ */
+QDict *qmp_command(QTestState *who, const char *command, ...)
+{
+ va_list ap;
+ QDict *resp, *ret;
+
+ va_start(ap, command);
+ resp = qtest_vqmp(who, command, ap);
+ va_end(ap);
+
+ g_assert(!qdict_haskey(resp, "error"));
+ g_assert(qdict_haskey(resp, "return"));
+
+ ret = qdict_get_qdict(resp, "return");
+ qobject_ref(ret);
+ qobject_unref(resp);
+
+ return ret;
+}
+
+/*
* Send QMP command "migrate".
* Arguments are built from @fmt... (formatted like
* qobject_from_jsonf_nofail()) with "uri": @uri spliced in.
diff --git a/tests/qtest/migration-helpers.h b/tests/qtest/migration-helpers.h
index 78587c2b82..59561898d0 100644
--- a/tests/qtest/migration-helpers.h
+++ b/tests/qtest/migration-helpers.h
@@ -23,6 +23,8 @@ QDict *wait_command_fd(QTestState *who, int fd, const char *command, ...);
G_GNUC_PRINTF(2, 3)
QDict *wait_command(QTestState *who, const char *command, ...);
+QDict *qmp_command(QTestState *who, const char *command, ...);
+
G_GNUC_PRINTF(3, 4)
void migrate_qmp(QTestState *who, const char *uri, const char *fmt, ...);
diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index 9e64125f02..71595a74fd 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -24,6 +24,7 @@
#include "qapi/qobject-input-visitor.h"
#include "qapi/qobject-output-visitor.h"
#include "crypto/tlscredspsk.h"
+#include "qapi/qmp/qlist.h"
#include "migration-helpers.h"
#include "tests/migration/migration-test.h"
@@ -46,6 +47,12 @@ unsigned start_address;
unsigned end_address;
static bool uffd_feature_thread_id;
+/*
+ * Dirtylimit stops adjusting the throttle once the dirty page rate
+ * error value drops below DIRTYLIMIT_TOLERANCE_RANGE
+ */
+#define DIRTYLIMIT_TOLERANCE_RANGE 25 /* MB/s */
+
#if defined(__linux__)
#include <sys/syscall.h>
#include <sys/vfs.h>
@@ -496,6 +503,82 @@ typedef struct {
const char *opts_target;
} MigrateStart;
+/*
+ * A hook that runs after the src and dst QEMUs have been
+ * created, but before the migration is started. This can
+ * be used to set migration parameters and capabilities.
+ *
+ * Returns: NULL, or a pointer to opaque state to be
+ * later passed to the TestMigrateFinishHook
+ */
+typedef void * (*TestMigrateStartHook)(QTestState *from,
+ QTestState *to);
+
+/*
+ * A hook that runs after the migration has finished,
+ * regardless of whether it succeeded or failed, but
+ * before QEMU has terminated (unless it self-terminated
+ * due to migration error)
+ *
+ * @opaque is a pointer to state previously returned
+ * by the TestMigrateStartHook if any, or NULL.
+ */
+typedef void (*TestMigrateFinishHook)(QTestState *from,
+ QTestState *to,
+ void *opaque);
+
+typedef struct {
+ /* Optional: fine tune start parameters */
+ MigrateStart start;
+
+ /* Required: the URI for the dst QEMU to listen on */
+ const char *listen_uri;
+
+ /*
+ * Optional: the URI for the src QEMU to connect to
+ * If NULL, then it will query the dst QEMU for its actual
+ * listening address and use that as the connect address.
+ * This allows for dynamically picking a free TCP port.
+ */
+ const char *connect_uri;
+
+ /* Optional: callback to run at start to set migration parameters */
+ TestMigrateStartHook start_hook;
+ /* Optional: callback to run at finish to cleanup */
+ TestMigrateFinishHook finish_hook;
+
+ /*
+ * Optional: normally we expect the migration process to complete.
+ *
+ * There can be a variety of reasons and stages in which failure
+ * can happen during tests.
+ *
+ * If a failure is expected to happen at time of establishing
+ * the connection, then MIG_TEST_FAIL will indicate that the dst
+ * QEMU is expected to stay running and accept future migration
+ * connections.
+ *
+ * If a failure is expected to happen while processing the
+ * migration stream, then MIG_TEST_FAIL_DEST_QUIT_ERR will indicate
+ * that the dst QEMU is expected to quit with non-zero exit status
+ */
+ enum {
+ /* This test should succeed, the default */
+ MIG_TEST_SUCCEED = 0,
+ /* This test should fail, dest qemu should keep alive */
+ MIG_TEST_FAIL,
+ /* This test should fail, dest qemu should fail with abnormal status */
+ MIG_TEST_FAIL_DEST_QUIT_ERR,
+ } result;
+
+ /* Optional: set number of migration passes to wait for */
+ unsigned int iterations;
+
+ /* Postcopy specific fields */
+ void *postcopy_data;
+ bool postcopy_preempt;
+} MigrateCommon;
+
static int test_migrate_start(QTestState **from, QTestState **to,
const char *uri, MigrateStart *args)
{
@@ -982,19 +1065,28 @@ test_migrate_tls_x509_finish(QTestState *from,
static int migrate_postcopy_prepare(QTestState **from_ptr,
QTestState **to_ptr,
- MigrateStart *args)
+ MigrateCommon *args)
{
g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs);
QTestState *from, *to;
- if (test_migrate_start(&from, &to, uri, args)) {
+ if (test_migrate_start(&from, &to, uri, &args->start)) {
return -1;
}
+ if (args->start_hook) {
+ args->postcopy_data = args->start_hook(from, to);
+ }
+
migrate_set_capability(from, "postcopy-ram", true);
migrate_set_capability(to, "postcopy-ram", true);
migrate_set_capability(to, "postcopy-blocktime", true);
+ if (args->postcopy_preempt) {
+ migrate_set_capability(from, "postcopy-preempt", true);
+ migrate_set_capability(to, "postcopy-preempt", true);
+ }
+
migrate_ensure_non_converge(from);
/* Wait for the first serial output from the source */
@@ -1010,7 +1102,8 @@ static int migrate_postcopy_prepare(QTestState **from_ptr,
return 0;
}
-static void migrate_postcopy_complete(QTestState *from, QTestState *to)
+static void migrate_postcopy_complete(QTestState *from, QTestState *to,
+ MigrateCommon *args)
{
wait_for_migration_complete(from);
@@ -1021,30 +1114,73 @@ static void migrate_postcopy_complete(QTestState *from, QTestState *to)
read_blocktime(to);
}
+ if (args->finish_hook) {
+ args->finish_hook(from, to, args->postcopy_data);
+ args->postcopy_data = NULL;
+ }
+
test_migrate_end(from, to, true);
}
-static void test_postcopy(void)
+static void test_postcopy_common(MigrateCommon *args)
{
- MigrateStart args = {};
QTestState *from, *to;
- if (migrate_postcopy_prepare(&from, &to, &args)) {
+ if (migrate_postcopy_prepare(&from, &to, args)) {
return;
}
migrate_postcopy_start(from, to);
- migrate_postcopy_complete(from, to);
+ migrate_postcopy_complete(from, to, args);
}
-static void test_postcopy_recovery(void)
+static void test_postcopy(void)
{
- MigrateStart args = {
- .hide_stderr = true,
+ MigrateCommon args = { };
+
+ test_postcopy_common(&args);
+}
+
+static void test_postcopy_preempt(void)
+{
+ MigrateCommon args = {
+ .postcopy_preempt = true,
};
+
+ test_postcopy_common(&args);
+}
+
+#ifdef CONFIG_GNUTLS
+static void test_postcopy_tls_psk(void)
+{
+ MigrateCommon args = {
+ .start_hook = test_migrate_tls_psk_start_match,
+ .finish_hook = test_migrate_tls_psk_finish,
+ };
+
+ test_postcopy_common(&args);
+}
+
+static void test_postcopy_preempt_tls_psk(void)
+{
+ MigrateCommon args = {
+ .postcopy_preempt = true,
+ .start_hook = test_migrate_tls_psk_start_match,
+ .finish_hook = test_migrate_tls_psk_finish,
+ };
+
+ test_postcopy_common(&args);
+}
+#endif
+
+static void test_postcopy_recovery_common(MigrateCommon *args)
+{
QTestState *from, *to;
g_autofree char *uri = NULL;
- if (migrate_postcopy_prepare(&from, &to, &args)) {
+ /* Always hide errors for postcopy recover tests since they're expected */
+ args->start.hide_stderr = true;
+
+ if (migrate_postcopy_prepare(&from, &to, args)) {
return;
}
@@ -1095,9 +1231,51 @@ static void test_postcopy_recovery(void)
/* Restore the postcopy bandwidth to unlimited */
migrate_set_parameter_int(from, "max-postcopy-bandwidth", 0);
- migrate_postcopy_complete(from, to);
+ migrate_postcopy_complete(from, to, args);
}
+static void test_postcopy_recovery(void)
+{
+ MigrateCommon args = { };
+
+ test_postcopy_recovery_common(&args);
+}
+
+#ifdef CONFIG_GNUTLS
+static void test_postcopy_recovery_tls_psk(void)
+{
+ MigrateCommon args = {
+ .start_hook = test_migrate_tls_psk_start_match,
+ .finish_hook = test_migrate_tls_psk_finish,
+ };
+
+ test_postcopy_recovery_common(&args);
+}
+#endif
+
+static void test_postcopy_preempt_recovery(void)
+{
+ MigrateCommon args = {
+ .postcopy_preempt = true,
+ };
+
+ test_postcopy_recovery_common(&args);
+}
+
+#ifdef CONFIG_GNUTLS
+/* This combines the preempt, recovery and tls tests in one */
+static void test_postcopy_preempt_all(void)
+{
+ MigrateCommon args = {
+ .postcopy_preempt = true,
+ .start_hook = test_migrate_tls_psk_start_match,
+ .finish_hook = test_migrate_tls_psk_finish,
+ };
+
+ test_postcopy_recovery_common(&args);
+}
+#endif
+
static void test_baddest(void)
{
MigrateStart args = {
@@ -1113,78 +1291,6 @@ static void test_baddest(void)
test_migrate_end(from, to, false);
}
-/*
- * A hook that runs after the src and dst QEMUs have been
- * created, but before the migration is started. This can
- * be used to set migration parameters and capabilities.
- *
- * Returns: NULL, or a pointer to opaque state to be
- * later passed to the TestMigrateFinishHook
- */
-typedef void * (*TestMigrateStartHook)(QTestState *from,
- QTestState *to);
-
-/*
- * A hook that runs after the migration has finished,
- * regardless of whether it succeeded or failed, but
- * before QEMU has terminated (unless it self-terminated
- * due to migration error)
- *
- * @opaque is a pointer to state previously returned
- * by the TestMigrateStartHook if any, or NULL.
- */
-typedef void (*TestMigrateFinishHook)(QTestState *from,
- QTestState *to,
- void *opaque);
-
-typedef struct {
- /* Optional: fine tune start parameters */
- MigrateStart start;
-
- /* Required: the URI for the dst QEMU to listen on */
- const char *listen_uri;
-
- /*
- * Optional: the URI for the src QEMU to connect to
- * If NULL, then it will query the dst QEMU for its actual
- * listening address and use that as the connect address.
- * This allows for dynamically picking a free TCP port.
- */
- const char *connect_uri;
-
- /* Optional: callback to run at start to set migration parameters */
- TestMigrateStartHook start_hook;
- /* Optional: callback to run at finish to cleanup */
- TestMigrateFinishHook finish_hook;
-
- /*
- * Optional: normally we expect the migration process to complete.
- *
- * There can be a variety of reasons and stages in which failure
- * can happen during tests.
- *
- * If a failure is expected to happen at time of establishing
- * the connection, then MIG_TEST_FAIL will indicate that the dst
- * QEMU is expected to stay running and accept future migration
- * connections.
- *
- * If a failure is expected to happen while processing the
- * migration stream, then MIG_TEST_FAIL_DEST_QUIT_ERR will indicate
- * that the dst QEMU is expected to quit with non-zero exit status
- */
- enum {
- /* This test should succeed, the default */
- MIG_TEST_SUCCEED = 0,
- /* This test should fail, dest qemu should keep alive */
- MIG_TEST_FAIL,
- /* This test should fail, dest qemu should fail with abnormal status */
- MIG_TEST_FAIL_DEST_QUIT_ERR,
- } result;
-
- /* Optional: set number of migration passes to wait for */
- unsigned int iterations;
-} MigrateCommon;
-
static void test_precopy_common(MigrateCommon *args)
{
QTestState *from, *to;
@@ -2059,6 +2165,253 @@ static void test_multifd_tcp_cancel(void)
test_migrate_end(from, to2, true);
}
+static void calc_dirty_rate(QTestState *who, uint64_t calc_time)
+{
+ qobject_unref(qmp_command(who,
+ "{ 'execute': 'calc-dirty-rate',"
+ "'arguments': { "
+ "'calc-time': %ld,"
+ "'mode': 'dirty-ring' }}",
+ calc_time));
+}
+
+static QDict *query_dirty_rate(QTestState *who)
+{
+ return qmp_command(who, "{ 'execute': 'query-dirty-rate' }");
+}
+
+static void dirtylimit_set_all(QTestState *who, uint64_t dirtyrate)
+{
+ qobject_unref(qmp_command(who,
+ "{ 'execute': 'set-vcpu-dirty-limit',"
+ "'arguments': { "
+ "'dirty-rate': %ld } }",
+ dirtyrate));
+}
+
+static void cancel_vcpu_dirty_limit(QTestState *who)
+{
+ qobject_unref(qmp_command(who,
+ "{ 'execute': 'cancel-vcpu-dirty-limit' }"));
+}
+
+static QDict *query_vcpu_dirty_limit(QTestState *who)
+{
+ QDict *rsp;
+
+ rsp = qtest_qmp(who, "{ 'execute': 'query-vcpu-dirty-limit' }");
+ g_assert(!qdict_haskey(rsp, "error"));
+ g_assert(qdict_haskey(rsp, "return"));
+
+ return rsp;
+}
+
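Together, these wrappers drive the new dirty-limit QMP commands end to end. As a rough usage sketch — the helper below is illustrative only, not part of the patch, and assumes the quota unit is MB/s per the QAPI schema — a set/check/cancel round trip looks like:

/* Illustrative only: cap dirtying at 100 MB/s, check, then lift the cap */
static void example_dirtylimit_roundtrip(QTestState *vm)
{
    QDict *rsp;

    dirtylimit_set_all(vm, 100);        /* quota dirty rate, in MB/s */
    rsp = query_vcpu_dirty_limit(vm);   /* asserts a well-formed reply */
    /* per-vCPU entries carrying "limit-rate" live under "return" */
    qobject_unref(rsp);
    cancel_vcpu_dirty_limit(vm);
}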
+static bool calc_dirtyrate_ready(QTestState *who)
+{
+    QDict *rsp_return;
+    const char *status;
+    bool ready;
+
+    rsp_return = query_dirty_rate(who);
+    g_assert(rsp_return);
+
+    status = qdict_get_str(rsp_return, "status");
+    g_assert(status);
+
+    /* Any status other than "measuring" means the measurement is done */
+    ready = g_strcmp0(status, "measuring") != 0;
+
+    qobject_unref(rsp_return);
+    return ready;
+}
+
+static void wait_for_calc_dirtyrate_complete(QTestState *who,
+ int64_t time_s)
+{
+ int max_try_count = 10000;
+ usleep(time_s * 1000000);
+
+    while (!calc_dirtyrate_ready(who) && --max_try_count) {
+        usleep(1000);
+    }
+
+    /*
+     * Time out after roughly 10s (max_try_count * 1000us); if the
+     * dirty rate measurement still hasn't completed, fail the test.
+     */
+    g_assert_cmpint(max_try_count, !=, 0);
+}
+
+static int64_t get_dirty_rate(QTestState *who)
+{
+ QDict *rsp_return;
+ gchar *status;
+ QList *rates;
+ const QListEntry *entry;
+ QDict *rate;
+ int64_t dirtyrate;
+
+ rsp_return = query_dirty_rate(who);
+ g_assert(rsp_return);
+
+ status = g_strdup(qdict_get_str(rsp_return, "status"));
+ g_assert(status);
+ g_assert_cmpstr(status, ==, "measured");
+
+ rates = qdict_get_qlist(rsp_return, "vcpu-dirty-rate");
+ g_assert(rates && !qlist_empty(rates));
+
+ entry = qlist_first(rates);
+ g_assert(entry);
+
+ rate = qobject_to(QDict, qlist_entry_obj(entry));
+ g_assert(rate);
+
+ dirtyrate = qdict_get_try_int(rate, "dirty-rate", -1);
+
+    g_free(status);
+    qobject_unref(rsp_return);
+ return dirtyrate;
+}
+
+static int64_t get_limit_rate(QTestState *who)
+{
+ QDict *rsp_return;
+ QList *rates;
+ const QListEntry *entry;
+ QDict *rate;
+ int64_t dirtyrate;
+
+ rsp_return = query_vcpu_dirty_limit(who);
+ g_assert(rsp_return);
+
+ rates = qdict_get_qlist(rsp_return, "return");
+ g_assert(rates && !qlist_empty(rates));
+
+ entry = qlist_first(rates);
+ g_assert(entry);
+
+ rate = qobject_to(QDict, qlist_entry_obj(entry));
+ g_assert(rate);
+
+ dirtyrate = qdict_get_try_int(rate, "limit-rate", -1);
+
+ qobject_unref(rsp_return);
+ return dirtyrate;
+}
+
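get_dirty_rate() and get_limit_rate() share the same access pattern: take the first element of a list of per-vCPU dicts and pull a single int64 field out of it. Should more such accessors appear, a shared helper along these lines — hypothetical, not part of this patch — could factor the pattern out:

/* Hypothetical helper: int64 field of the first entry in a QList of QDicts */
static int64_t qlist_first_entry_int(QList *list, const char *key)
{
    const QListEntry *entry = qlist_first(list);
    QDict *dict;

    g_assert(entry);
    dict = qobject_to(QDict, qlist_entry_obj(entry));
    g_assert(dict);
    return qdict_get_try_int(dict, key, -1);
}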
+static QTestState *dirtylimit_start_vm(void)
+{
+ QTestState *vm = NULL;
+ g_autofree gchar *cmd = NULL;
+ const char *arch = qtest_get_arch();
+ g_autofree char *bootpath = NULL;
+
+    assert(strcmp(arch, "x86_64") == 0);
+ bootpath = g_strdup_printf("%s/bootsect", tmpfs);
+ assert(sizeof(x86_bootsect) == 512);
+ init_bootfile(bootpath, x86_bootsect, sizeof(x86_bootsect));
+
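+    /* KVM's dirty ring size must be a power of two; 4096 entries here */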
+ cmd = g_strdup_printf("-accel kvm,dirty-ring-size=4096 "
+ "-name dirtylimit-test,debug-threads=on "
+ "-m 150M -smp 1 "
+ "-serial file:%s/vm_serial "
+ "-drive file=%s,format=raw ",
+ tmpfs, bootpath);
+
+ vm = qtest_init(cmd);
+ return vm;
+}
+
+static void dirtylimit_stop_vm(QTestState *vm)
+{
+ qtest_quit(vm);
+ cleanup("bootsect");
+ cleanup("vm_serial");
+}
+
+static void test_vcpu_dirty_limit(void)
+{
+ QTestState *vm;
+ int64_t origin_rate;
+ int64_t quota_rate;
+    int64_t rate;
+ int max_try_count = 20;
+ int hit = 0;
+
+ /* Start vm for vcpu dirtylimit test */
+ vm = dirtylimit_start_vm();
+
+    /* Wait for the first serial output from the VM */
+ wait_for_serial("vm_serial");
+
+    /* Do a dirty rate measurement with a calc-time of 1s */
+ calc_dirty_rate(vm, 1);
+
+    /* Sleep for the calc time, then wait for the measurement to complete */
+ wait_for_calc_dirtyrate_complete(vm, 1);
+
+ /* Query original dirty page rate */
+ origin_rate = get_dirty_rate(vm);
+
+ /* VM booted from bootsect should dirty memory steadily */
+ assert(origin_rate != 0);
+
+    /* Set the quota dirty page rate to half of the original rate */
+ quota_rate = origin_rate / 2;
+
+ /* Set dirtylimit */
+ dirtylimit_set_all(vm, quota_rate);
+
+    /*
+     * Check that set-vcpu-dirty-limit and query-vcpu-dirty-limit
+     * agree on the value just configured
+     */
+ g_assert_cmpint(quota_rate, ==, get_limit_rate(vm));
+
+    /* Sleep a bit to let the limit take effect */
+ usleep(2000000);
+
+    /*
+     * Check that the dirty limit really takes effect; time out
+     * after 20s (max_try_count * 1s) and fail the test if the
+     * measured rate never drops below the quota.
+     */
+ while (--max_try_count) {
+ calc_dirty_rate(vm, 1);
+ wait_for_calc_dirtyrate_complete(vm, 1);
+ rate = get_dirty_rate(vm);
+
+        /*
+         * Consider the limit hit if the current rate is below
+         * the quota rate (within the accepted error margin)
+         */
+ if (rate < (quota_rate + DIRTYLIMIT_TOLERANCE_RANGE)) {
+ hit = 1;
+ break;
+ }
+ }
+
+ g_assert_cmpint(hit, ==, 1);
+
+ hit = 0;
+ max_try_count = 20;
+
+    /* Check that cancelling the dirty limit takes effect */
+ cancel_vcpu_dirty_limit(vm);
+ while (--max_try_count) {
+ calc_dirty_rate(vm, 1);
+ wait_for_calc_dirtyrate_complete(vm, 1);
+ rate = get_dirty_rate(vm);
+
+        /*
+         * Consider the limit cancelled once the current rate
+         * rises back above the quota rate (beyond the accepted
+         * error margin)
+         */
+ if (rate > (quota_rate + DIRTYLIMIT_TOLERANCE_RANGE)) {
+ hit = 1;
+ break;
+ }
+ }
+
+ g_assert_cmpint(hit, ==, 1);
+ dirtylimit_stop_vm(vm);
+}
+
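On a KVM-capable x86_64 host the new test can be run standalone with something like QTEST_QEMU_BINARY=./qemu-system-x86_64 ./tests/qtest/migration-test -p /x86_64/migration/vcpu_dirty_limit (the path is assumed from the qtest_add_func() registration below); as the registration below shows, it is only added when the KVM dirty ring is supported.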
static bool kvm_dirty_ring_supported(void)
{
#if defined(__linux__) && defined(HOST_X86_64)
@@ -2123,13 +2476,31 @@ int main(int argc, char **argv)
module_call_init(MODULE_INIT_QOM);
qtest_add_func("/migration/postcopy/unix", test_postcopy);
- qtest_add_func("/migration/postcopy/recovery", test_postcopy_recovery);
+ qtest_add_func("/migration/postcopy/plain", test_postcopy);
+ qtest_add_func("/migration/postcopy/recovery/plain",
+ test_postcopy_recovery);
+ qtest_add_func("/migration/postcopy/preempt/plain", test_postcopy_preempt);
+ qtest_add_func("/migration/postcopy/preempt/recovery/plain",
+ test_postcopy_preempt_recovery);
+
qtest_add_func("/migration/bad_dest", test_baddest);
qtest_add_func("/migration/precopy/unix/plain", test_precopy_unix_plain);
qtest_add_func("/migration/precopy/unix/xbzrle", test_precopy_unix_xbzrle);
#ifdef CONFIG_GNUTLS
qtest_add_func("/migration/precopy/unix/tls/psk",
test_precopy_unix_tls_psk);
+    /*
+     * NOTE: the PSK test is enough for postcopy, as other types of TLS
+     * channels are already covered under precopy. What we want to test
+     * here is the general postcopy path with a TLS channel enabled.
+     */
+ qtest_add_func("/migration/postcopy/tls/psk", test_postcopy_tls_psk);
+ qtest_add_func("/migration/postcopy/recovery/tls/psk",
+ test_postcopy_recovery_tls_psk);
+ qtest_add_func("/migration/postcopy/preempt/tls/psk",
+ test_postcopy_preempt_tls_psk);
+ qtest_add_func("/migration/postcopy/preempt/recovery/tls/psk",
+ test_postcopy_preempt_all);
#ifdef CONFIG_TASN1
qtest_add_func("/migration/precopy/unix/tls/x509/default-host",
test_precopy_unix_tls_x509_default_host);
@@ -2204,6 +2575,8 @@ int main(int argc, char **argv)
if (kvm_dirty_ring_supported()) {
qtest_add_func("/migration/dirty_ring",
test_precopy_unix_dirty_ring);
+ qtest_add_func("/migration/vcpu_dirty_limit",
+ test_vcpu_dirty_limit);
}
ret = g_test_run();
diff --git a/tests/qtest/qmp-cmd-test.c b/tests/qtest/qmp-cmd-test.c
index 056b40e67f..af00712458 100644
--- a/tests/qtest/qmp-cmd-test.c
+++ b/tests/qtest/qmp-cmd-test.c
@@ -110,6 +110,8 @@ static bool query_is_ignored(const char *cmd)
"query-sev-capabilities",
"query-sgx",
"query-sgx-capabilities",
+ /* Success depends on enabling dirty page rate limit */
+ "query-vcpu-dirty-limit",
NULL
};
int i;