[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH 2/2] kvm: use per-cpu lock to free vcpu thread o
From: |
liu ping fan |
Subject: |
Re: [Qemu-devel] [PATCH 2/2] kvm: use per-cpu lock to free vcpu thread out of the big lock |
Date: |
Sun, 24 Jun 2012 22:09:41 +0800 |
With this patch, the iothread lock will only be held around:
1. kvm_flush_coalesced_mmio_buffer();
2. kvm_handle_io();
3. cpu_physical_memory_rw()
And I think the cpu_lock can be treated as the apic_state device's
lock; the APIC is the first device we free from the iothread
lock.
Regards,
pingfan
On Sat, Jun 23, 2012 at 5:32 PM, liu ping fan <address@hidden> wrote:
> On Sat, Jun 23, 2012 at 4:06 AM, Anthony Liguori <address@hidden> wrote:
>> On 06/21/2012 09:49 AM, Liu Ping Fan wrote:
>>>
>>> In order to break the big lock, using per-cpu_lock in kvm_cpu_exec()
>>> to protect the race from other cpu's access to env->apic_state & related
>>>
>>> field in env.
>>> Also, we need to protect against run_on_cpu().
>>>
>>> Race condition can be like this:
>>> 1. vcpu-1 IPI vcpu-2
>>> vcpu-3 IPI vcpu-2
>>> Open window exists for accessing to vcpu-2's apic_state& env
>>>
>>>
>>> 2. run_on_cpu() write env->queued_work_last, while flush_queued_work()
>>> read
>>>
>>> Signed-off-by: Liu Ping Fan<address@hidden>
>>> ---
>>> cpus.c | 6 ++++--
>>> hw/apic.c | 58
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>>> hw/pc.c | 8 +++++++-
>>> kvm-all.c | 13 +++++++++++--
>>> 4 files changed, 76 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/cpus.c b/cpus.c
>>> index 554f7bc..ac99afe 100644
>>> --- a/cpus.c
>>> +++ b/cpus.c
>>> @@ -649,6 +649,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void
>>> *data), void *data)
>>>
>>> wi.func = func;
>>> wi.data = data;
>>> + qemu_mutex_lock(env->cpu_lock);
>>> if (!env->queued_work_first) {
>>> env->queued_work_first =&wi;
>>> } else {
>>> @@ -657,6 +658,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void
>>> *data), void *data)
>>> env->queued_work_last =&wi;
>>> wi.next = NULL;
>>> wi.done = false;
>>> + qemu_mutex_unlock(env->cpu_lock);
>>>
>>> qemu_cpu_kick(env);
>>> while (!wi.done) {
>>> @@ -718,7 +720,7 @@ static void qemu_tcg_wait_io_event(void)
>>> static void qemu_kvm_wait_io_event(CPUArchState *env)
>>> {
>>> while (cpu_thread_is_idle(env)) {
>>> - qemu_cond_wait(env->halt_cond,&qemu_global_mutex);
>>> + qemu_cond_wait(env->halt_cond, env->cpu_lock);
>>> }
>>>
>>> qemu_kvm_eat_signals(env);
>>> @@ -729,8 +731,8 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>>> {
>>> CPUArchState *env = arg;
>>> int r;
>>> + qemu_mutex_lock_cpu(env);
>>>
>>> - qemu_mutex_lock(&qemu_global_mutex);
>>> qemu_thread_get_self(env->thread);
>>> env->thread_id = qemu_get_thread_id();
>>> cpu_single_env = env;
>>> diff --git a/hw/apic.c b/hw/apic.c
>>> index 4eeaf88..b999a40 100644
>>> --- a/hw/apic.c
>>> +++ b/hw/apic.c
>>> @@ -22,6 +22,7 @@
>>> #include "host-utils.h"
>>> #include "trace.h"
>>> #include "pc.h"
>>> +#include "qemu-thread.h"
>>>
>>> #define MAX_APIC_WORDS 8
>>>
>>> @@ -94,6 +95,7 @@ static int get_highest_priority_int(uint32_t *tab)
>>> return -1;
>>> }
>>>
>>> +/* Caller must hold the lock */
>>> static void apic_sync_vapic(APICCommonState *s, int sync_type)
>>> {
>>> VAPICState vapic_state;
>>> @@ -141,11 +143,13 @@ static void apic_sync_vapic(APICCommonState *s, int
>>> sync_type)
>>> }
>>> }
>>>
>>> +/* Caller must hold lock */
>>> static void apic_vapic_base_update(APICCommonState *s)
>>> {
>>> apic_sync_vapic(s, SYNC_TO_VAPIC);
>>> }
>>>
>>> +/* Caller must hold the lock */
>>> static void apic_local_deliver(APICCommonState *s, int vector)
>>> {
>>> uint32_t lvt = s->lvt[vector];
>>> @@ -175,9 +179,11 @@ static void apic_local_deliver(APICCommonState *s,
>>> int vector)
>>> (lvt& APIC_LVT_LEVEL_TRIGGER))
>>> trigger_mode = APIC_TRIGGER_LEVEL;
>>> apic_set_irq(s, lvt& 0xff, trigger_mode);
>>>
>>> + break;
>>> }
>>> }
>>>
>>> +/* Caller must hold the lock */
>>> void apic_deliver_pic_intr(DeviceState *d, int level)
>>> {
>>> APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> @@ -200,9 +206,12 @@ void apic_deliver_pic_intr(DeviceState *d, int level)
>>> }
>>> }
>>>
>>> +/* Must hold lock */
>>> static void apic_external_nmi(APICCommonState *s)
>>> {
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> apic_local_deliver(s, APIC_LVT_LINT1);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> }
>>>
>>> #define foreach_apic(apic, deliver_bitmask, code) \
>>> @@ -215,7 +224,9 @@ static void apic_external_nmi(APICCommonState *s)
>>> if (__mask& (1<< __j)) {\
>>>
>>> apic = local_apics[__i * 32 + __j];\
>>> if (apic) {\
>>> + qemu_mutex_lock_cpu(apic->cpu_env);\
>>> code;\
>>> + qemu_mutex_unlock_cpu(apic->cpu_env);\
>>> }\
>>> }\
>>> }\
>>> @@ -244,7 +255,9 @@ static void apic_bus_deliver(const uint32_t
>>> *deliver_bitmask,
>>> if (d>= 0) {
>>> apic_iter = local_apics[d];
>>> if (apic_iter) {
>>> + qemu_mutex_lock_cpu(apic_iter->cpu_env);
>>> apic_set_irq(apic_iter, vector_num,
>>> trigger_mode);
>>> + qemu_mutex_unlock_cpu(apic_iter->cpu_env);
>>> }
>>> }
>>> }
>>> @@ -293,6 +306,7 @@ void apic_deliver_irq(uint8_t dest, uint8_t dest_mode,
>>> uint8_t delivery_mode,
>>> apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num,
>>> trigger_mode);
>>> }
>>>
>>> +/* Caller must hold lock */
>>> static void apic_set_base(APICCommonState *s, uint64_t val)
>>> {
>>> s->apicbase = (val& 0xfffff000) |
>>>
>>> @@ -305,6 +319,7 @@ static void apic_set_base(APICCommonState *s, uint64_t
>>> val)
>>> }
>>> }
>>>
>>> +/* caller must hold lock */
>>> static void apic_set_tpr(APICCommonState *s, uint8_t val)
>>> {
>>> /* Updates from cr8 are ignored while the VAPIC is active */
>>> @@ -314,12 +329,14 @@ static void apic_set_tpr(APICCommonState *s, uint8_t
>>> val)
>>> }
>>> }
>>>
>>> +/* caller must hold lock */
>>> static uint8_t apic_get_tpr(APICCommonState *s)
>>> {
>>> apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>> return s->tpr>> 4;
>>> }
>>>
>>> +/* Caller must hold the lock */
>>> static int apic_get_ppr(APICCommonState *s)
>>> {
>>> int tpr, isrv, ppr;
>>> @@ -348,6 +365,7 @@ static int apic_get_arb_pri(APICCommonState *s)
>>> * 0 - no interrupt,
>>> *>0 - interrupt number
>>> */
>>> +/* Caller must hold cpu_lock */
>>> static int apic_irq_pending(APICCommonState *s)
>>> {
>>> int irrv, ppr;
>>> @@ -363,6 +381,7 @@ static int apic_irq_pending(APICCommonState *s)
>>> return irrv;
>>> }
>>>
>>> +/* caller must ensure the lock has been taken */
>>> /* signal the CPU if an irq is pending */
>>> static void apic_update_irq(APICCommonState *s)
>>> {
>>> @@ -380,11 +399,13 @@ static void apic_update_irq(APICCommonState *s)
>>> void apic_poll_irq(DeviceState *d)
>>> {
>>> APICCommonState *s = APIC_COMMON(d);
>>> -
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>> apic_update_irq(s);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> }
>>>
>>> +/* caller must hold the lock */
>>> static void apic_set_irq(APICCommonState *s, int vector_num, int
>>> trigger_mode)
>>> {
>>> apic_report_irq_delivered(!get_bit(s->irr, vector_num));
>>> @@ -407,6 +428,7 @@ static void apic_set_irq(APICCommonState *s, int
>>> vector_num, int trigger_mode)
>>> apic_update_irq(s);
>>> }
>>>
>>> +/* caller must hold the lock */
>>> static void apic_eoi(APICCommonState *s)
>>> {
>>> int isrv;
>>> @@ -415,6 +437,7 @@ static void apic_eoi(APICCommonState *s)
>>> return;
>>> reset_bit(s->isr, isrv);
>>> if (!(s->spurious_vec& APIC_SV_DIRECTED_IO)&& get_bit(s->tmr,
>>> isrv)) {
>>>
>>> + //fix me,need to take extra lock for the bus?
>>> ioapic_eoi_broadcast(isrv);
>>> }
>>> apic_sync_vapic(s, SYNC_FROM_VAPIC | SYNC_TO_VAPIC);
>>> @@ -480,18 +503,23 @@ static void apic_get_delivery_bitmask(uint32_t
>>> *deliver_bitmask,
>>> static void apic_startup(APICCommonState *s, int vector_num)
>>> {
>>> s->sipi_vector = vector_num;
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> cpu_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> }
>>>
>>> void apic_sipi(DeviceState *d)
>>> {
>>> APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> -
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> cpu_reset_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
>>>
>>> - if (!s->wait_for_sipi)
>>> + if (!s->wait_for_sipi) {
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> return;
>>> + }
>>> cpu_x86_load_seg_cache_sipi(s->cpu_env, s->sipi_vector);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> s->wait_for_sipi = 0;
>>> }
>>>
>>> @@ -543,6 +571,7 @@ static void apic_deliver(DeviceState *d, uint8_t dest,
>>> uint8_t dest_mode,
>>> apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num,
>>> trigger_mode);
>>> }
>>>
>>> +/* Caller must take lock */
>>> int apic_get_interrupt(DeviceState *d)
>>> {
>>> APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> @@ -572,6 +601,7 @@ int apic_get_interrupt(DeviceState *d)
>>> return intno;
>>> }
>>>
>>> +/* Caller must hold the cpu_lock */
>>> int apic_accept_pic_intr(DeviceState *d)
>>> {
>>> APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> @@ -589,6 +619,7 @@ int apic_accept_pic_intr(DeviceState *d)
>>> return 0;
>>> }
>>>
>>> +/* Caller hold lock */
>>> static uint32_t apic_get_current_count(APICCommonState *s)
>>> {
>>> int64_t d;
>>> @@ -619,9 +650,10 @@ static void apic_timer_update(APICCommonState *s,
>>> int64_t current_time)
>>> static void apic_timer(void *opaque)
>>> {
>>> APICCommonState *s = opaque;
>>> -
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> apic_local_deliver(s, APIC_LVT_TIMER);
>>> apic_timer_update(s, s->next_time);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> }
>>>
>>> static uint32_t apic_mem_readb(void *opaque, target_phys_addr_t addr)
>>> @@ -664,18 +696,22 @@ static uint32_t apic_mem_readl(void *opaque,
>>> target_phys_addr_t addr)
>>> val = 0x11 | ((APIC_LVT_NB - 1)<< 16); /* version 0x11 */
>>> break;
>>> case 0x08:
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>> if (apic_report_tpr_access) {
>>> cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_READ);
>>> }
>>> val = s->tpr;
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> break;
>>> case 0x09:
>>> val = apic_get_arb_pri(s);
>>> break;
>>> case 0x0a:
>>> /* ppr */
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> val = apic_get_ppr(s);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> break;
>>> case 0x0b:
>>> val = 0;
>>> @@ -712,7 +748,9 @@ static uint32_t apic_mem_readl(void *opaque,
>>> target_phys_addr_t addr)
>>> val = s->initial_count;
>>> break;
>>> case 0x39:
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> val = apic_get_current_count(s);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> break;
>>> case 0x3e:
>>> val = s->divide_conf;
>>> @@ -767,18 +805,22 @@ static void apic_mem_writel(void *opaque,
>>> target_phys_addr_t addr, uint32_t val)
>>> case 0x03:
>>> break;
>>> case 0x08:
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> if (apic_report_tpr_access) {
>>> cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_WRITE);
>>> }
>>> s->tpr = val;
>>> apic_sync_vapic(s, SYNC_TO_VAPIC);
>>> apic_update_irq(s);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> break;
>>> case 0x09:
>>> case 0x0a:
>>> break;
>>> case 0x0b: /* EOI */
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> apic_eoi(s);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> break;
>>> case 0x0d:
>>> s->log_dest = val>> 24;
>>> @@ -787,8 +829,10 @@ static void apic_mem_writel(void *opaque,
>>> target_phys_addr_t addr, uint32_t val)
>>> s->dest_mode = val>> 28;
>>> break;
>>> case 0x0f:
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> s->spurious_vec = val& 0x1ff;
>>>
>>> apic_update_irq(s);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> break;
>>> case 0x10 ... 0x17:
>>> case 0x18 ... 0x1f:
>>> @@ -807,24 +851,30 @@ static void apic_mem_writel(void *opaque,
>>> target_phys_addr_t addr, uint32_t val)
>>> case 0x32 ... 0x37:
>>> {
>>> int n = index - 0x32;
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> s->lvt[n] = val;
>>> if (n == APIC_LVT_TIMER)
>>> apic_timer_update(s, qemu_get_clock_ns(vm_clock));
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> }
>>> break;
>>> case 0x38:
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> s->initial_count = val;
>>> s->initial_count_load_time = qemu_get_clock_ns(vm_clock);
>>> apic_timer_update(s, s->initial_count_load_time);
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> break;
>>> case 0x39:
>>> break;
>>> case 0x3e:
>>> {
>>> int v;
>>> + qemu_mutex_lock_cpu(s->cpu_env);
>>> s->divide_conf = val& 0xb;
>>> v = (s->divide_conf& 3) | ((s->divide_conf>> 1)& 4);
>>> s->count_shift = (v + 1)& 7;
>>>
>>> + qemu_mutex_unlock_cpu(s->cpu_env);
>>> }
>>> break;
>>> default:
>>> diff --git a/hw/pc.c b/hw/pc.c
>>> index 4d34a33..5e7350c 100644
>>> --- a/hw/pc.c
>>> +++ b/hw/pc.c
>>> @@ -147,7 +147,7 @@ void cpu_smm_update(CPUX86State *env)
>>> smm_set(!!(env->hflags& HF_SMM_MASK), smm_arg);
>>>
>>> }
>>>
>>> -
>>> +/* Called by kvm_cpu_exec(), already with lock protection */
>>> /* IRQ handling */
>>> int cpu_get_pic_interrupt(CPUX86State *env)
>>> {
>>> @@ -173,9 +173,15 @@ static void pic_irq_request(void *opaque, int irq,
>>> int level)
>>> DPRINTF("pic_irqs: %s irq %d\n", level? "raise" : "lower", irq);
>>> if (env->apic_state) {
>>> while (env) {
>>> + if (!qemu_cpu_is_self(env)) {
>>> + qemu_mutex_lock_cpu(env);
>>> + }
>>> if (apic_accept_pic_intr(env->apic_state)) {
>>> apic_deliver_pic_intr(env->apic_state, level);
>>> }
>>> + if (!qemu_cpu_is_self(env)) {
>>> + qemu_mutex_unlock_cpu(env);
>>> + }
>>> env = env->next_cpu;
>>> }
>>> } else {
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index 9b73ccf..dc9aa54 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -29,6 +29,7 @@
>>> #include "bswap.h"
>>> #include "memory.h"
>>> #include "exec-memory.h"
>>> +#include "qemu-thread.h"
>>>
>>> /* This check must be after config-host.h is included */
>>> #ifdef CONFIG_EVENTFD
>>> @@ -1252,12 +1253,15 @@ int kvm_cpu_exec(CPUArchState *env)
>>> */
>>> qemu_cpu_kick_self();
>>> }
>>> - qemu_mutex_unlock_iothread();
>>> + qemu_mutex_unlock(env->cpu_lock);
>>>
>>> run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>
>>> - qemu_mutex_lock_iothread();
>>> + qemu_mutex_lock(env->cpu_lock);
>>> kvm_arch_post_run(env, run);
>>> + qemu_mutex_unlock_cpu(env);
>>> +
>>> + qemu_mutex_lock_iothread();
>>>
>>> kvm_flush_coalesced_mmio_buffer();
>>>
>>> @@ -1265,6 +1269,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>> if (run_ret == -EINTR || run_ret == -EAGAIN) {
>>> DPRINTF("io window exit\n");
>>> ret = EXCP_INTERRUPT;
>>> + qemu_mutex_unlock_iothread();
>>> + qemu_mutex_lock_cpu(env);
>>> break;
>>> }
>>> fprintf(stderr, "error: kvm run failed %s\n",
>>> @@ -1312,6 +1318,9 @@ int kvm_cpu_exec(CPUArchState *env)
>>> ret = kvm_arch_handle_exit(env, run);
>>> break;
>>> }
>>> +
>>> + qemu_mutex_unlock_iothread();
>>> + qemu_mutex_lock_cpu(env);
>>> } while (ret == 0);
>>
>>
>> This really confuses me.
>>
>> Shouldn't we be moving qemu_mutex_unlock_iothread() to the very top of
>> kvm_cpu_exec()?
>>
> Sorry, the code is not arranged well enough, but I think the final
> style will be like this:
> do {
> qemu_mutex_lock_iothread()
> kvm_flush_coalesced_mmio_buffer();
> qemu_mutex_unlock_iothread()
>
> switch (run->exit_reason) {
> case KVM_EXIT_IO:
> qemu_mutex_lock_iothread()
> kvm_handle_io();
> qemu_mutex_unlock_iothread()
> break;
> case KVM_EXIT_MMIO:
> qemu_mutex_lock_iothread()
> cpu_physical_memory_rw()
> qemu_mutex_unlock_iothread()
> break;
> ................
> }while()
>
> Right? Can we push the qemu_mutex_lock_iothread out of the do{}?
>
>> There's only a fixed amount of state that gets manipulated too in
>> kvm_cpu_exec() so shouldn't we try to specifically describe what state is
>> covered by cpu_lock?
>>
> In the current code, apart from run_on_cpu() writing env->queued_work_last
> while flush_queued_work() reads it, the BQL protects the CPUArchState and
> the DeviceState from the other threads. As to the CPUArchState, the
> emulated registers are local. So the only part that can be accessed by
> other threads is the irq emulation.
> These components, including env's
> (interrupt_request, eflags, halted, exit_request, exception_injected), are
> all decided by env->apic_state, so we consider them an
> atomic context; that is to say, while these fields of env are being
> updated, we do not allow apic_state to change.
>
>> What's your strategy for ensuring that all accesses to that state is
>> protected by cpu_lock?
>>
> The cpu_lock is a lock for env->apic_state and all related irq
> emulation context. And the apic_state is the only entrance through
> which other threads can affect this env's irq emulation context. So
> when other threads want to access this_env->apic_state, they must
> first acquire the cpu_lock.
>
> Thanks and regards,
> pingfan
>> Regards,
>>
>> Anthony Liguori
>>
>>>
>>> if (ret< 0) {
>>
>>