qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 2/2] kvm: use per-cpu lock to free vcpu thread o


From: liu ping fan
Subject: Re: [Qemu-devel] [PATCH 2/2] kvm: use per-cpu lock to free vcpu thread out of the big lock
Date: Sun, 24 Jun 2012 22:09:41 +0800

With this patch, the iothread lock will only be held around
1.kvm_flush_coalesced_mmio_buffer();
2.kvm_handle_io();
3.cpu_physical_memory_rw()

And I think the cpu_lock can be treated as the apic_state
device's lock; the apic is the first device that we move out of the
iothread lock.

Regards,
pingfan

On Sat, Jun 23, 2012 at 5:32 PM, liu ping fan <address@hidden> wrote:
> On Sat, Jun 23, 2012 at 4:06 AM, Anthony Liguori <address@hidden> wrote:
>> On 06/21/2012 09:49 AM, Liu Ping Fan wrote:
>>>
>>> In order to break the big lock, using per-cpu_lock in kvm_cpu_exec()
>>> to protect the race from other cpu's access to env->apic_state&  related
>>>
>>> field in env.
>>> Also, we need to protect against run_on_cpu().
>>>
>>> Race condition can be like this:
>>> 1.  vcpu-1 IPI vcpu-2
>>>     vcpu-3 IPI vcpu-2
>>>     Open window exists for accessing to vcpu-2's apic_state&  env
>>>
>>>
>>> 2. run_on_cpu() write env->queued_work_last, while flush_queued_work()
>>>    read
>>>
>>> Signed-off-by: Liu Ping Fan<address@hidden>
>>> ---
>>>  cpus.c    |    6 ++++--
>>>  hw/apic.c |   58
>>> ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
>>>  hw/pc.c   |    8 +++++++-
>>>  kvm-all.c |   13 +++++++++++--
>>>  4 files changed, 76 insertions(+), 9 deletions(-)
>>>
>>> diff --git a/cpus.c b/cpus.c
>>> index 554f7bc..ac99afe 100644
>>> --- a/cpus.c
>>> +++ b/cpus.c
>>> @@ -649,6 +649,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void
>>> *data), void *data)
>>>
>>>      wi.func = func;
>>>      wi.data = data;
>>> +    qemu_mutex_lock(env->cpu_lock);
>>>      if (!env->queued_work_first) {
>>>          env->queued_work_first =&wi;
>>>      } else {
>>> @@ -657,6 +658,7 @@ void run_on_cpu(CPUArchState *env, void (*func)(void
>>> *data), void *data)
>>>      env->queued_work_last =&wi;
>>>      wi.next = NULL;
>>>      wi.done = false;
>>> +    qemu_mutex_unlock(env->cpu_lock);
>>>
>>>      qemu_cpu_kick(env);
>>>      while (!wi.done) {
>>> @@ -718,7 +720,7 @@ static void qemu_tcg_wait_io_event(void)
>>>  static void qemu_kvm_wait_io_event(CPUArchState *env)
>>>  {
>>>      while (cpu_thread_is_idle(env)) {
>>> -        qemu_cond_wait(env->halt_cond,&qemu_global_mutex);
>>> +        qemu_cond_wait(env->halt_cond, env->cpu_lock);
>>>      }
>>>
>>>      qemu_kvm_eat_signals(env);
>>> @@ -729,8 +731,8 @@ static void *qemu_kvm_cpu_thread_fn(void *arg)
>>>  {
>>>      CPUArchState *env = arg;
>>>      int r;
>>> +    qemu_mutex_lock_cpu(env);
>>>
>>> -    qemu_mutex_lock(&qemu_global_mutex);
>>>      qemu_thread_get_self(env->thread);
>>>      env->thread_id = qemu_get_thread_id();
>>>      cpu_single_env = env;
>>> diff --git a/hw/apic.c b/hw/apic.c
>>> index 4eeaf88..b999a40 100644
>>> --- a/hw/apic.c
>>> +++ b/hw/apic.c
>>> @@ -22,6 +22,7 @@
>>>  #include "host-utils.h"
>>>  #include "trace.h"
>>>  #include "pc.h"
>>> +#include "qemu-thread.h"
>>>
>>>  #define MAX_APIC_WORDS 8
>>>
>>> @@ -94,6 +95,7 @@ static int get_highest_priority_int(uint32_t *tab)
>>>      return -1;
>>>  }
>>>
>>> +/* Caller must hold the lock */
>>>  static void apic_sync_vapic(APICCommonState *s, int sync_type)
>>>  {
>>>      VAPICState vapic_state;
>>> @@ -141,11 +143,13 @@ static void apic_sync_vapic(APICCommonState *s, int
>>> sync_type)
>>>      }
>>>  }
>>>
>>> +/* Caller must hold lock */
>>>  static void apic_vapic_base_update(APICCommonState *s)
>>>  {
>>>      apic_sync_vapic(s, SYNC_TO_VAPIC);
>>>  }
>>>
>>> +/* Caller must hold the lock */
>>>  static void apic_local_deliver(APICCommonState *s, int vector)
>>>  {
>>>      uint32_t lvt = s->lvt[vector];
>>> @@ -175,9 +179,11 @@ static void apic_local_deliver(APICCommonState *s,
>>> int vector)
>>>              (lvt&  APIC_LVT_LEVEL_TRIGGER))
>>>              trigger_mode = APIC_TRIGGER_LEVEL;
>>>          apic_set_irq(s, lvt&  0xff, trigger_mode);
>>>
>>> +        break;
>>>      }
>>>  }
>>>
>>> +/* Caller must hold the lock */
>>>  void apic_deliver_pic_intr(DeviceState *d, int level)
>>>  {
>>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> @@ -200,9 +206,12 @@ void apic_deliver_pic_intr(DeviceState *d, int level)
>>>      }
>>>  }
>>>
>>> +/* Must hold lock */
>>>  static void apic_external_nmi(APICCommonState *s)
>>>  {
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      apic_local_deliver(s, APIC_LVT_LINT1);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>  }
>>>
>>>  #define foreach_apic(apic, deliver_bitmask, code) \
>>> @@ -215,7 +224,9 @@ static void apic_external_nmi(APICCommonState *s)
>>>                  if (__mask&  (1<<  __j)) {\
>>>
>>>                      apic = local_apics[__i * 32 + __j];\
>>>                      if (apic) {\
>>> +                        qemu_mutex_lock_cpu(apic->cpu_env);\
>>>                          code;\
>>> +                        qemu_mutex_unlock_cpu(apic->cpu_env);\
>>>                      }\
>>>                  }\
>>>              }\
>>> @@ -244,7 +255,9 @@ static void apic_bus_deliver(const uint32_t
>>> *deliver_bitmask,
>>>                  if (d>= 0) {
>>>                      apic_iter = local_apics[d];
>>>                      if (apic_iter) {
>>> +                        qemu_mutex_lock_cpu(apic_iter->cpu_env);
>>>                          apic_set_irq(apic_iter, vector_num,
>>> trigger_mode);
>>> +                        qemu_mutex_unlock_cpu(apic_iter->cpu_env);
>>>                      }
>>>                  }
>>>              }
>>> @@ -293,6 +306,7 @@ void apic_deliver_irq(uint8_t dest, uint8_t dest_mode,
>>> uint8_t delivery_mode,
>>>      apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num,
>>> trigger_mode);
>>>  }
>>>
>>> +/* Caller must hold lock */
>>>  static void apic_set_base(APICCommonState *s, uint64_t val)
>>>  {
>>>      s->apicbase = (val&  0xfffff000) |
>>>
>>> @@ -305,6 +319,7 @@ static void apic_set_base(APICCommonState *s, uint64_t
>>> val)
>>>      }
>>>  }
>>>
>>> +/* caller must hold lock */
>>>  static void apic_set_tpr(APICCommonState *s, uint8_t val)
>>>  {
>>>      /* Updates from cr8 are ignored while the VAPIC is active */
>>> @@ -314,12 +329,14 @@ static void apic_set_tpr(APICCommonState *s, uint8_t
>>> val)
>>>      }
>>>  }
>>>
>>> +/* caller must hold lock */
>>>  static uint8_t apic_get_tpr(APICCommonState *s)
>>>  {
>>>      apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>>      return s->tpr>>  4;
>>>  }
>>>
>>> +/* Caller must hold the lock */
>>>  static int apic_get_ppr(APICCommonState *s)
>>>  {
>>>      int tpr, isrv, ppr;
>>> @@ -348,6 +365,7 @@ static int apic_get_arb_pri(APICCommonState *s)
>>>   * 0  - no interrupt,
>>>   *>0 - interrupt number
>>>   */
>>> +/* Caller must hold cpu_lock */
>>>  static int apic_irq_pending(APICCommonState *s)
>>>  {
>>>      int irrv, ppr;
>>> @@ -363,6 +381,7 @@ static int apic_irq_pending(APICCommonState *s)
>>>      return irrv;
>>>  }
>>>
>>> +/* caller must ensure the lock has been taken */
>>>  /* signal the CPU if an irq is pending */
>>>  static void apic_update_irq(APICCommonState *s)
>>>  {
>>> @@ -380,11 +399,13 @@ static void apic_update_irq(APICCommonState *s)
>>>  void apic_poll_irq(DeviceState *d)
>>>  {
>>>      APICCommonState *s = APIC_COMMON(d);
>>> -
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>>      apic_update_irq(s);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>  }
>>>
>>> +/* caller must hold the lock */
>>>  static void apic_set_irq(APICCommonState *s, int vector_num, int
>>> trigger_mode)
>>>  {
>>>      apic_report_irq_delivered(!get_bit(s->irr, vector_num));
>>> @@ -407,6 +428,7 @@ static void apic_set_irq(APICCommonState *s, int
>>> vector_num, int trigger_mode)
>>>      apic_update_irq(s);
>>>  }
>>>
>>> +/* caller must hold the lock */
>>>  static void apic_eoi(APICCommonState *s)
>>>  {
>>>      int isrv;
>>> @@ -415,6 +437,7 @@ static void apic_eoi(APICCommonState *s)
>>>          return;
>>>      reset_bit(s->isr, isrv);
>>>      if (!(s->spurious_vec&  APIC_SV_DIRECTED_IO)&&  get_bit(s->tmr,
>>> isrv)) {
>>>
>>> +        //fix me,need to take extra lock for the bus?
>>>          ioapic_eoi_broadcast(isrv);
>>>      }
>>>      apic_sync_vapic(s, SYNC_FROM_VAPIC | SYNC_TO_VAPIC);
>>> @@ -480,18 +503,23 @@ static void apic_get_delivery_bitmask(uint32_t
>>> *deliver_bitmask,
>>>  static void apic_startup(APICCommonState *s, int vector_num)
>>>  {
>>>      s->sipi_vector = vector_num;
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      cpu_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>  }
>>>
>>>  void apic_sipi(DeviceState *d)
>>>  {
>>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> -
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      cpu_reset_interrupt(s->cpu_env, CPU_INTERRUPT_SIPI);
>>>
>>> -    if (!s->wait_for_sipi)
>>> +    if (!s->wait_for_sipi) {
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          return;
>>> +    }
>>>      cpu_x86_load_seg_cache_sipi(s->cpu_env, s->sipi_vector);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>      s->wait_for_sipi = 0;
>>>  }
>>>
>>> @@ -543,6 +571,7 @@ static void apic_deliver(DeviceState *d, uint8_t dest,
>>> uint8_t dest_mode,
>>>      apic_bus_deliver(deliver_bitmask, delivery_mode, vector_num,
>>> trigger_mode);
>>>  }
>>>
>>> +/* Caller must take lock */
>>>  int apic_get_interrupt(DeviceState *d)
>>>  {
>>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> @@ -572,6 +601,7 @@ int apic_get_interrupt(DeviceState *d)
>>>      return intno;
>>>  }
>>>
>>> +/* Caller must hold the cpu_lock */
>>>  int apic_accept_pic_intr(DeviceState *d)
>>>  {
>>>      APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d);
>>> @@ -589,6 +619,7 @@ int apic_accept_pic_intr(DeviceState *d)
>>>      return 0;
>>>  }
>>>
>>> +/* Caller hold lock */
>>>  static uint32_t apic_get_current_count(APICCommonState *s)
>>>  {
>>>      int64_t d;
>>> @@ -619,9 +650,10 @@ static void apic_timer_update(APICCommonState *s,
>>> int64_t current_time)
>>>  static void apic_timer(void *opaque)
>>>  {
>>>      APICCommonState *s = opaque;
>>> -
>>> +    qemu_mutex_lock_cpu(s->cpu_env);
>>>      apic_local_deliver(s, APIC_LVT_TIMER);
>>>      apic_timer_update(s, s->next_time);
>>> +    qemu_mutex_unlock_cpu(s->cpu_env);
>>>  }
>>>
>>>  static uint32_t apic_mem_readb(void *opaque, target_phys_addr_t addr)
>>> @@ -664,18 +696,22 @@ static uint32_t apic_mem_readl(void *opaque,
>>> target_phys_addr_t addr)
>>>          val = 0x11 | ((APIC_LVT_NB - 1)<<  16); /* version 0x11 */
>>>          break;
>>>      case 0x08:
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          apic_sync_vapic(s, SYNC_FROM_VAPIC);
>>>          if (apic_report_tpr_access) {
>>>              cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_READ);
>>>          }
>>>          val = s->tpr;
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x09:
>>>          val = apic_get_arb_pri(s);
>>>          break;
>>>      case 0x0a:
>>>          /* ppr */
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          val = apic_get_ppr(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x0b:
>>>          val = 0;
>>> @@ -712,7 +748,9 @@ static uint32_t apic_mem_readl(void *opaque,
>>> target_phys_addr_t addr)
>>>          val = s->initial_count;
>>>          break;
>>>      case 0x39:
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          val = apic_get_current_count(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x3e:
>>>          val = s->divide_conf;
>>> @@ -767,18 +805,22 @@ static void apic_mem_writel(void *opaque,
>>> target_phys_addr_t addr, uint32_t val)
>>>      case 0x03:
>>>          break;
>>>      case 0x08:
>>> +       qemu_mutex_lock_cpu(s->cpu_env);
>>>          if (apic_report_tpr_access) {
>>>              cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_WRITE);
>>>          }
>>>          s->tpr = val;
>>>          apic_sync_vapic(s, SYNC_TO_VAPIC);
>>>          apic_update_irq(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x09:
>>>      case 0x0a:
>>>          break;
>>>      case 0x0b: /* EOI */
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          apic_eoi(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x0d:
>>>          s->log_dest = val>>  24;
>>> @@ -787,8 +829,10 @@ static void apic_mem_writel(void *opaque,
>>> target_phys_addr_t addr, uint32_t val)
>>>          s->dest_mode = val>>  28;
>>>          break;
>>>      case 0x0f:
>>> +        qemu_mutex_lock_cpu(s->cpu_env);
>>>          s->spurious_vec = val&  0x1ff;
>>>
>>>          apic_update_irq(s);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x10 ... 0x17:
>>>      case 0x18 ... 0x1f:
>>> @@ -807,24 +851,30 @@ static void apic_mem_writel(void *opaque,
>>> target_phys_addr_t addr, uint32_t val)
>>>      case 0x32 ... 0x37:
>>>          {
>>>              int n = index - 0x32;
>>> +            qemu_mutex_lock_cpu(s->cpu_env);
>>>              s->lvt[n] = val;
>>>              if (n == APIC_LVT_TIMER)
>>>                  apic_timer_update(s, qemu_get_clock_ns(vm_clock));
>>> +            qemu_mutex_unlock_cpu(s->cpu_env);
>>>          }
>>>          break;
>>>      case 0x38:
>>> +       qemu_mutex_lock_cpu(s->cpu_env);
>>>          s->initial_count = val;
>>>          s->initial_count_load_time = qemu_get_clock_ns(vm_clock);
>>>          apic_timer_update(s, s->initial_count_load_time);
>>> +        qemu_mutex_unlock_cpu(s->cpu_env);
>>>          break;
>>>      case 0x39:
>>>          break;
>>>      case 0x3e:
>>>          {
>>>              int v;
>>> +            qemu_mutex_lock_cpu(s->cpu_env);
>>>              s->divide_conf = val&  0xb;
>>>              v = (s->divide_conf&  3) | ((s->divide_conf>>  1)&  4);
>>>              s->count_shift = (v + 1)&  7;
>>>
>>> +            qemu_mutex_unlock_cpu(s->cpu_env);
>>>          }
>>>          break;
>>>      default:
>>> diff --git a/hw/pc.c b/hw/pc.c
>>> index 4d34a33..5e7350c 100644
>>> --- a/hw/pc.c
>>> +++ b/hw/pc.c
>>> @@ -147,7 +147,7 @@ void cpu_smm_update(CPUX86State *env)
>>>          smm_set(!!(env->hflags&  HF_SMM_MASK), smm_arg);
>>>
>>>  }
>>>
>>> -
>>> +/* Called by kvm_cpu_exec(), already with lock protection */
>>>  /* IRQ handling */
>>>  int cpu_get_pic_interrupt(CPUX86State *env)
>>>  {
>>> @@ -173,9 +173,15 @@ static void pic_irq_request(void *opaque, int irq,
>>> int level)
>>>      DPRINTF("pic_irqs: %s irq %d\n", level? "raise" : "lower", irq);
>>>      if (env->apic_state) {
>>>          while (env) {
>>> +            if (!qemu_cpu_is_self(env)) {
>>> +                qemu_mutex_lock_cpu(env);
>>> +            }
>>>              if (apic_accept_pic_intr(env->apic_state)) {
>>>                  apic_deliver_pic_intr(env->apic_state, level);
>>>              }
>>> +            if (!qemu_cpu_is_self(env)) {
>>> +                qemu_mutex_unlock_cpu(env);
>>> +            }
>>>              env = env->next_cpu;
>>>          }
>>>      } else {
>>> diff --git a/kvm-all.c b/kvm-all.c
>>> index 9b73ccf..dc9aa54 100644
>>> --- a/kvm-all.c
>>> +++ b/kvm-all.c
>>> @@ -29,6 +29,7 @@
>>>  #include "bswap.h"
>>>  #include "memory.h"
>>>  #include "exec-memory.h"
>>> +#include "qemu-thread.h"
>>>
>>>  /* This check must be after config-host.h is included */
>>>  #ifdef CONFIG_EVENTFD
>>> @@ -1252,12 +1253,15 @@ int kvm_cpu_exec(CPUArchState *env)
>>>               */
>>>              qemu_cpu_kick_self();
>>>          }
>>> -        qemu_mutex_unlock_iothread();
>>> +        qemu_mutex_unlock(env->cpu_lock);
>>>
>>>          run_ret = kvm_vcpu_ioctl(env, KVM_RUN, 0);
>>>
>>> -        qemu_mutex_lock_iothread();
>>> +        qemu_mutex_lock(env->cpu_lock);
>>>          kvm_arch_post_run(env, run);
>>> +        qemu_mutex_unlock_cpu(env);
>>> +
>>> +        qemu_mutex_lock_iothread();
>>>
>>>          kvm_flush_coalesced_mmio_buffer();
>>>
>>> @@ -1265,6 +1269,8 @@ int kvm_cpu_exec(CPUArchState *env)
>>>              if (run_ret == -EINTR || run_ret == -EAGAIN) {
>>>                  DPRINTF("io window exit\n");
>>>                  ret = EXCP_INTERRUPT;
>>> +                qemu_mutex_unlock_iothread();
>>> +                qemu_mutex_lock_cpu(env);
>>>                  break;
>>>              }
>>>              fprintf(stderr, "error: kvm run failed %s\n",
>>> @@ -1312,6 +1318,9 @@ int kvm_cpu_exec(CPUArchState *env)
>>>              ret = kvm_arch_handle_exit(env, run);
>>>              break;
>>>          }
>>> +
>>> +        qemu_mutex_unlock_iothread();
>>> +        qemu_mutex_lock_cpu(env);
>>>      } while (ret == 0);
>>
>>
>> This really confuses me.
>>
>> Shouldn't we be moving qemu_mutex_unlock_iothread() to the very top of
>> kvm_cpu_exec()?
>>
> Sorry, the code is not arranged good enough, but I think the final
> style will be like this:
>  do {
>        qemu_mutex_lock_iothread()
>        kvm_flush_coalesced_mmio_buffer();
>        qemu_mutex_unlock_iothread()
>
>        switch (run->exit_reason) {
>        case KVM_EXIT_IO:
>            qemu_mutex_lock_iothread()
>            kvm_handle_io();
>            qemu_mutex_unlock_iothread()
>            break;
>        case KVM_EXIT_MMIO:
>            qemu_mutex_lock_iothread()
>            cpu_physical_memory_rw()
>            qemu_mutex_unlock_iothread()
>            break;
>        ................
>  }while()
>
> Right? Can we push the qemu_mutex_lock_iothread out of the do{}?
>
>> There's only a fixed amount of state that gets manipulated too in
>> kvm_cpu_exec() so shouldn't we try to specifically describe what state is
>> covered by cpu_lock?
>>
> In current code, apart from run_on_cpu() writes env->queued_work_last,
> while flush_queued_work() reads, the BQL protects the CPUArchState and
> the deviceState from the other threads. As to the CPUArchState, the
> emulated registers are local. So the only part that can be accessed by
> other threads is the irq emulation.
> These component including env's
> (interrupt_request,eflags,halted,exit_request,exception_injected), all
> of them are decided by env->apic_state, so we consider them an
> atomic context; that is to say, while updating these fields of env,
> we do not allow apic_state to change.
>
>> What's your strategy for ensuring that all accesses to that state is
>> protected by cpu_lock?
>>
> The cpu_lock is a lock for env->apic_state and all of the related irq
> emulation context. The apic_state is the only entrance through
> which other threads can affect this env's irq emulation context. So
> when other threads want to access this_env->apic_state, they must
> first acquire the cpu_lock.
>
> Thanks and regards,
> pingfan
>> Regards,
>>
>> Anthony Liguori
>>
>>>
>>>      if (ret<  0) {
>>
>>



reply via email to

[Prev in Thread] Current Thread [Next in Thread]