[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH] Fixes related to processing of qemu's -numa opt
From: |
Eduardo Habkost |
Subject: |
Re: [Qemu-devel] [PATCH] Fixes related to processing of qemu's -numa option |
Date: |
Mon, 18 Jun 2012 17:29:56 -0300 |
User-agent: |
Mutt/1.5.21 (2010-09-15) |
On Sun, Jun 17, 2012 at 01:12:31PM -0700, Chegu Vinod wrote:
> The -numa option to qemu is used to create [fake] numa nodes
> and expose them to the guest OS instance.
>
> There are a couple of issues with the -numa option:
>
> a) Max VCPU's that can be specified for a guest while using
> the qemu's -numa option is 64. Due to a typecasting issue
> when the number of VCPUs is > 32 the VCPUs don't show up
> under the specified [fake] numa nodes.
>
> b) KVM currently has support for 160VCPUs per guest. The
> qemu's -numa option has only support for upto 64VCPUs
> per guest.
>
> This patch addresses these two issues. [ Note: This
> patch has been verified by Eduardo Habkost ].
I was going to add a Tested-by line, but this patch breaks the automatic
round-robin assignment when no CPU is added to any node on the
command-line.
Additional questions below:
[...]
> diff --git a/cpus.c b/cpus.c
> index b182b3d..f9cee60 100644
> --- a/cpus.c
> +++ b/cpus.c
> @@ -1145,7 +1145,7 @@ void set_numa_modes(void)
>
> for (env = first_cpu; env != NULL; env = env->next_cpu) {
> for (i = 0; i < nb_numa_nodes; i++) {
> - if (node_cpumask[i] & (1 << env->cpu_index)) {
> + if (CPU_ISSET_S(env->cpu_index, cpumask_size, node_cpumask[i])) {
> env->numa_node = i;
> }
> }
> diff --git a/hw/pc.c b/hw/pc.c
> index 8368701..f0c3665 100644
> --- a/hw/pc.c
> +++ b/hw/pc.c
> @@ -639,7 +639,7 @@ static void *bochs_bios_init(void)
> numa_fw_cfg[0] = cpu_to_le64(nb_numa_nodes);
> for (i = 0; i < max_cpus; i++) {
> for (j = 0; j < nb_numa_nodes; j++) {
> - if (node_cpumask[j] & (1 << i)) {
> + if (CPU_ISSET_S(i, cpumask_size, node_cpumask[j])) {
> numa_fw_cfg[i + 1] = cpu_to_le64(j);
> break;
> }
> diff --git a/sysemu.h b/sysemu.h
> index bc2c788..6e4d342 100644
> --- a/sysemu.h
> +++ b/sysemu.h
> @@ -9,6 +9,7 @@
> #include "qapi-types.h"
> #include "notify.h"
> #include "main-loop.h"
> +#include <sched.h>
>
> /* vl.c */
>
> @@ -133,9 +134,11 @@ extern uint8_t qemu_extra_params_fw[2];
> extern QEMUClock *rtc_clock;
>
> #define MAX_NODES 64
> +#define KVM_MAX_VCPUS 254
Do we really need this constant? Why not just use max_cpus when
allocating the CPU sets, instead?
> extern int nb_numa_nodes;
> extern uint64_t node_mem[MAX_NODES];
> -extern uint64_t node_cpumask[MAX_NODES];
> +extern cpu_set_t *node_cpumask[MAX_NODES];
> +extern size_t cpumask_size;
>
> #define MAX_OPTION_ROMS 16
> typedef struct QEMUOptionRom {
> diff --git a/vl.c b/vl.c
> index 204d85b..1906412 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -28,6 +28,7 @@
> #include <errno.h>
> #include <sys/time.h>
> #include <zlib.h>
> +#include <sched.h>
>
> /* Needed early for CONFIG_BSD etc. */
> #include "config-host.h"
> @@ -240,7 +241,8 @@ QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
> QTAILQ_HEAD_INITIALIZER(fw_boot_order
>
> int nb_numa_nodes;
> uint64_t node_mem[MAX_NODES];
> -uint64_t node_cpumask[MAX_NODES];
> +cpu_set_t *node_cpumask[MAX_NODES];
> +size_t cpumask_size;
>
> uint8_t qemu_uuid[16];
>
> @@ -950,6 +952,9 @@ static void numa_add(const char *optarg)
> char *endptr;
> unsigned long long value, endvalue;
> int nodenr;
> + int i;
> +
> + value = endvalue = 0;
>
> optarg = get_opt_name(option, 128, optarg, ',') + 1;
> if (!strcmp(option, "node")) {
> @@ -970,27 +975,17 @@ static void numa_add(const char *optarg)
> }
> node_mem[nodenr] = sval;
> }
> - if (get_param_value(option, 128, "cpus", optarg) == 0) {
> - node_cpumask[nodenr] = 0;
> - } else {
> + if (get_param_value(option, 128, "cpus", optarg) != 0) {
> value = strtoull(option, &endptr, 10);
> - if (value >= 64) {
> - value = 63;
> - fprintf(stderr, "only 64 CPUs in NUMA mode supported.\n");
> + if (*endptr == '-') {
> + endvalue = strtoull(endptr+1, &endptr, 10);
> } else {
> - if (*endptr == '-') {
> - endvalue = strtoull(endptr+1, &endptr, 10);
> - if (endvalue >= 63) {
> - endvalue = 62;
> - fprintf(stderr,
> - "only 63 CPUs in NUMA mode supported.\n");
> - }
> - value = (2ULL << endvalue) - (1ULL << value);
> - } else {
> - value = 1ULL << value;
> - }
> + endvalue = value;
> + }
> +
> + for (i = value; i <= endvalue; ++i) {
> + CPU_SET_S(i, cpumask_size, node_cpumask[nodenr]);
What if endvalue is larger than the cpu mask size we allocated?
> }
> - node_cpumask[nodenr] = value;
> }
> nb_numa_nodes++;
> }
> @@ -2331,7 +2326,9 @@ int main(int argc, char **argv, char **envp)
>
> for (i = 0; i < MAX_NODES; i++) {
> node_mem[i] = 0;
> - node_cpumask[i] = 0;
> + node_cpumask[i] = CPU_ALLOC(KVM_MAX_VCPUS);
> + cpumask_size = CPU_ALLOC_SIZE(KVM_MAX_VCPUS);
> + CPU_ZERO_S(cpumask_size, node_cpumask[i]);
> }
>
> nb_numa_nodes = 0;
> @@ -3465,8 +3462,9 @@ int main(int argc, char **argv, char **envp)
> }
>
> for (i = 0; i < nb_numa_nodes; i++) {
> - if (node_cpumask[i] != 0)
> + if (node_cpumask[i] != NULL) {
This will be true for every node (as you preallocate all the CPU
sets in the beginning), so the loop will always end with i==0, in turn
unconditionally disabling the round-robin CPU assignment code below.
You can use CPU_COUNT_S, instead.
> break;
> + }
> }
> /* assigning the VCPUs round-robin is easier to implement, guest OSes
> * must cope with this anyway, because there are BIOSes out there in
> @@ -3474,7 +3472,7 @@ int main(int argc, char **argv, char **envp)
> */
> if (i == nb_numa_nodes) {
> for (i = 0; i < max_cpus; i++) {
> - node_cpumask[i % nb_numa_nodes] |= 1 << i;
> + CPU_SET_S(i, cpumask_size, node_cpumask[i % nb_numa_nodes]);
> }
> }
> }
> --
> 1.7.1
>
--
Eduardo