qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v5 4/4] spapr: Add Hcalls to support PAPR NVDIMM device


From: David Gibson
Subject: Re: [PATCH v5 4/4] spapr: Add Hcalls to support PAPR NVDIMM device
Date: Tue, 4 Feb 2020 15:09:15 +1100

On Thu, Jan 30, 2020 at 05:48:28AM -0600, Shivaprasad G Bhat wrote:
> This patch implements a few of the necessary hcalls for the nvdimm support.
> 
> PAPR semantics are such that each NVDIMM device comprises multiple
> SCM (Storage Class Memory) blocks. The guest requests the hypervisor to
> bind each of the SCM blocks of the NVDIMM device using hcalls. There can
> be SCM block unbind requests in case of driver errors or unplug (not
> supported now) use cases. The NVDIMM label reads/writes are done through
> hcalls.
> 
> Since each virtual NVDIMM device is divided into multiple SCM blocks,
> the bind, unbind, and queries using hcalls on those blocks can come
> independently. This doesn't fit well into the qemu device semantics,
> where the map/unmap are done at the (whole)device/object level granularity.
> The patch doesn't actually bind/unbind on hcalls but lets it happen at the
> device_add/del phase itself instead.
> 
> The guest kernel makes bind/unbind requests for the virtual NVDIMM device
> at the region level granularity. Without interleaving, each virtual NVDIMM
> device is presented as a separate guest physical address range. So, there
> is no way a partial bind/unbind request can come for the vNVDIMM in a
> hcall for a subset of SCM blocks of a virtual NVDIMM. Hence it is safe to
> do bind/unbind everything during the device_add/del.
> 
> Signed-off-by: Shivaprasad G Bhat <address@hidden>

LGTM, apart from some minor nits noted below.

> ---
>  hw/ppc/Makefile.objs   |    2 
>  hw/ppc/spapr_nvdimm.c  |  327 ++++++++++++++++++++++++++++++++++++++++++++++++
>  include/hw/ppc/spapr.h |    8 +
>  3 files changed, 335 insertions(+), 2 deletions(-)
>  create mode 100644 hw/ppc/spapr_nvdimm.c
> 
> diff --git a/hw/ppc/Makefile.objs b/hw/ppc/Makefile.objs
> index a4bac57be6..c3d3cc56eb 100644
> --- a/hw/ppc/Makefile.objs
> +++ b/hw/ppc/Makefile.objs
> @@ -7,7 +7,7 @@ obj-$(CONFIG_PSERIES) += spapr.o spapr_caps.o spapr_vio.o 
> spapr_events.o
>  obj-$(CONFIG_PSERIES) += spapr_hcall.o spapr_iommu.o spapr_rtas.o
>  obj-$(CONFIG_PSERIES) += spapr_pci.o spapr_rtc.o spapr_drc.o
>  obj-$(CONFIG_PSERIES) += spapr_cpu_core.o spapr_ovec.o spapr_irq.o
> -obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o
> +obj-$(CONFIG_PSERIES) += spapr_tpm_proxy.o spapr_nvdimm.o
>  obj-$(CONFIG_SPAPR_RNG) +=  spapr_rng.o
>  obj-$(call land,$(CONFIG_PSERIES),$(CONFIG_LINUX)) += spapr_pci_vfio.o 
> spapr_pci_nvlink2.o
>  # IBM PowerNV
> diff --git a/hw/ppc/spapr_nvdimm.c b/hw/ppc/spapr_nvdimm.c
> new file mode 100644
> index 0000000000..8d1c2dc009
> --- /dev/null
> +++ b/hw/ppc/spapr_nvdimm.c

It'd be nice to introduce this file in the previous patch and try to
keep as much of the NVDIMM code together as possible, rather than
bloating spapr.c even further.

> @@ -0,0 +1,327 @@
> +/*
> + * QEMU PAPR Storage Class Memory Interfaces
> + *
> + * Copyright (c) 2019-2020, IBM Corporation.
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a copy
> + * of this software and associated documentation files (the "Software"), to deal
> + * in the Software without restriction, including without limitation the rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "hw/ppc/spapr.h"
> +#include "hw/ppc/spapr_drc.h"
> +#include "hw/mem/nvdimm.h"
> +#include "qemu/range.h"
> +#include "qemu/nvdimm-utils.h"
> +
> +static target_ulong h_scm_read_metadata(PowerPCCPU *cpu,
> +                                        SpaprMachineState *spapr,
> +                                        target_ulong opcode,
> +                                        target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t offset = args[1];
> +    uint64_t numBytesToRead = args[2];

That's a really long name for a local.  How about just 'size' or 'len'?

> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    NVDIMMDevice *nvdimm;
> +    NVDIMMClass *ddc;
> +    uint64_t data = 0;
> +    uint8_t buf[8] = { 0 };
> +
> +    if (!drc || !drc->dev ||
> +        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (numBytesToRead != 1 && numBytesToRead != 2 &&
> +        numBytesToRead != 4 && numBytesToRead != 8) {
> +        return H_P3;
> +    }
> +
> +    nvdimm = NVDIMM(drc->dev);
> +    if ((offset + numBytesToRead < offset) ||
> +        (nvdimm->label_size < numBytesToRead + offset)) {
> +        return H_P2;
> +    }
> +
> +    ddc = NVDIMM_GET_CLASS(nvdimm);
> +    ddc->read_label_data(nvdimm, buf, numBytesToRead, offset);
> +
> +    switch (numBytesToRead) {
> +    case 1:
> +        data = ldub_p(buf);
> +        break;
> +    case 2:
> +        data = lduw_be_p(buf);
> +        break;
> +    case 4:
> +        data = ldl_be_p(buf);
> +        break;
> +    case 8:
> +        data = ldq_be_p(buf);
> +        break;
> +    default:
> +        g_assert_not_reached();
> +    }
> +
> +    args[0] = data;
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_scm_write_metadata(PowerPCCPU *cpu,
> +                                         SpaprMachineState *spapr,
> +                                         target_ulong opcode,
> +                                         target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t offset = args[1];
> +    uint64_t data = args[2];
> +    uint64_t numBytesToWrite = args[3];
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    NVDIMMDevice *nvdimm;
> +    NVDIMMClass *ddc;
> +    uint8_t buf[8] = { 0 };
> +
> +    if (!drc || !drc->dev ||
> +        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    if (numBytesToWrite != 1 && numBytesToWrite != 2 &&
> +        numBytesToWrite != 4 && numBytesToWrite != 8) {
> +        return H_P4;
> +    }
> +
> +    nvdimm = NVDIMM(drc->dev);
> +    if ((offset + numBytesToWrite < offset) ||
> +        (nvdimm->label_size < numBytesToWrite + offset)) {
> +        return H_P2;
> +    }
> +
> +    switch (numBytesToWrite) {
> +    case 1:
> +        if (data & 0xffffffffffffff00) {
> +            return H_P2;
> +        }
> +        stb_p(buf, data);
> +        break;
> +    case 2:
> +        if (data & 0xffffffffffff0000) {
> +            return H_P2;
> +        }
> +        stw_be_p(buf, data);
> +        break;
> +    case 4:
> +        if (data & 0xffffffff00000000) {
> +            return H_P2;
> +        }
> +        stl_be_p(buf, data);
> +        break;
> +    case 8:
> +        stq_be_p(buf, data);
> +        break;
> +    default:
> +            g_assert_not_reached();
> +    }
> +
> +    ddc = NVDIMM_GET_CLASS(nvdimm);
> +    ddc->write_label_data(nvdimm, buf, numBytesToWrite, offset);
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
> +                                   target_ulong opcode, target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t starting_idx = args[1];
> +    uint64_t no_of_scm_blocks_to_bind = args[2];
> +    uint64_t target_logical_mem_addr = args[3];
> +    uint64_t continue_token = args[4];
> +    uint64_t size;
> +    uint64_t total_no_of_scm_blocks;
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    hwaddr addr;
> +    NVDIMMDevice *nvdimm;
> +
> +    if (!drc || !drc->dev ||
> +        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    /*
> +     * Currently the continue token should be zero, as qemu has already
> +     * bound everything and this hcall doesn't return H_BUSY.
> +     */
> +    if (continue_token > 0) {
> +        return H_P5;
> +    }
> +
> +    /* Currently qemu assigns the address. */
> +    if (target_logical_mem_addr != 0xffffffffffffffff) {
> +        return H_OVERLAP;
> +    }
> +
> +    nvdimm = NVDIMM(drc->dev);
> +
> +    size = object_property_get_uint(OBJECT(nvdimm),
> +                                    PC_DIMM_SIZE_PROP, &error_abort);
> +
> +    total_no_of_scm_blocks = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +
> +    if (starting_idx > total_no_of_scm_blocks) {
> +        return H_P2;
> +    }
> +
> +    if (((starting_idx + no_of_scm_blocks_to_bind) < starting_idx) ||
> +        ((starting_idx + no_of_scm_blocks_to_bind) > 
> total_no_of_scm_blocks)) {
> +        return H_P3;
> +    }
> +
> +    addr = object_property_get_uint(OBJECT(nvdimm),
> +                                    PC_DIMM_ADDR_PROP, &error_abort);
> +
> +    addr += starting_idx * SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +
> +    /* Already bound, Return target logical address in R5 */
> +    args[1] = addr;
> +    args[2] = no_of_scm_blocks_to_bind;
> +
> +    return H_SUCCESS;
> +}
> +
> +static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState 
> *spapr,
> +                                     target_ulong opcode, target_ulong *args)
> +{
> +    uint32_t drc_index = args[0];
> +    uint64_t starting_scm_logical_addr = args[1];
> +    uint64_t no_of_scm_blocks_to_unbind = args[2];
> +    uint64_t continue_token = args[3];
> +    uint64_t size_to_unbind;
> +    Range blockrange = range_empty;
> +    Range nvdimmrange = range_empty;
> +    SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +    NVDIMMDevice *nvdimm;
> +    uint64_t size, addr;
> +
> +    if (!drc || !drc->dev ||
> +        spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +        return H_PARAMETER;
> +    }
> +
> +    /* continue_token should be zero as this hcall doesn't return H_BUSY. */
> +    if (continue_token > 0) {
> +        return H_P4;
> +    }
> +
> +    /* Check if starting_scm_logical_addr is block aligned */
> +    if (!QEMU_IS_ALIGNED(starting_scm_logical_addr,
> +                         SPAPR_MINIMUM_SCM_BLOCK_SIZE)) {
> +        return H_P2;
> +    }
> +
> +    size_to_unbind = no_of_scm_blocks_to_unbind * 
> SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +    if (no_of_scm_blocks_to_unbind == 0 || no_of_scm_blocks_to_unbind !=
> +                               size_to_unbind / 
> SPAPR_MINIMUM_SCM_BLOCK_SIZE) {
> +        return H_P3;
> +    }
> +
> +    nvdimm = NVDIMM(drc->dev);
> +    size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
> +                                   &error_abort);
> +    addr = object_property_get_int(OBJECT(nvdimm), PC_DIMM_ADDR_PROP,
> +                                   &error_abort);
> +
> +    range_init_nofail(&nvdimmrange, addr, size);
> +    range_init_nofail(&blockrange, starting_scm_logical_addr, 
> size_to_unbind);
> +
> +    if (!range_contains_range(&nvdimmrange, &blockrange)) {
> +        return H_P3;
> +    }
> +
> +    args[1] = no_of_scm_blocks_to_unbind;
> +
> +    /* let unplug take care of actual unbind */
> +    return H_SUCCESS;
> +}
> +
> +#define H_UNBIND_SCOPE_ALL 0x1
> +#define H_UNBIND_SCOPE_DRC 0x2
> +
> +static target_ulong h_scm_unbind_all(PowerPCCPU *cpu, SpaprMachineState 
> *spapr,
> +                                     target_ulong opcode, target_ulong *args)
> +{
> +    uint64_t target_scope = args[0];
> +    uint32_t drc_index = args[1];
> +    uint64_t continue_token = args[2];
> +    NVDIMMDevice *nvdimm;
> +    uint64_t size;
> +    uint64_t no_of_scm_blocks_unbound = 0;
> +
> +    /* continue_token should be zero as this hcall doesn't return H_BUSY. */
> +    if (continue_token > 0) {
> +        return H_P4;
> +    }
> +
> +    if (target_scope == H_UNBIND_SCOPE_DRC) {
> +        SpaprDrc *drc = spapr_drc_by_index(drc_index);
> +
> +        if (!drc || !drc->dev ||
> +            spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
> +            return H_P2;
> +        }
> +
> +        nvdimm = NVDIMM(drc->dev);
> +        size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
> +                                       &error_abort);
> +
> +        no_of_scm_blocks_unbound = size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +    } else if (target_scope ==  H_UNBIND_SCOPE_ALL) {
> +        GSList *list, *nvdimms;
> +
> +        nvdimms = nvdimm_get_device_list();
> +        for (list = nvdimms; list; list = list->next) {
> +            nvdimm = list->data;
> +            size = object_property_get_int(OBJECT(nvdimm), PC_DIMM_SIZE_PROP,
> +                                           &error_abort);
> +
> +            no_of_scm_blocks_unbound += size / SPAPR_MINIMUM_SCM_BLOCK_SIZE;
> +        }
> +        g_slist_free(nvdimms);
> +    } else {
> +        return H_PARAMETER;
> +    }
> +
> +    args[1] = no_of_scm_blocks_unbound;
> +
> +    /* let unplug take care of actual unbind */
> +    return H_SUCCESS;
> +}
> +
> +static void spapr_scm_register_types(void)
> +{
> +    /* qemu/scm specific hcalls */
> +    spapr_register_hypercall(H_SCM_READ_METADATA, h_scm_read_metadata);
> +    spapr_register_hypercall(H_SCM_WRITE_METADATA, h_scm_write_metadata);
> +    spapr_register_hypercall(H_SCM_BIND_MEM, h_scm_bind_mem);
> +    spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
> +    spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all);
> +}
> +
> +type_init(spapr_scm_register_types)
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index ed2de4bae5..633ff5202b 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -287,6 +287,7 @@ struct SpaprMachineState {
>  #define H_P7              -60
>  #define H_P8              -61
>  #define H_P9              -62
> +#define H_OVERLAP         -68
>  #define H_UNSUPPORTED_FLAG -256
>  #define H_MULTI_THREADS_ACTIVE -9005
>  
> @@ -494,8 +495,13 @@ struct SpaprMachineState {
>  #define H_INT_ESB               0x3C8
>  #define H_INT_SYNC              0x3CC
>  #define H_INT_RESET             0x3D0
> +#define H_SCM_READ_METADATA     0x3E4
> +#define H_SCM_WRITE_METADATA    0x3E8
> +#define H_SCM_BIND_MEM          0x3EC
> +#define H_SCM_UNBIND_MEM        0x3F0
> +#define H_SCM_UNBIND_ALL        0x3FC
>  
> -#define MAX_HCALL_OPCODE        H_INT_RESET
> +#define MAX_HCALL_OPCODE        H_SCM_UNBIND_ALL
>  
>  /* The hcalls above are standardized in PAPR and implemented by pHyp
>   * as well.
> 

-- 
David Gibson                    | I'll have my music baroque, and my code
david AT gibson.dropbear.id.au  | minimalist, thank you.  NOT _the_ _other_
                                | _way_ _around_!
http://www.ozlabs.org/~dgibson

Attachment: signature.asc
Description: PGP signature


reply via email to

[Prev in Thread] Current Thread [Next in Thread]