qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 2/2] umem: chardevice for kvm postcopy


From: Avi Kivity
Subject: Re: [Qemu-devel] [PATCH 2/2] umem: chardevice for kvm postcopy
Date: Thu, 29 Dec 2011 13:17:51 +0200
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:8.0) Gecko/20111115 Thunderbird/8.0

On 12/29/2011 03:26 AM, Isaku Yamahata wrote:
> This is a character device to hook page access.
> The page fault in the area is reported to another user process by
> this chardriver. Then, the process fills the page contents and
> resolves the page fault.
>
>  
> +config UMEM
> +        tristate "/dev/umem user process backed memory support"

Indentation nit: tab/space inconsistency on this line — use a tab here.

> +     default n
> +     help
> +       User process backed memory driver provides /dev/umem device.
> +       The /dev/umem device is designed for some sort of distributed
> +       shared memory. Especially post-copy live migration with KVM.
> +       When in doubt, say "N".
> +

Need documentation of the protocol between the kernel and userspace; not
just the ioctls, but also how faults are propagated.

> +
> +struct umem_page_req_list {
> +     struct list_head list;
> +     pgoff_t pgoff;
> +};
> +
>
> +
> +
> +static int umem_mark_page_cached(struct umem *umem,
> +                              struct umem_page_cached *page_cached)
> +{
> +     int ret = 0;
> +#define PG_MAX       ((__u32)32)
> +     __u64 pgoffs[PG_MAX];
> +     __u32 nr;
> +     unsigned long bit;
> +     bool wake_up_list = false;
> +
> +     nr = 0;
> +     while (nr < page_cached->nr) {
> +             __u32 todo = min(PG_MAX, (page_cached->nr - nr));
> +             int i;
> +
> +             if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> +                                sizeof(*pgoffs) * todo)) {
> +                     ret = -EFAULT;
> +                     goto out;
> +             }
> +             for (i = 0; i < todo; ++i) {
> +                     if (pgoffs[i] >= umem->pgoff_end) {
> +                             ret = -EINVAL;
> +                             goto out;
> +                     }
> +                     set_bit(pgoffs[i], umem->cached);
> +             }
> +             nr += todo;
> +     }
> +

Probably need an smp_wmb() here.

> +     spin_lock(&umem->lock);
> +     bit = 0;
> +     for (;;) {
> +             bit = find_next_bit(umem->sync_wait_bitmap, umem->sync_req_max,
> +                                 bit);
> +             if (bit >= umem->sync_req_max)
> +                     break;
> +             if (test_bit(umem->sync_req[bit], umem->cached))
> +                     wake_up(&umem->page_wait[bit]);

Why not do this test in the loop above?

> +             bit++;
> +     }
> +
> +     if (umem->req_list_nr > 0)
> +             wake_up_list = true;
> +     spin_unlock(&umem->lock);
> +
> +     if (wake_up_list)
> +             wake_up_all(&umem->req_list_wait);
> +
> +out:
> +     return ret;
> +}
> +
> +
> +
> +static void umem_put(struct umem *umem)
> +{
> +     int ret;
> +
> +     mutex_lock(&umem_list_mutex);
> +     ret = kref_put(&umem->kref, umem_free);
> +     if (ret == 0) {
> +             mutex_unlock(&umem_list_mutex);
> +     }

This looks wrong.

> +}
> +
> +
> +static int umem_create_umem(struct umem_create *create)
> +{
> +     int error = 0;
> +     struct umem *umem = NULL;
> +     struct vm_area_struct *vma;
> +     int shmem_fd;
> +     unsigned long bitmap_bytes;
> +     unsigned long sync_bitmap_bytes;
> +     int i;
> +
> +     umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> +     umem->name = create->name;
> +     kref_init(&umem->kref);
> +     INIT_LIST_HEAD(&umem->list);
> +
> +     mutex_lock(&umem_list_mutex);
> +     error = umem_add_list(umem);
> +     if (error) {
> +             goto out;
> +     }
> +
> +     umem->task = NULL;
> +     umem->mmapped = false;
> +     spin_lock_init(&umem->lock);
> +     umem->size = roundup(create->size, PAGE_SIZE);
> +     umem->pgoff_end = umem->size >> PAGE_SHIFT;
> +     init_waitqueue_head(&umem->req_wait);
> +
> +     vma = &umem->vma;
> +     vma->vm_start = 0;
> +     vma->vm_end = umem->size;
> +     /* this shmem file is used for temporal buffer for pages
> +        so it's unlikely that so many pages exists in this shmem file */
> +     vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> +             VM_DONTEXPAND;
> +     vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> +     vma->vm_pgoff = 0;
> +     INIT_LIST_HEAD(&vma->anon_vma_chain);
> +
> +     shmem_fd = get_unused_fd();
> +     if (shmem_fd < 0) {
> +             error = shmem_fd;
> +             goto out;
> +     }
> +     error = shmem_zero_setup(vma);
> +     if (error < 0) {
> +             put_unused_fd(shmem_fd);
> +             goto out;
> +     }
> +     umem->shmem_filp = vma->vm_file;
> +     get_file(umem->shmem_filp);
> +     fd_install(shmem_fd, vma->vm_file);
> +     create->shmem_fd = shmem_fd;
> +
> +     create->umem_fd = anon_inode_getfd("umem",
> +                                        &umem_fops, umem, O_RDWR);
> +     if (create->umem_fd < 0) {
> +             error = create->umem_fd;
> +             goto out;
> +     }
> +
> +     bitmap_bytes = umem_bitmap_bytes(umem);
> +     if (bitmap_bytes > PAGE_SIZE) {
> +             umem->cached = vzalloc(bitmap_bytes);
> +             umem->faulted = vzalloc(bitmap_bytes);
> +     } else {
> +             umem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> +             umem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> +     }
> +
> +     /* those constants are not exported.
> +        They are just used for default value */
> +#define KVM_MAX_VCPUS        256
> +#define ASYNC_PF_PER_VCPU 64

Best to avoid defaults and require userspace choose.

> +
> +#define ASYNC_REQ_MAX        (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> +     if (create->async_req_max == 0)
> +             create->async_req_max = ASYNC_REQ_MAX;
> +     umem->async_req_max = create->async_req_max;
> +     umem->async_req_nr = 0;
> +     umem->async_req = kzalloc(
> +             sizeof(*umem->async_req) * umem->async_req_max,
> +             GFP_KERNEL);
> +
> +#define SYNC_REQ_MAX (KVM_MAX_VCPUS)
> +     if (create->sync_req_max == 0)
> +             create->sync_req_max = SYNC_REQ_MAX;
> +     umem->sync_req_max = round_up(create->sync_req_max, BITS_PER_LONG);
> +     sync_bitmap_bytes = sizeof(unsigned long) *
> +             (umem->sync_req_max / BITS_PER_LONG);
> +     umem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +     umem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> +     umem->page_wait = kzalloc(sizeof(*umem->page_wait) *
> +                               umem->sync_req_max, GFP_KERNEL);
> +     for (i = 0; i < umem->sync_req_max; ++i)
> +             init_waitqueue_head(&umem->page_wait[i]);
> +     umem->sync_req = kzalloc(sizeof(*umem->sync_req) *
> +                              umem->sync_req_max, GFP_KERNEL);
> +
> +     umem->req_list_nr = 0;
> +     INIT_LIST_HEAD(&umem->req_list);
> +     init_waitqueue_head(&umem->req_list_wait);
> +
> +     mutex_unlock(&umem_list_mutex);
> +     return 0;
> +
> + out:
> +     umem_free(&umem->kref);
> +     return error;
> +}
> +
> +
> +static int umem_reattach_umem(struct umem_create *create)
> +{
> +     struct umem *entry;
> +
> +     mutex_lock(&umem_list_mutex);
> +     list_for_each_entry(entry, &umem_list, list) {
> +             if (umem_name_eq(&entry->name, &create->name)) {
> +                     kref_get(&entry->kref);
> +                     mutex_unlock(&umem_list_mutex);
> +
> +                     create->shmem_fd = get_unused_fd();
> +                     if (create->shmem_fd < 0) {
> +                             umem_put(entry);
> +                             return create->shmem_fd;
> +                     }
> +                     create->umem_fd = anon_inode_getfd(
> +                             "umem", &umem_fops, entry, O_RDWR);
> +                     if (create->umem_fd < 0) {
> +                             put_unused_fd(create->shmem_fd);
> +                             umem_put(entry);
> +                             return create->umem_fd;
> +                     }
> +                     get_file(entry->shmem_filp);
> +                     fd_install(create->shmem_fd, entry->shmem_filp);
> +
> +                     create->size = entry->size;
> +                     create->sync_req_max = entry->sync_req_max;
> +                     create->async_req_max = entry->async_req_max;
> +                     return 0;
> +             }
> +     }
> +     mutex_unlock(&umem_list_mutex);
> +
> +     return -ENOENT;
> +}

Can you explain how reattach is used?

> +
> +static long umem_dev_ioctl(struct file *filp, unsigned int ioctl,
> +                        unsigned long arg)
> +{
> +     void __user *argp = (void __user *) arg;
> +     long ret;
> +     struct umem_create *create = NULL;
> +
> +
> +     switch (ioctl) {
> +     case UMEM_DEV_CREATE_UMEM:
> +             create = kmalloc(sizeof(*create), GFP_KERNEL);
> +             if (copy_from_user(create, argp, sizeof(*create))) {
> +                     ret = -EFAULT;
> +                     break;
> +             }
> +             ret = umem_create_umem(create);
> +             if (copy_to_user(argp, create, sizeof(*create))) {
> +                     ret = -EFAULT;
> +                     break;
> +             }
> +             break;

A simpler approach is to have open("/dev/umem") return an mmap()able fd
directly.  You still need to call an ioctl() to set the size, etc., but
then you only operate on that one fd.

> +     case UMEM_DEV_LIST:
> +             ret = umem_list_umem(argp);
> +             break;
> +     case UMEM_DEV_REATTACH:
> +             create = kmalloc(sizeof(*create), GFP_KERNEL);
> +             if (copy_from_user(create, argp, sizeof(*create))) {
> +                     ret = -EFAULT;
> +                     break;
> +             }
> +             ret = umem_reattach_umem(create);
> +             if (copy_to_user(argp, create, sizeof(*create))) {
> +                     ret = -EFAULT;
> +                     break;
> +             }
> +             break;
> +     default:
> +             ret = -EINVAL;
> +             break;
> +     }
> +
> +     kfree(create);
> +     return ret;
> +}
> +
> +
> +#ifdef __KERNEL__
> +#include <linux/compiler.h>
> +#else
> +#define __user
> +#endif

I think a #include <linux/compiler.h> is sufficient, the export process
(see include/linux/Kbuild, add an entry there) takes care of __user.

> +
> +#define UMEM_ID_MAX  256
> +#define UMEM_NAME_MAX        256
> +
> +struct umem_name {
> +     char id[UMEM_ID_MAX];           /* non-zero terminated */
> +     char name[UMEM_NAME_MAX];       /* non-zero terminated */
> +};

IMO, it would be better to avoid names, and use opaque __u64 identifiers
assigned by userspace, or perhaps file descriptors generated by the
kernel.  With names come the complications of namespaces, etc.  One user
can DoS another by grabbing a name that it knows the other user wants to
use.

> +
> +struct umem_create {
> +     __u64 size;     /* in bytes */
> +     __s32 umem_fd;
> +     __s32 shmem_fd;
> +     __u32 async_req_max;
> +     __u32 sync_req_max;
> +     struct umem_name name;
> +};
> +
> +struct umem_page_request {
> +     __u64 __user *pgoffs;

Pointers change their size in 32-bit and 64-bit userspace, best to avoid
them.

> +     __u32 nr;
> +     __u32 padding;
> +};
> +
> +struct umem_page_cached {
> +     __u64 __user *pgoffs;
> +     __u32 nr;
> +     __u32 padding;
> +};
> +
> +#define UMEMIO       0x1E
> +
> +/* ioctl for umem_dev fd */
> +#define UMEM_DEV_CREATE_UMEM _IOWR(UMEMIO, 0x0, struct umem_create)
> +#define UMEM_DEV_LIST                _IOWR(UMEMIO, 0x1, struct umem_list)

Why is _LIST needed?

> +#define UMEM_DEV_REATTACH    _IOWR(UMEMIO, 0x2, struct umem_create)
> +
> +/* ioctl for umem fd */
> +#define UMEM_GET_PAGE_REQUEST        _IOWR(UMEMIO, 0x10, struct 
> umem_page_request)
> +#define UMEM_MARK_PAGE_CACHED        _IOW (UMEMIO, 0x11, struct 
> umem_page_cached)

You could make the GET_PAGE_REQUEST / MARK_PAGE_CACHED protocol run over
file descriptors, instead of an ioctl.  It allows you to implement the
other side in either the kernel or userspace.  This is similar to how
kvm uses an eventfd for communication with vhost-net in the kernel, or
an implementation in userspace.

> +#define UMEM_MAKE_VMA_ANONYMOUS      _IO  (UMEMIO, 0x12)
> +
> +#endif /* __LINUX_UMEM_H */


-- 
error compiling committee.c: too many arguments to function




reply via email to

[Prev in Thread] Current Thread [Next in Thread]