Re: [Qemu-devel] [PATCH 2/2] umem: chardevice for kvm postcopy
From: Isaku Yamahata
Subject: Re: [Qemu-devel] [PATCH 2/2] umem: chardevice for kvm postcopy
Date: Thu, 29 Dec 2011 21:22:50 +0900
User-agent: Mutt/1.5.19 (2009-01-05)
Thank you for the review.
On Thu, Dec 29, 2011 at 01:17:51PM +0200, Avi Kivity wrote:
> > + default n
> > + help
> > + The user process backed memory driver provides the /dev/umem
> > + device, which is designed for a kind of distributed shared
> > + memory, especially post-copy live migration with KVM.
> > + When in doubt, say "N".
> > +
>
> Need documentation of the protocol between the kernel and userspace; not
> just the ioctls, but also how faults are propagated.
Will do.
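For reference, here is a rough sketch of the userspace daemon loop that
the current ioctls imply; fetch_page() is a hypothetical helper standing
in for the migration transport, and the exact in/out semantics of nr are
among the things the documentation will have to pin down:

    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/types.h>
    #include <linux/umem.h>         /* the header added by this patch */

    #define REQ_MAX 32
    #define PAGE_SZ 4096            /* assuming 4KiB pages */

    /* hypothetical: pull one page from the migration source */
    extern void fetch_page(__u64 pgoff, void *buf);

    static void serve_page_requests(int umem_fd, int shmem_fd)
    {
            __u64 pgoffs[REQ_MAX];
            char buf[PAGE_SZ];

            for (;;) {
                    struct umem_page_request req = {
                            .pgoffs = pgoffs,
                            .nr = REQ_MAX,
                    };
                    __u32 i;

                    /* block until the guest faults on pages that
                       have not been copied yet */
                    if (ioctl(umem_fd, UMEM_GET_PAGE_REQUEST, &req) < 0)
                            break;

                    for (i = 0; i < req.nr; ++i) {
                            fetch_page(pgoffs[i], buf);
                            pwrite(shmem_fd, buf, PAGE_SZ,
                                   pgoffs[i] * PAGE_SZ);
                    }

                    /* tell the kernel those pages are now present
                       in the shmem backing store */
                    struct umem_page_cached cached = {
                            .pgoffs = pgoffs,
                            .nr = req.nr,
                    };
                    ioctl(umem_fd, UMEM_MARK_PAGE_CACHED, &cached);
            }
    }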
>
> > +
> > +struct umem_page_req_list {
> > + struct list_head list;
> > + pgoff_t pgoff;
> > +};
> > +
> >
> > +
> > +
> > +static int umem_mark_page_cached(struct umem *umem,
> > + struct umem_page_cached *page_cached)
> > +{
> > + int ret = 0;
> > +#define PG_MAX ((__u32)32)
> > + __u64 pgoffs[PG_MAX];
> > + __u32 nr;
> > + unsigned long bit;
> > + bool wake_up_list = false;
> > +
> > + nr = 0;
> > + while (nr < page_cached->nr) {
> > + __u32 todo = min(PG_MAX, (page_cached->nr - nr));
> > + int i;
> > +
> > + if (copy_from_user(pgoffs, page_cached->pgoffs + nr,
> > + sizeof(*pgoffs) * todo)) {
> > + ret = -EFAULT;
> > + goto out;
> > + }
> > + for (i = 0; i < todo; ++i) {
> > + if (pgoffs[i] >= umem->pgoff_end) {
> > + ret = -EINVAL;
> > + goto out;
> > + }
> > + set_bit(pgoffs[i], umem->cached);
> > + }
> > + nr += todo;
> > + }
> > +
>
> Probably need an smp_wmb() here.
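Right, something like this between filling the cached bitmap and
scanning sync_wait_bitmap, I suppose (just a sketch of the placement):

                nr += todo;
        }

        /* make the newly set cached bits visible before the
           waiter scan below */
        smp_wmb();

        spin_lock(&umem->lock);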
>
> > + spin_lock(&umem->lock);
> > + bit = 0;
> > + for (;;) {
> > + bit = find_next_bit(umem->sync_wait_bitmap, umem->sync_req_max,
> > + bit);
> > + if (bit >= umem->sync_req_max)
> > + break;
> > + if (test_bit(umem->sync_req[bit], umem->cached))
> > + wake_up(&umem->page_wait[bit]);
>
> Why not do this test in the loop above?
>
> > + bit++;
> > + }
> > +
> > + if (umem->req_list_nr > 0)
> > + wake_up_list = true;
> > + spin_unlock(&umem->lock);
> > +
> > + if (wake_up_list)
> > + wake_up_all(&umem->req_list_wait);
> > +
> > +out:
> > + return ret;
> > +}
> > +
> > +
> > +
> > +static void umem_put(struct umem *umem)
> > +{
> > + int ret;
> > +
> > + mutex_lock(&umem_list_mutex);
> > + ret = kref_put(&umem->kref, umem_free);
> > + if (ret == 0) {
> > + mutex_unlock(&umem_list_mutex);
> > + }
>
> This looks wrong.
>
> > +}
> > +
> > +
> > +static int umem_create_umem(struct umem_create *create)
> > +{
> > + int error = 0;
> > + struct umem *umem = NULL;
> > + struct vm_area_struct *vma;
> > + int shmem_fd;
> > + unsigned long bitmap_bytes;
> > + unsigned long sync_bitmap_bytes;
> > + int i;
> > +
> > + umem = kzalloc(sizeof(*umem), GFP_KERNEL);
> > + umem->name = create->name;
> > + kref_init(&umem->kref);
> > + INIT_LIST_HEAD(&umem->list);
> > +
> > + mutex_lock(&umem_list_mutex);
> > + error = umem_add_list(umem);
> > + if (error) {
> > + goto out;
> > + }
> > +
> > + umem->task = NULL;
> > + umem->mmapped = false;
> > + spin_lock_init(&umem->lock);
> > + umem->size = roundup(create->size, PAGE_SIZE);
> > + umem->pgoff_end = umem->size >> PAGE_SHIFT;
> > + init_waitqueue_head(&umem->req_wait);
> > +
> > + vma = &umem->vma;
> > + vma->vm_start = 0;
> > + vma->vm_end = umem->size;
> > + /* this shmem file is used as a temporary buffer for pages,
> > + so it is unlikely that many pages exist in it */
> > + vma->vm_flags = VM_READ | VM_SHARED | VM_NOHUGEPAGE | VM_DONTCOPY |
> > + VM_DONTEXPAND;
> > + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
> > + vma->vm_pgoff = 0;
> > + INIT_LIST_HEAD(&vma->anon_vma_chain);
> > +
> > + shmem_fd = get_unused_fd();
> > + if (shmem_fd < 0) {
> > + error = shmem_fd;
> > + goto out;
> > + }
> > + error = shmem_zero_setup(vma);
> > + if (error < 0) {
> > + put_unused_fd(shmem_fd);
> > + goto out;
> > + }
> > + umem->shmem_filp = vma->vm_file;
> > + get_file(umem->shmem_filp);
> > + fd_install(shmem_fd, vma->vm_file);
> > + create->shmem_fd = shmem_fd;
> > +
> > + create->umem_fd = anon_inode_getfd("umem",
> > + &umem_fops, umem, O_RDWR);
> > + if (create->umem_fd < 0) {
> > + error = create->umem_fd;
> > + goto out;
> > + }
> > +
> > + bitmap_bytes = umem_bitmap_bytes(umem);
> > + if (bitmap_bytes > PAGE_SIZE) {
> > + umem->cached = vzalloc(bitmap_bytes);
> > + umem->faulted = vzalloc(bitmap_bytes);
> > + } else {
> > + umem->cached = kzalloc(bitmap_bytes, GFP_KERNEL);
> > + umem->faulted = kzalloc(bitmap_bytes, GFP_KERNEL);
> > + }
> > +
> > + /* these constants are not exported;
> > + they are just used as default values */
> > +#define KVM_MAX_VCPUS 256
> > +#define ASYNC_PF_PER_VCPU 64
>
> Best to avoid defaults and require userspace choose.
Okay.
> > +
> > +#define ASYNC_REQ_MAX (ASYNC_PF_PER_VCPU * KVM_MAX_VCPUS)
> > + if (create->async_req_max == 0)
> > + create->async_req_max = ASYNC_REQ_MAX;
> > + umem->async_req_max = create->async_req_max;
> > + umem->async_req_nr = 0;
> > + umem->async_req = kzalloc(
> > + sizeof(*umem->async_req) * umem->async_req_max,
> > + GFP_KERNEL);
> > +
> > +#define SYNC_REQ_MAX (KVM_MAX_VCPUS)
> > + if (create->sync_req_max == 0)
> > + create->sync_req_max = SYNC_REQ_MAX;
> > + umem->sync_req_max = round_up(create->sync_req_max, BITS_PER_LONG);
> > + sync_bitmap_bytes = sizeof(unsigned long) *
> > + (umem->sync_req_max / BITS_PER_LONG);
> > + umem->sync_req_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > + umem->sync_wait_bitmap = kzalloc(sync_bitmap_bytes, GFP_KERNEL);
> > + umem->page_wait = kzalloc(sizeof(*umem->page_wait) *
> > + umem->sync_req_max, GFP_KERNEL);
> > + for (i = 0; i < umem->sync_req_max; ++i)
> > + init_waitqueue_head(&umem->page_wait[i]);
> > + umem->sync_req = kzalloc(sizeof(*umem->sync_req) *
> > + umem->sync_req_max, GFP_KERNEL);
> > +
> > + umem->req_list_nr = 0;
> > + INIT_LIST_HEAD(&umem->req_list);
> > + init_waitqueue_head(&umem->req_list_wait);
> > +
> > + mutex_unlock(&umem_list_mutex);
> > + return 0;
> > +
> > + out:
> > + umem_free(&umem->kref);
> > + return error;
> > +}
> > +
> > +
> > +static int umem_reattach_umem(struct umem_create *create)
> > +{
> > + struct umem *entry;
> > +
> > + mutex_lock(&umem_list_mutex);
> > + list_for_each_entry(entry, &umem_list, list) {
> > + if (umem_name_eq(&entry->name, &create->name)) {
> > + kref_get(&entry->kref);
> > + mutex_unlock(&umem_list_mutex);
> > +
> > + create->shmem_fd = get_unused_fd();
> > + if (create->shmem_fd < 0) {
> > + umem_put(entry);
> > + return create->shmem_fd;
> > + }
> > + create->umem_fd = anon_inode_getfd(
> > + "umem", &umem_fops, entry, O_RDWR);
> > + if (create->umem_fd < 0) {
> > + put_unused_fd(create->shmem_fd);
> > + umem_put(entry);
> > + return create->umem_fd;
> > + }
> > + get_file(entry->shmem_filp);
> > + fd_install(create->shmem_fd, entry->shmem_filp);
> > +
> > + create->size = entry->size;
> > + create->sync_req_max = entry->sync_req_max;
> > + create->async_req_max = entry->async_req_max;
> > + return 0;
> > + }
> > + }
> > + mutex_unlock(&umem_list_mutex);
> > +
> > + return -ENOENT;
> > +}
>
> Can you explain how reattach is used?
>
> > +
> > +static long umem_dev_ioctl(struct file *filp, unsigned int ioctl,
> > + unsigned long arg)
> > +{
> > + void __user *argp = (void __user *) arg;
> > + long ret;
> > + struct umem_create *create = NULL;
> > +
> > +
> > + switch (ioctl) {
> > + case UMEM_DEV_CREATE_UMEM:
> > + create = kmalloc(sizeof(*create), GFP_KERNEL);
> > + if (copy_from_user(create, argp, sizeof(*create))) {
> > + ret = -EFAULT;
> > + break;
> > + }
> > + ret = umem_create_umem(create);
> > + if (copy_to_user(argp, create, sizeof(*create))) {
> > + ret = -EFAULT;
> > + break;
> > + }
> > + break;
>
> A simpler approach is that open("/dev/umem") returns an mmap()able fd.
> You need to call an ioctl() to set the size, etc., but then you only
> operate on that fd.
So you are suggesting that /dev/umem and /dev/umemctl should be
introduced, splitting the functionality between them?
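So the per-region flow would become something like this (UMEM_SET_SIZE
is a made-up ioctl name and number, just to illustrate the idea):

    #include <fcntl.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/types.h>
    #include <linux/umem.h>                 /* for UMEMIO */

    /* hypothetical ioctl: set the region size before mmap() */
    #define UMEM_SET_SIZE _IOW(UMEMIO, 0x3, __u64)

    static void *setup_umem_region(__u64 size)
    {
            int fd = open("/dev/umem", O_RDWR);
            if (fd < 0)
                    return NULL;

            if (ioctl(fd, UMEM_SET_SIZE, &size) < 0)
                    return NULL;

            /* page faults on this mapping become requests to the
               daemon; everything happens on this one fd */
            return mmap(NULL, size, PROT_READ | PROT_WRITE,
                        MAP_SHARED, fd, 0);
    }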
> > + case UMEM_DEV_LIST:
> > + ret = umem_list_umem(argp);
> > + break;
> > + case UMEM_DEV_REATTACH:
> > + create = kmalloc(sizeof(*create), GFP_KERNEL);
> > + if (copy_from_user(create, argp, sizeof(*create))) {
> > + ret = -EFAULT;
> > + break;
> > + }
> > + ret = umem_reattach_umem(create);
> > + if (copy_to_user(argp, create, sizeof(*create))) {
> > + ret = -EFAULT;
> > + break;
> > + }
> > + break;
> > + default:
> > + ret = -EINVAL;
> > + break;
> > + }
> > +
> > + kfree(create);
> > + return ret;
> > +}
> > +
> > +
> > +#ifdef __KERNEL__
> > +#include <linux/compiler.h>
> > +#else
> > +#define __user
> > +#endif
>
> I think a #include <linux/compiler.h> is sufficient, the export process
> (see include/linux/Kbuild, add an entry there) takes care of __user.
>
> > +
> > +#define UMEM_ID_MAX 256
> > +#define UMEM_NAME_MAX 256
> > +
> > +struct umem_name {
> > + char id[UMEM_ID_MAX]; /* non-zero terminated */
> > + char name[UMEM_NAME_MAX]; /* non-zero terminated */
> > +};
>
> IMO, it would be better to avoid names, and use opaque __u64 identifiers
> assigned by userspace, or perhaps file descriptors generated by the
> kernel. With names come the complications of namespaces, etc. One user
> can DoS another by grabbing a name that it knows the other user wants to
> use.
So how about the kernel assigning identifiers that are system-global?
> > +
> > +struct umem_create {
> > + __u64 size; /* in bytes */
> > + __s32 umem_fd;
> > + __s32 shmem_fd;
> > + __u32 async_req_max;
> > + __u32 sync_req_max;
> > + struct umem_name name;
> > +};
> > +
> > +struct umem_page_request {
> > + __u64 __user *pgoffs;
>
> Pointers change their size in 32-bit and 64-bit userspace, best to avoid
> them.
Ah yes, right. How about the following?
struct {
        __u32 nr;
        __u32 padding;
        __u64 pgoffs[0];
};
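The caller then allocates the header and the offset array together, so
the layout is the same for 32-bit and 64-bit userspace. A sketch,
assuming the struct above keeps the name umem_page_request:

    #include <stdlib.h>
    #include <linux/types.h>
    #include <linux/umem.h>  /* with the layout proposed above */

    static struct umem_page_request *alloc_page_request(__u32 nr)
    {
            struct umem_page_request *req =
                    malloc(sizeof(*req) + nr * sizeof(__u64));

            if (req) {
                    req->nr = nr;
                    req->padding = 0;
                    /* req->pgoffs[0..nr-1] follow the header directly */
            }
            return req;
    }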
> > + __u32 nr;
> > + __u32 padding;
> > +};
> > +
> > +struct umem_page_cached {
> > + __u64 __user *pgoffs;
> > + __u32 nr;
> > + __u32 padding;
> > +};
> > +
> > +#define UMEMIO 0x1E
> > +
> > +/* ioctl for umem_dev fd */
> > +#define UMEM_DEV_CREATE_UMEM _IOWR(UMEMIO, 0x0, struct umem_create)
> > +#define UMEM_DEV_LIST _IOWR(UMEMIO, 0x1, struct umem_list)
>
> Why is _LIST needed?
>
> > +#define UMEM_DEV_REATTACH _IOWR(UMEMIO, 0x2, struct umem_create)
> > +
> > +/* ioctl for umem fd */
> > +#define UMEM_GET_PAGE_REQUEST _IOWR(UMEMIO, 0x10, struct umem_page_request)
> > +#define UMEM_MARK_PAGE_CACHED _IOW (UMEMIO, 0x11, struct umem_page_cached)
>
> You could make the GET_PAGE_REQUEST / MARK_PAGE_CACHED protocol run over
> file descriptors, instead of an ioctl. It allows you to implement the
> other side in either the kernel or userspace. This is similar to how
> kvm uses an eventfd for communication with vhost-net in the kernel, or
> an implementation in userspace.
Do you mean that read/write on file descriptors is better than ioctl?
Okay, it would be easy to convert the ioctls to read/write.
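Roughly, the request/response loop would then look like this (a sketch;
it assumes the stream carries raw __u64 page offsets in both directions,
which is not a settled format):

    #include <unistd.h>
    #include <linux/types.h>

    /* hypothetical helper: fetch one page and pwrite() it into the
       shmem backing store, as in the ioctl-based loop */
    extern void fetch_into_shmem(int shmem_fd, __u64 pgoff);

    static void serve_requests_rw(int umem_fd, int shmem_fd)
    {
            __u64 pgoffs[32];
            ssize_t len;

            /* each read() returns one batch of faulting page offsets */
            while ((len = read(umem_fd, pgoffs, sizeof(pgoffs))) > 0) {
                    size_t i, nr = len / sizeof(__u64);

                    for (i = 0; i < nr; ++i)
                            fetch_into_shmem(shmem_fd, pgoffs[i]);

                    /* writing the offsets back marks them cached */
                    write(umem_fd, pgoffs, nr * sizeof(__u64));
            }
    }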
> > +#define UMEM_MAKE_VMA_ANONYMOUS _IO (UMEMIO, 0x12)
> > +
> > +#endif /* __LINUX_UMEM_H */
>
>
> --
> error compiling committee.c: too many arguments to function
>
--
yamahata