qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[PATCH] virtio_balloon: Add option mprotect to handle guest kernel cheat


From: Hui Zhu
Subject: [PATCH] virtio_balloon: Add option mprotect to handle guest kernel cheat issue
Date: Thu, 12 Mar 2020 18:56:12 +0800

QEMU virtio_balloon decreases the memory usage of VM depends on the
cooperation of the guest kernel.  If the guest kernel cheats with
the virtio_balloon, it can break the limit.

[1] is a Linux kernel with a cheat code.
This is an example in a VM with 1G memory 1CPU that use this kernel:
// Start VM.
// In the guest, use 
usemem(https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git)
// allocate 800M memory then the usage of VM will increase.
usemem 800m

// Check anon memory usage of QEMU in the host.
ps -e | grep qemu
67768 pts/5    00:00:02 qemu-system-x86
cat /proc/67768/status | grep RssAnon:
RssAnon:          994892 kB

// Connect to the monitor of QEMU and inflate the balloon 800M.
(qemu) device_add virtio-balloon-pci,id=balloon1
(qemu) info balloon
balloon: actual=1024
(qemu) balloon 224
(qemu) info balloon
balloon: actual=224

// Check anon memory usage of QEMU in the host again.
cat /proc/67768/status | grep RssAnon:
RssAnon:          175848 kB

// Use the cheat code inside the guest kernel to
// access the memory of the balloon.
cat /proc/virtio_balloon
204800

// The memory usage of QEMU increase back without deflation the balloon.
cat /proc/67768/status | grep RssAnon:
RssAnon:          995036 kB

This commit tries to handle the issue.
It set the address of balloon pages to cannot access with mprotect
when inflating the balloon and set it back when deflating the balloon.
When guest kernel cheat code tries to access the pages inside the balloon,
the host kernel will call handle_ept_violation-> tdp_page_fault to handle it.
tdp_page_fault will call hva_to_pfn to get PFN of the page.
hva_to_pfn_fast and hva_to_pfn_slow will check the VMA access permission.
Then change the VMA access permission can handle the issue.

This is an example in a VM with 1G memory 1CPU that use this kernel:
// Start VM.
// In the guest, use 
usemem(https://git.kernel.org/pub/scm/linux/kernel/git/wfg/vm-scalability.git)
// allocate 800M memory then the usage of VM will increase.
usemem 800m

// Connect to the monitor of QEMU and inflate the balloon 800M.
(qemu) device_add virtio-balloon-pci,id=balloon1,mprotect=on
(qemu) info balloon
balloon: actual=1024
(qemu) balloon 224
(qemu) info balloon
balloon: actual=224

// Use the cheat code inside the guest kernel to access the memory of balloon.
cat /proc/virtio_balloon
error: kvm run failed Bad address
RAX=000000000783c000 RBX=0000000000000000 RCX=ffff88803952bc00 
RDX=ffff888000000000
RSI=ffffea00001e0f40 RDI=ffffea00001e0f48 RBP=ffff88803aac7180 
RSP=ffffc90000207a70
R8 =ffff8880396af100 R9 =ffff88803a802700 R10=0000000000001000 
R11=ffffffffffffffff
R12=ffff8880396af120 R13=0000000000000001 R14=ffffc90000207c60 
R15=ffff88803aac7180
RIP=ffffffff81604aad RFL=00010206 [-----P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =0000 0000000000000000 ffffffff 00c00000
CS =0010 0000000000000000 ffffffff 00a09b00 DPL=0 CS64 [-RA]
SS =0018 0000000000000000 ffffffff 00c09300 DPL=0 DS   [-WA]
DS =0000 0000000000000000 ffffffff 00c00000
FS =0000 0000000000bcd880 ffffffff 00c00000
GS =0000 ffff88803ec00000 ffffffff 00c00000
LDT=0000 0000000000000000 ffffffff 00c00000
TR =0040 fffffe0000003000 00004087 00008b00 DPL=0 TSS64-busy
GDT=     fffffe0000001000 0000007f
IDT=     fffffe0000000000 00000fff
CR0=80050033 CR2=00000000008a2260 CR3=00000000396b6004 CR4=00360ef0
DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 
DR3=0000000000000000
DR6=00000000fffe0ff0 DR7=0000000000000400
EFER=0000000000000d01
Code=05 72 5b e1 00 48 8b 15 7b 5b e1 00 48 c1 f8 06 48 c1 e0 0c <c6> 04 10 01 
83 a9 a8 12 00 00 01 48 8b 46 08 83 c3 01 48 8d 50 f8 48 8d 46 08 49 35

This shows that the access is refused and QEMU stops.

[1] https://github.com/teawater/linux/tree/write_balloon

Signed-off-by: Hui Zhu <address@hidden>
---
 exec.c                             | 33 +++++++++++++++++++++++----------
 hw/virtio/virtio-balloon.c         | 20 +++++++++++++++++---
 include/exec/cpu-common.h          |  2 ++
 include/hw/virtio/virtio-balloon.h |  2 ++
 include/qemu/osdep.h               |  1 +
 util/osdep.c                       |  9 +++++++++
 6 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/exec.c b/exec.c
index 0cc500d..ea14412 100644
--- a/exec.c
+++ b/exec.c
@@ -3846,17 +3846,20 @@ int qemu_ram_foreach_block(RAMBlockIterFunc func, void 
*opaque)
  * they a) read as 0, b) Trigger whatever fault mechanism
  * the OS provides for postcopy.
  * The pages must be unmapped by the end of the function.
+ * If do_mprotect is true, call mprotect to change the
+ * protection to PROT_NONE before unmap.
  * Returns: 0 on success, none-0 on failure
  *
  */
-int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
+int __ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length,
+                              bool do_mprotect)
 {
     int ret = -1;
 
     uint8_t *host_startaddr = rb->host + start;
 
     if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) {
-        error_report("ram_block_discard_range: Unaligned start address: %p",
+        error_report("__ram_block_discard_range: Unaligned start address: %p",
                      host_startaddr);
         goto err;
     }
@@ -3864,11 +3867,16 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t 
start, size_t length)
     if ((start + length) <= rb->used_length) {
         bool need_madvise, need_fallocate;
         if (!QEMU_IS_ALIGNED(length, rb->page_size)) {
-            error_report("ram_block_discard_range: Unaligned length: %zx",
+            error_report("__ram_block_discard_range: Unaligned length: %zx",
                          length);
             goto err;
         }
 
+        if (do_mprotect && qemu_mprotect_none(host_startaddr, length) < 0) {
+            ret = -errno;
+            goto err;
+        }
+
         errno = ENOTSUP; /* If we are missing MADVISE etc */
 
         /* The logic here is messy;
@@ -3887,15 +3895,15 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t 
start, size_t length)
                             start, length);
             if (ret) {
                 ret = -errno;
-                error_report("ram_block_discard_range: Failed to fallocate "
+                error_report("__ram_block_discard_range: Failed to fallocate "
                              "%s:%" PRIx64 " +%zx (%d)",
                              rb->idstr, start, length, ret);
                 goto err;
             }
 #else
             ret = -ENOSYS;
-            error_report("ram_block_discard_range: fallocate not 
available/file"
-                         "%s:%" PRIx64 " +%zx (%d)",
+            error_report("__ram_block_discard_range: fallocate not "
+                         "available/file %s:%" PRIx64 " +%zx (%d)",
                          rb->idstr, start, length, ret);
             goto err;
 #endif
@@ -3910,14 +3918,14 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t 
start, size_t length)
             ret =  madvise(host_startaddr, length, MADV_DONTNEED);
             if (ret) {
                 ret = -errno;
-                error_report("ram_block_discard_range: Failed to discard range 
"
-                             "%s:%" PRIx64 " +%zx (%d)",
+                error_report("__ram_block_discard_range: Failed to discard "
+                             "range %s:%" PRIx64 " +%zx (%d)",
                              rb->idstr, start, length, ret);
                 goto err;
             }
 #else
             ret = -ENOSYS;
-            error_report("ram_block_discard_range: MADVISE not available"
+            error_report("__ram_block_discard_range: MADVISE not available "
                          "%s:%" PRIx64 " +%zx (%d)",
                          rb->idstr, start, length, ret);
             goto err;
@@ -3926,7 +3934,7 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, 
size_t length)
         trace_ram_block_discard_range(rb->idstr, host_startaddr, length,
                                       need_madvise, need_fallocate, ret);
     } else {
-        error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64
+        error_report("__ram_block_discard_range: Overrun block '%s' (%" PRIu64
                      "/%zx/" RAM_ADDR_FMT")",
                      rb->idstr, start, length, rb->used_length);
     }
@@ -3935,6 +3943,11 @@ err:
     return ret;
 }
 
+int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length)
+{
+    return __ram_block_discard_range(rb, start, length, false);
+}
+
 bool ramblock_is_pmem(RAMBlock *rb)
 {
     return rb->flags & RAM_PMEM;
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index a4729f7..77c0c4b 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -81,7 +81,8 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
     if (rb_page_size == BALLOON_PAGE_SIZE) {
         /* Easy case */
 
-        ram_block_discard_range(rb, rb_offset, rb_page_size);
+        __ram_block_discard_range(rb, rb_offset, rb_page_size,
+                                  balloon->do_mprotect);
         /* We ignore errors from ram_block_discard_range(), because it
          * has already reported them, and failing to discard a balloon
          * page is not fatal */
@@ -120,8 +121,9 @@ static void balloon_inflate_page(VirtIOBalloon *balloon,
         /* We've accumulated a full host page, we can actually discard
          * it now */
 
-        ram_block_discard_range(rb, rb_aligned_offset, rb_page_size);
-        /* We ignore errors from ram_block_discard_range(), because it
+        __ram_block_discard_range(rb, rb_aligned_offset, rb_page_size,
+                                  balloon->do_mprotect);
+        /* We ignore errors from __ram_block_discard_range(), because it
          * has already reported them, and failing to discard a balloon
          * page is not fatal */
         virtio_balloon_pbp_free(pbp);
@@ -145,6 +147,9 @@ static void balloon_deflate_page(VirtIOBalloon *balloon,
 
     host_addr = (void *)((uintptr_t)addr & ~(rb_page_size - 1));
 
+    if (balloon->do_mprotect)
+        g_assert(qemu_mprotect_rw(host_addr, rb_page_size) == 0);
+
     /* When a page is deflated, we hint the whole host page it lives
      * on, since we can't do anything smaller */
     ret = qemu_madvise(host_addr, rb_page_size, QEMU_MADV_WILLNEED);
@@ -780,6 +785,14 @@ static void virtio_balloon_device_realize(DeviceState 
*dev, Error **errp)
     virtio_init(vdev, "virtio-balloon", VIRTIO_ID_BALLOON,
                 virtio_balloon_config_size(s));
 
+    if (s->do_mprotect
+        && virtio_has_feature(s->host_features,
+                              VIRTIO_BALLOON_F_FREE_PAGE_HINT)) {
+        error_setg(errp, "mprotect and free-page-hint cannot work together");
+        virtio_cleanup(vdev);
+        return;
+    }
+
     ret = qemu_add_balloon_handler(virtio_balloon_to_target,
                                    virtio_balloon_stat, s);
 
@@ -924,6 +937,7 @@ static Property virtio_balloon_properties[] = {
                      qemu_4_0_config_size, false),
     DEFINE_PROP_LINK("iothread", VirtIOBalloon, iothread, TYPE_IOTHREAD,
                      IOThread *),
+    DEFINE_PROP_BOOL("mprotect", VirtIOBalloon, do_mprotect, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index b47e563..ffeb4a6 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -104,6 +104,8 @@ typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque);
 
 int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque);
 int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length);
+int __ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length,
+                              bool do_mprotect);
 
 #endif
 
diff --git a/include/hw/virtio/virtio-balloon.h 
b/include/hw/virtio/virtio-balloon.h
index d1c968d..6f15085 100644
--- a/include/hw/virtio/virtio-balloon.h
+++ b/include/hw/virtio/virtio-balloon.h
@@ -70,6 +70,8 @@ typedef struct VirtIOBalloon {
     uint32_t host_features;
 
     bool qemu_4_0_config_size;
+
+    bool do_mprotect;
 } VirtIOBalloon;
 
 #endif
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 9bd3dcf..1bcb4e4 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -458,6 +458,7 @@ void sigaction_invoke(struct sigaction *action,
 
 int qemu_madvise(void *addr, size_t len, int advice);
 int qemu_mprotect_rwx(void *addr, size_t size);
+int qemu_mprotect_rw(void *addr, size_t size);
 int qemu_mprotect_none(void *addr, size_t size);
 
 int qemu_open(const char *name, int flags, ...);
diff --git a/util/osdep.c b/util/osdep.c
index 4829c07..2aff13b 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -105,6 +105,15 @@ int qemu_mprotect_rwx(void *addr, size_t size)
 #endif
 }
 
+int qemu_mprotect_rw(void *addr, size_t size)
+{
+#ifdef _WIN32
+    return qemu_mprotect__osdep(addr, size, PAGE_EXECUTE_READWRITE);
+#else
+    return qemu_mprotect__osdep(addr, size, PROT_READ | PROT_WRITE);
+#endif
+}
+
 int qemu_mprotect_none(void *addr, size_t size)
 {
 #ifdef _WIN32
-- 
2.7.4




reply via email to

[Prev in Thread] Current Thread [Next in Thread]