diff mbox series

kvm: x86: Add memcg accounting to KVM allocations

Message ID 20190130204751.178301-1-bgardon@google.com (mailing list archive)
State New, archived
Headers show
Series kvm: x86: Add memcg accounting to KVM allocations | expand

Commit Message

Ben Gardon Jan. 30, 2019, 8:47 p.m. UTC
There are many KVM kernel memory allocations which are tied to the life of
the VM process and should be charged to the VM process's cgroup. If the
allocations aren't tied to the process, the OOM killer will not know
that killing the process will free the associated kernel memory.
Add __GFP_ACCOUNT flags to many of the allocations which are not yet being
charged to the VM process's cgroup.

Tested:
	Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch
	introduced no new failures.
	Ran a kernel memory accounting test which creates a VM to touch
	memory and then checks that the kernel memory is within certain
	bounds.
	With this patch, that memory accounting is more correct (see
	below).

There remain a few allocations which should be charged to the VM's
cgroup but are not. In x86, they include:
	vcpu->run
	vcpu->arch.pio_data
	kvm->coalesced_mmio_ring
These allocations are left unaccounted in this patch because they are
mapped to userspace, and accounting them to a cgroup causes problems. This
should be addressed in a future patch.

Signed-off-by: Ben Gardon <bgardon@google.com>
---
 arch/x86/kvm/hyperv.c     |  2 +-
 arch/x86/kvm/i8254.c      |  2 +-
 arch/x86/kvm/i8259.c      |  2 +-
 arch/x86/kvm/ioapic.c     |  2 +-
 arch/x86/kvm/lapic.c      |  7 ++++---
 arch/x86/kvm/mmu.c        |  6 +++---
 arch/x86/kvm/page_track.c |  2 +-
 arch/x86/kvm/svm.c        | 23 ++++++++++++-----------
 arch/x86/kvm/vmx/nested.c |  9 +++++++--
 arch/x86/kvm/vmx/vmx.c    | 23 +++++++++++++++++------
 arch/x86/kvm/x86.c        | 16 +++++++++-------
 virt/kvm/coalesced_mmio.c |  3 ++-
 virt/kvm/eventfd.c        |  7 ++++---
 virt/kvm/irqchip.c        |  4 ++--
 virt/kvm/kvm_main.c       | 18 +++++++++---------
 virt/kvm/vfio.c           |  4 ++--
 16 files changed, 76 insertions(+), 54 deletions(-)

Comments

Shakeel Butt Jan. 30, 2019, 9:28 p.m. UTC | #1
On Wed, Jan 30, 2019 at 12:48 PM Ben Gardon <bgardon@google.com> wrote:
>
> There are many KVM kernel memory allocations which are tied to the life of
> the VM process and should be charged to the VM process's cgroup. If the
> allocations aren't tied to the process, the OOM killer will not know
> that killing the process will free the associated kernel memory.
> Add __GFP_ACCOUNT flags to many of the allocations which are not yet being
> charged to the VM process's cgroup.
>
> Tested:
>         Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch
>         introduced no new failures.
>         Ran a kernel memory accounting test which creates a VM to touch
>         memory and then checks that the kernel memory is within certain
>         bounds.
>         With this patch that memory accounting is more (see below)
>         correct.
>
> There remain a few allocations which should be charged to the VM's
> cgroup but are not. In x86, they include:
>         vcpu->run
>         vcpu->arch.pio_data
>         kvm->coalesced_mmio_ring
> These allocations are unaccounted in this patch because they are mapped
> to userspace, and accounting them to a cgroup causes problems. This
> should be addressed in a future patch.

Yes, this is based on the assumption that memcg-charged kmem should
not be mapped to userspace, because PG_kmemcg is defined on the
page->_mapcount field.

It seems these fields are per-vcpu and each is PAGE_SIZE in size, which
I think should not be ignored as system overhead.

The easiest possible solution seems to be moving PG_kmemcg to the actual
page flags, but page flags are a very scarce resource. The other way is
to have an explicit interface similar to mem_cgroup_charge_skmem().
However, that would be error prone. For networking, skbuff
allocations/deallocations happen in one place, but that's not the case
here.

>
> Signed-off-by: Ben Gardon <bgardon@google.com>

Reviewed-by: Shakeel Butt <shakeelb@google.com>

> ---
>  arch/x86/kvm/hyperv.c     |  2 +-
>  arch/x86/kvm/i8254.c      |  2 +-
>  arch/x86/kvm/i8259.c      |  2 +-
>  arch/x86/kvm/ioapic.c     |  2 +-
>  arch/x86/kvm/lapic.c      |  7 ++++---
>  arch/x86/kvm/mmu.c        |  6 +++---
>  arch/x86/kvm/page_track.c |  2 +-
>  arch/x86/kvm/svm.c        | 23 ++++++++++++-----------
>  arch/x86/kvm/vmx/nested.c |  9 +++++++--
>  arch/x86/kvm/vmx/vmx.c    | 23 +++++++++++++++++------
>  arch/x86/kvm/x86.c        | 16 +++++++++-------
>  virt/kvm/coalesced_mmio.c |  3 ++-
>  virt/kvm/eventfd.c        |  7 ++++---
>  virt/kvm/irqchip.c        |  4 ++--
>  virt/kvm/kvm_main.c       | 18 +++++++++---------
>  virt/kvm/vfio.c           |  4 ++--
>  16 files changed, 76 insertions(+), 54 deletions(-)
>
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index c90a5352d158f..d51c09ca9f7bd 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -1729,7 +1729,7 @@ static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
>
>         mutex_lock(&hv->hv_lock);
>         ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
> -                       GFP_KERNEL);
> +                       GFP_KERNEL_ACCOUNT);
>         mutex_unlock(&hv->hv_lock);
>
>         if (ret >= 0)
> diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
> index af192895b1fc6..4a6dc54cc12be 100644
> --- a/arch/x86/kvm/i8254.c
> +++ b/arch/x86/kvm/i8254.c
> @@ -653,7 +653,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
>         pid_t pid_nr;
>         int ret;
>
> -       pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
> +       pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
>         if (!pit)
>                 return NULL;
>
> diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
> index bdcd4139eca92..8b38bb4868a65 100644
> --- a/arch/x86/kvm/i8259.c
> +++ b/arch/x86/kvm/i8259.c
> @@ -583,7 +583,7 @@ int kvm_pic_init(struct kvm *kvm)
>         struct kvm_pic *s;
>         int ret;
>
> -       s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
> +       s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
>         if (!s)
>                 return -ENOMEM;
>         spin_lock_init(&s->lock);
> diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
> index 4e822ad363f37..1add1bc881e22 100644
> --- a/arch/x86/kvm/ioapic.c
> +++ b/arch/x86/kvm/ioapic.c
> @@ -622,7 +622,7 @@ int kvm_ioapic_init(struct kvm *kvm)
>         struct kvm_ioapic *ioapic;
>         int ret;
>
> -       ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
> +       ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
>         if (!ioapic)
>                 return -ENOMEM;
>         spin_lock_init(&ioapic->lock);
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 9f089e2e09d02..a449a18df4bc3 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -181,7 +181,8 @@ static void recalculate_apic_map(struct kvm *kvm)
>                         max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
>
>         new = kvzalloc(sizeof(struct kvm_apic_map) +
> -                          sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL);
> +                          sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
> +                          GFP_KERNEL_ACCOUNT);
>
>         if (!new)
>                 goto out;
> @@ -2257,13 +2258,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
>         ASSERT(vcpu != NULL);
>         apic_debug("apic_init %d\n", vcpu->vcpu_id);
>
> -       apic = kzalloc(sizeof(*apic), GFP_KERNEL);
> +       apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
>         if (!apic)
>                 goto nomem;
>
>         vcpu->arch.apic = apic;
>
> -       apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
> +       apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
>         if (!apic->regs) {
>                 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
>                        vcpu->vcpu_id);
> diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
> index ce770b4462385..98b60d0fe4b76 100644
> --- a/arch/x86/kvm/mmu.c
> +++ b/arch/x86/kvm/mmu.c
> @@ -959,7 +959,7 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
>         if (cache->nobjs >= min)
>                 return 0;
>         while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
> -               obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
> +               obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
>                 if (!obj)
>                         return cache->nobjs >= min ? 0 : -ENOMEM;
>                 cache->objects[cache->nobjs++] = obj;
> @@ -3700,7 +3700,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
>
>                         u64 *lm_root;
>
> -                       lm_root = (void*)get_zeroed_page(GFP_KERNEL);
> +                       lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
>                         if (lm_root == NULL)
>                                 return 1;
>
> @@ -5496,7 +5496,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
>          * Therefore we need to allocate shadow page tables in the first
>          * 4GB of memory, which happens to fit the DMA32 zone.
>          */
> -       page = alloc_page(GFP_KERNEL | __GFP_DMA32);
> +       page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
>         if (!page)
>                 return -ENOMEM;
>
> diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
> index 3052a59a30655..fd04d462fdaee 100644
> --- a/arch/x86/kvm/page_track.c
> +++ b/arch/x86/kvm/page_track.c
> @@ -42,7 +42,7 @@ int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
>         for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
>                 slot->arch.gfn_track[i] =
>                         kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
> -                                GFP_KERNEL);
> +                                GFP_KERNEL_ACCOUNT);
>                 if (!slot->arch.gfn_track[i])
>                         goto track_free;
>         }
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> index 307e5bddb6d97..01cc4b1f7bfa0 100644
> --- a/arch/x86/kvm/svm.c
> +++ b/arch/x86/kvm/svm.c
> @@ -1797,7 +1797,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
>         if (size > PAGE_SIZE)
>                 pages = vmalloc(size);
>         else
> -               pages = kmalloc(size, GFP_KERNEL);
> +               pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
>
>         if (!pages)
>                 return NULL;
> @@ -1940,7 +1940,7 @@ static int avic_vm_init(struct kvm *kvm)
>                 return 0;
>
>         /* Allocating physical APIC ID table (4KB) */
> -       p_page = alloc_page(GFP_KERNEL);
> +       p_page = alloc_page(GFP_KERNEL_ACCOUNT);
>         if (!p_page)
>                 goto free_avic;
>
> @@ -1948,7 +1948,7 @@ static int avic_vm_init(struct kvm *kvm)
>         clear_page(page_address(p_page));
>
>         /* Allocating logical APIC ID table (4KB) */
> -       l_page = alloc_page(GFP_KERNEL);
> +       l_page = alloc_page(GFP_KERNEL_ACCOUNT);
>         if (!l_page)
>                 goto free_avic;
>
> @@ -2119,13 +2119,14 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
>         struct page *nested_msrpm_pages;
>         int err;
>
> -       svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
> +       svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
>         if (!svm) {
>                 err = -ENOMEM;
>                 goto out;
>         }
>
> -       svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
> +       svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
> +                                                    GFP_KERNEL_ACCOUNT);
>         if (!svm->vcpu.arch.guest_fpu) {
>                 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
>                 err = -ENOMEM;
> @@ -2137,19 +2138,19 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
>                 goto free_svm;
>
>         err = -ENOMEM;
> -       page = alloc_page(GFP_KERNEL);
> +       page = alloc_page(GFP_KERNEL_ACCOUNT);
>         if (!page)
>                 goto uninit;
>
> -       msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
> +       msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
>         if (!msrpm_pages)
>                 goto free_page1;
>
> -       nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
> +       nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
>         if (!nested_msrpm_pages)
>                 goto free_page2;
>
> -       hsave_page = alloc_page(GFP_KERNEL);
> +       hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
>         if (!hsave_page)
>                 goto free_page3;
>
> @@ -5196,7 +5197,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
>          * Allocating new amd_iommu_pi_data, which will get
>          * add to the per-vcpu ir_list.
>          */
> -       ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
> +       ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
>         if (!ir) {
>                 ret = -ENOMEM;
>                 goto out;
> @@ -6309,7 +6310,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
>         if (ret)
>                 return ret;
>
> -       data = kzalloc(sizeof(*data), GFP_KERNEL);
> +       data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
>         if (!data)
>                 return -ENOMEM;
>
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index 3170e291215d0..88d20904b16e0 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -4140,11 +4140,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
>         if (r < 0)
>                 goto out_vmcs02;
>
> -       vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
> +       vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
>         if (!vmx->nested.cached_vmcs12)
>                 goto out_cached_vmcs12;
>
> -       vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
> +       vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE,
> +                                                  GFP_KERNEL_ACCOUNT);
>         if (!vmx->nested.cached_shadow_vmcs12)
>                 goto out_cached_shadow_vmcs12;
>
> @@ -5686,6 +5687,10 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
>                 enable_shadow_vmcs = 0;
>         if (enable_shadow_vmcs) {
>                 for (i = 0; i < VMX_BITMAP_NR; i++) {
> +                       /*
> +                        * The vmx_bitmap is not tied to a VM and so should
> +                        * not be charged to a memcg.
> +                        */
>                         vmx_bitmap[i] = (unsigned long *)
>                                 __get_free_page(GFP_KERNEL);
>                         if (!vmx_bitmap[i]) {
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index 4d39f731bc332..6c2779cf9472c 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -245,6 +245,10 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
>
>         if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
>             !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
> +               /*
> +                * This allocation for vmx_l1d_flush_pages is not tied to a VM
> +                * lifetime and so should not be charged to a memcg.
> +                */
>                 page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
>                 if (!page)
>                         return -ENOMEM;
> @@ -2395,7 +2399,11 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
>         struct page *pages;
>         struct vmcs *vmcs;
>
> -       pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
> +       /*
> +        * Since the page for the VMCS is inherently tied to the VM lifetime,
> +        * we should charge this allocation to the VM's memcg.
> +        */
> +       pages = __alloc_pages_node(node, GFP_KERNEL_ACCOUNT, vmcs_config.order);
>         if (!pages)
>                 return NULL;
>         vmcs = page_address(pages);
> @@ -2442,7 +2450,8 @@ int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
>         loaded_vmcs_init(loaded_vmcs);
>
>         if (cpu_has_vmx_msr_bitmap()) {
> -               loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
> +               loaded_vmcs->msr_bitmap = (unsigned long *)
> +                               __get_free_page(GFP_KERNEL_ACCOUNT);
>                 if (!loaded_vmcs->msr_bitmap)
>                         goto out_vmcs;
>                 memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
> @@ -6680,14 +6689,16 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
>  static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>  {
>         int err;
> -       struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
> +       struct vcpu_vmx *vmx;
>         unsigned long *msr_bitmap;
>         int cpu;
>
> +       vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
>         if (!vmx)
>                 return ERR_PTR(-ENOMEM);
>
> -       vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
> +       vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
> +                       GFP_KERNEL_ACCOUNT);
>         if (!vmx->vcpu.arch.guest_fpu) {
>                 printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
>                 err = -ENOMEM;
> @@ -6709,12 +6720,12 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
>          * for the guest, etc.
>          */
>         if (enable_pml) {
> -               vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +               vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
>                 if (!vmx->pml_pg)
>                         goto uninit_vcpu;
>         }
>
> -       vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
> +       vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
>         BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
>                      > PAGE_SIZE);
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 02c8e095a2390..3aaa988c288aa 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -3877,7 +3877,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>                 r = -EINVAL;
>                 if (!lapic_in_kernel(vcpu))
>                         goto out;
> -               u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
> +               u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
> +                               GFP_KERNEL_ACCOUNT);
>
>                 r = -ENOMEM;
>                 if (!u.lapic)
> @@ -4064,7 +4065,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>                 break;
>         }
>         case KVM_GET_XSAVE: {
> -               u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
> +               u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
>                 r = -ENOMEM;
>                 if (!u.xsave)
>                         break;
> @@ -4088,7 +4089,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>                 break;
>         }
>         case KVM_GET_XCRS: {
> -               u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
> +               u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
>                 r = -ENOMEM;
>                 if (!u.xcrs)
>                         break;
> @@ -9024,14 +9025,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
>                 static_key_slow_inc(&kvm_no_apic_vcpu);
>
>         vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
> -                                      GFP_KERNEL);
> +                                      GFP_KERNEL_ACCOUNT);
>         if (!vcpu->arch.mce_banks) {
>                 r = -ENOMEM;
>                 goto fail_free_lapic;
>         }
>         vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
>
> -       if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
> +       if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
> +                               GFP_KERNEL_ACCOUNT)) {
>                 r = -ENOMEM;
>                 goto fail_free_mce_banks;
>         }
> @@ -9290,13 +9292,13 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
>
>                 slot->arch.rmap[i] =
>                         kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
> -                                GFP_KERNEL);
> +                                GFP_KERNEL_ACCOUNT);
>                 if (!slot->arch.rmap[i])
>                         goto out_free;
>                 if (i == 0)
>                         continue;
>
> -               linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
> +               linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
>                 if (!linfo)
>                         goto out_free;
>
> diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
> index 6855cce3e5287..5294abb3f1788 100644
> --- a/virt/kvm/coalesced_mmio.c
> +++ b/virt/kvm/coalesced_mmio.c
> @@ -144,7 +144,8 @@ int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
>         if (zone->pio != 1 && zone->pio != 0)
>                 return -EINVAL;
>
> -       dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
> +       dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev),
> +                     GFP_KERNEL_ACCOUNT);
>         if (!dev)
>                 return -ENOMEM;
>
> diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
> index b20b751286fc6..4325250afd728 100644
> --- a/virt/kvm/eventfd.c
> +++ b/virt/kvm/eventfd.c
> @@ -297,7 +297,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
>         if (!kvm_arch_intc_initialized(kvm))
>                 return -EAGAIN;
>
> -       irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
> +       irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
>         if (!irqfd)
>                 return -ENOMEM;
>
> @@ -345,7 +345,8 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
>                 }
>
>                 if (!irqfd->resampler) {
> -                       resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
> +                       resampler = kzalloc(sizeof(*resampler),
> +                                           GFP_KERNEL_ACCOUNT);
>                         if (!resampler) {
>                                 ret = -ENOMEM;
>                                 mutex_unlock(&kvm->irqfds.resampler_lock);
> @@ -797,7 +798,7 @@ static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
>         if (IS_ERR(eventfd))
>                 return PTR_ERR(eventfd);
>
> -       p = kzalloc(sizeof(*p), GFP_KERNEL);
> +       p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
>         if (!p) {
>                 ret = -ENOMEM;
>                 goto fail;
> diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
> index b1286c4e07122..3547b0d8c91ea 100644
> --- a/virt/kvm/irqchip.c
> +++ b/virt/kvm/irqchip.c
> @@ -196,7 +196,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
>         nr_rt_entries += 1;
>
>         new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)),
> -                     GFP_KERNEL);
> +                     GFP_KERNEL_ACCOUNT);
>
>         if (!new)
>                 return -ENOMEM;
> @@ -208,7 +208,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
>
>         for (i = 0; i < nr; ++i) {
>                 r = -ENOMEM;
> -               e = kzalloc(sizeof(*e), GFP_KERNEL);
> +               e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT);
>                 if (!e)
>                         goto out;
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 1f888a103f788..0d041a856da0d 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -525,7 +525,7 @@ static struct kvm_memslots *kvm_alloc_memslots(void)
>         int i;
>         struct kvm_memslots *slots;
>
> -       slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
> +       slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
>         if (!slots)
>                 return NULL;
>
> @@ -601,12 +601,12 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
>
>         kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
>                                          sizeof(*kvm->debugfs_stat_data),
> -                                        GFP_KERNEL);
> +                                        GFP_KERNEL_ACCOUNT);
>         if (!kvm->debugfs_stat_data)
>                 return -ENOMEM;
>
>         for (p = debugfs_entries; p->name; p++) {
> -               stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
> +               stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
>                 if (!stat_data)
>                         return -ENOMEM;
>
> @@ -671,7 +671,7 @@ static struct kvm *kvm_create_vm(unsigned long type)
>                 goto out_err_no_irq_srcu;
>         for (i = 0; i < KVM_NR_BUSES; i++) {
>                 rcu_assign_pointer(kvm->buses[i],
> -                       kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
> +                       kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
>                 if (!kvm->buses[i])
>                         goto out_err;
>         }
> @@ -789,7 +789,7 @@ static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
>  {
>         unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
>
> -       memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
> +       memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
>         if (!memslot->dirty_bitmap)
>                 return -ENOMEM;
>
> @@ -1018,7 +1018,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
>                         goto out_free;
>         }
>
> -       slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
> +       slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
>         if (!slots)
>                 goto out_free;
>         memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
> @@ -2975,7 +2975,7 @@ static int kvm_ioctl_create_device(struct kvm *kvm,
>         if (test)
>                 return 0;
>
> -       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
> +       dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
>         if (!dev)
>                 return -ENOMEM;
>
> @@ -3709,7 +3709,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
>                 return -ENOSPC;
>
>         new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
> -                         sizeof(struct kvm_io_range)), GFP_KERNEL);
> +                         sizeof(struct kvm_io_range)), GFP_KERNEL_ACCOUNT);
>         if (!new_bus)
>                 return -ENOMEM;
>
> @@ -3755,7 +3755,7 @@ void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
>                 return;
>
>         new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
> -                         sizeof(struct kvm_io_range)), GFP_KERNEL);
> +                         sizeof(struct kvm_io_range)), GFP_KERNEL_ACCOUNT);
>         if (!new_bus)  {
>                 pr_err("kvm: failed to shrink bus, removing it completely\n");
>                 goto broken;
> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
> index d99850c462a18..524cbd20379fb 100644
> --- a/virt/kvm/vfio.c
> +++ b/virt/kvm/vfio.c
> @@ -219,7 +219,7 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
>                         }
>                 }
>
> -               kvg = kzalloc(sizeof(*kvg), GFP_KERNEL);
> +               kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT);
>                 if (!kvg) {
>                         mutex_unlock(&kv->lock);
>                         kvm_vfio_group_put_external_user(vfio_group);
> @@ -405,7 +405,7 @@ static int kvm_vfio_create(struct kvm_device *dev, u32 type)
>                 if (tmp->ops == &kvm_vfio_ops)
>                         return -EBUSY;
>
> -       kv = kzalloc(sizeof(*kv), GFP_KERNEL);
> +       kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT);
>         if (!kv)
>                 return -ENOMEM;
>
> --
> 2.20.1.495.gaa96b0ce6b-goog
>
Shakeel Butt Jan. 30, 2019, 9:40 p.m. UTC | #2
On Wed, Jan 30, 2019 at 1:28 PM Shakeel Butt <shakeelb@google.com> wrote:
>
> On Wed, Jan 30, 2019 at 12:48 PM Ben Gardon <bgardon@google.com> wrote:
> >
> > There are many KVM kernel memory allocations which are tied to the life of
> > the VM process and should be charged to the VM process's cgroup. If the
> > allocations aren't tied to the process, the OOM killer will not know
> > that killing the process will free the associated kernel memory.
> > Add __GFP_ACCOUNT flags to many of the allocations which are not yet being
> > charged to the VM process's cgroup.
> >
> > Tested:
> >         Ran all kvm-unit-tests on a 64 bit Haswell machine, the patch
> >         introduced no new failures.
> >         Ran a kernel memory accounting test which creates a VM to touch
> >         memory and then checks that the kernel memory is within certain
> >         bounds.
> >         With this patch that memory accounting is more (see below)
> >         correct.
> >
> > There remain a few allocations which should be charged to the VM's
> > cgroup but are not. In x86, they include:
> >         vcpu->run
> >         vcpu->arch.pio_data
> >         kvm->coalesced_mmio_ring
> > These allocations are unaccounted in this patch because they are mapped
> > to userspace, and accounting them to a cgroup causes problems. This
> > should be addressed in a future patch.
>
> Yes, this is based on the assumption that memcg charged kmem should
> not be mapped to userspace and PG_kmemcg is define on page->_mapcount
> field.
>
> Seems like these fields are per-vcpu and are of PAGE_SIZE which I
> think should not be ignored as system overhead.
>
> The easiest possible solution seems like to move PG_kmemcg to actual
> page flags but page flags are very rare resource. The other way is to
> have an explicit interface similar to mem_cgroup_charge_skmem().
> However that would be error prone. For network skbuff
> allocations/deallocations are at one place but that's not the case
> here.
>
> >
> > Signed-off-by: Ben Gardon <bgardon@google.com>
>
> Reviewed-by: Shakeel Butt <shakeelb@google.com>

BTW, there are kmem_cache allocations whose kmem_caches already have
the SLAB_ACCOUNT flag, so there is no need to add the GFP_KERNEL_ACCOUNT
flag to those allocations, but I think it's fine as this will remove any
confusion.

Shakeel
diff mbox series

Patch

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index c90a5352d158f..d51c09ca9f7bd 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -1729,7 +1729,7 @@  static int kvm_hv_eventfd_assign(struct kvm *kvm, u32 conn_id, int fd)
 
 	mutex_lock(&hv->hv_lock);
 	ret = idr_alloc(&hv->conn_to_evt, eventfd, conn_id, conn_id + 1,
-			GFP_KERNEL);
+			GFP_KERNEL_ACCOUNT);
 	mutex_unlock(&hv->hv_lock);
 
 	if (ret >= 0)
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index af192895b1fc6..4a6dc54cc12be 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -653,7 +653,7 @@  struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 	pid_t pid_nr;
 	int ret;
 
-	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL);
+	pit = kzalloc(sizeof(struct kvm_pit), GFP_KERNEL_ACCOUNT);
 	if (!pit)
 		return NULL;
 
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index bdcd4139eca92..8b38bb4868a65 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -583,7 +583,7 @@  int kvm_pic_init(struct kvm *kvm)
 	struct kvm_pic *s;
 	int ret;
 
-	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
+	s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL_ACCOUNT);
 	if (!s)
 		return -ENOMEM;
 	spin_lock_init(&s->lock);
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index 4e822ad363f37..1add1bc881e22 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -622,7 +622,7 @@  int kvm_ioapic_init(struct kvm *kvm)
 	struct kvm_ioapic *ioapic;
 	int ret;
 
-	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL);
+	ioapic = kzalloc(sizeof(struct kvm_ioapic), GFP_KERNEL_ACCOUNT);
 	if (!ioapic)
 		return -ENOMEM;
 	spin_lock_init(&ioapic->lock);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9f089e2e09d02..a449a18df4bc3 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -181,7 +181,8 @@  static void recalculate_apic_map(struct kvm *kvm)
 			max_id = max(max_id, kvm_x2apic_id(vcpu->arch.apic));
 
 	new = kvzalloc(sizeof(struct kvm_apic_map) +
-	                   sizeof(struct kvm_lapic *) * ((u64)max_id + 1), GFP_KERNEL);
+	                   sizeof(struct kvm_lapic *) * ((u64)max_id + 1),
+			   GFP_KERNEL_ACCOUNT);
 
 	if (!new)
 		goto out;
@@ -2257,13 +2258,13 @@  int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	ASSERT(vcpu != NULL);
 	apic_debug("apic_init %d\n", vcpu->vcpu_id);
 
-	apic = kzalloc(sizeof(*apic), GFP_KERNEL);
+	apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT);
 	if (!apic)
 		goto nomem;
 
 	vcpu->arch.apic = apic;
 
-	apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
+	apic->regs = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 	if (!apic->regs) {
 		printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
 		       vcpu->vcpu_id);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index ce770b4462385..98b60d0fe4b76 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -959,7 +959,7 @@  static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	if (cache->nobjs >= min)
 		return 0;
 	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
-		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
+		obj = kmem_cache_zalloc(base_cache, GFP_KERNEL_ACCOUNT);
 		if (!obj)
 			return cache->nobjs >= min ? 0 : -ENOMEM;
 		cache->objects[cache->nobjs++] = obj;
@@ -3700,7 +3700,7 @@  static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 
 			u64 *lm_root;
 
-			lm_root = (void*)get_zeroed_page(GFP_KERNEL);
+			lm_root = (void*)get_zeroed_page(GFP_KERNEL_ACCOUNT);
 			if (lm_root == NULL)
 				return 1;
 
@@ -5496,7 +5496,7 @@  static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 	 * Therefore we need to allocate shadow page tables in the first
 	 * 4GB of memory, which happens to fit the DMA32 zone.
 	 */
-	page = alloc_page(GFP_KERNEL | __GFP_DMA32);
+	page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32);
 	if (!page)
 		return -ENOMEM;
 
diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c
index 3052a59a30655..fd04d462fdaee 100644
--- a/arch/x86/kvm/page_track.c
+++ b/arch/x86/kvm/page_track.c
@@ -42,7 +42,7 @@  int kvm_page_track_create_memslot(struct kvm_memory_slot *slot,
 	for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
 		slot->arch.gfn_track[i] =
 			kvcalloc(npages, sizeof(*slot->arch.gfn_track[i]),
-				 GFP_KERNEL);
+				 GFP_KERNEL_ACCOUNT);
 		if (!slot->arch.gfn_track[i])
 			goto track_free;
 	}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 307e5bddb6d97..01cc4b1f7bfa0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1797,7 +1797,7 @@  static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
 	if (size > PAGE_SIZE)
 		pages = vmalloc(size);
 	else
-		pages = kmalloc(size, GFP_KERNEL);
+		pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
 
 	if (!pages)
 		return NULL;
@@ -1940,7 +1940,7 @@  static int avic_vm_init(struct kvm *kvm)
 		return 0;
 
 	/* Allocating physical APIC ID table (4KB) */
-	p_page = alloc_page(GFP_KERNEL);
+	p_page = alloc_page(GFP_KERNEL_ACCOUNT);
 	if (!p_page)
 		goto free_avic;
 
@@ -1948,7 +1948,7 @@  static int avic_vm_init(struct kvm *kvm)
 	clear_page(page_address(p_page));
 
 	/* Allocating logical APIC ID table (4KB) */
-	l_page = alloc_page(GFP_KERNEL);
+	l_page = alloc_page(GFP_KERNEL_ACCOUNT);
 	if (!l_page)
 		goto free_avic;
 
@@ -2119,13 +2119,14 @@  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	struct page *nested_msrpm_pages;
 	int err;
 
-	svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
 	if (!svm) {
 		err = -ENOMEM;
 		goto out;
 	}
 
-	svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
+	svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+						     GFP_KERNEL_ACCOUNT);
 	if (!svm->vcpu.arch.guest_fpu) {
 		printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
 		err = -ENOMEM;
@@ -2137,19 +2138,19 @@  static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 		goto free_svm;
 
 	err = -ENOMEM;
-	page = alloc_page(GFP_KERNEL);
+	page = alloc_page(GFP_KERNEL_ACCOUNT);
 	if (!page)
 		goto uninit;
 
-	msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+	msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
 	if (!msrpm_pages)
 		goto free_page1;
 
-	nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+	nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
 	if (!nested_msrpm_pages)
 		goto free_page2;
 
-	hsave_page = alloc_page(GFP_KERNEL);
+	hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
 	if (!hsave_page)
 		goto free_page3;
 
@@ -5196,7 +5197,7 @@  static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
 	 * Allocating new amd_iommu_pi_data, which will get
 	 * add to the per-vcpu ir_list.
 	 */
-	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+	ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
 	if (!ir) {
 		ret = -ENOMEM;
 		goto out;
@@ -6309,7 +6310,7 @@  static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error)
 	if (ret)
 		return ret;
 
-	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
 	if (!data)
 		return -ENOMEM;
 
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3170e291215d0..88d20904b16e0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -4140,11 +4140,12 @@  static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	if (r < 0)
 		goto out_vmcs02;
 
-	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+	vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
 	if (!vmx->nested.cached_vmcs12)
 		goto out_cached_vmcs12;
 
-	vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+	vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE,
+						   GFP_KERNEL_ACCOUNT);
 	if (!vmx->nested.cached_shadow_vmcs12)
 		goto out_cached_shadow_vmcs12;
 
@@ -5686,6 +5687,10 @@  __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 		enable_shadow_vmcs = 0;
 	if (enable_shadow_vmcs) {
 		for (i = 0; i < VMX_BITMAP_NR; i++) {
+			/*
+			 * The vmx_bitmap is not tied to a VM and so should
+			 * not be charged to a memcg.
+			 */
 			vmx_bitmap[i] = (unsigned long *)
 				__get_free_page(GFP_KERNEL);
 			if (!vmx_bitmap[i]) {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 4d39f731bc332..6c2779cf9472c 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -245,6 +245,10 @@  static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
 
 	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
 	    !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
+		/*
+		 * This allocation for vmx_l1d_flush_pages is not tied to a VM
+		 * lifetime and so should not be charged to a memcg.
+		 */
 		page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
 		if (!page)
 			return -ENOMEM;
@@ -2395,7 +2399,11 @@  struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
 	struct page *pages;
 	struct vmcs *vmcs;
 
-	pages = __alloc_pages_node(node, GFP_KERNEL, vmcs_config.order);
+	/*
+	 * Since the page for the VMCS is inherently tied to the VM lifetime,
+	 * we should charge this allocation to the VM's memcg.
+	 */
+	pages = __alloc_pages_node(node, GFP_KERNEL_ACCOUNT, vmcs_config.order);
 	if (!pages)
 		return NULL;
 	vmcs = page_address(pages);
@@ -2442,7 +2450,8 @@  int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 	loaded_vmcs_init(loaded_vmcs);
 
 	if (cpu_has_vmx_msr_bitmap()) {
-		loaded_vmcs->msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+		loaded_vmcs->msr_bitmap = (unsigned long *)
+				__get_free_page(GFP_KERNEL_ACCOUNT);
 		if (!loaded_vmcs->msr_bitmap)
 			goto out_vmcs;
 		memset(loaded_vmcs->msr_bitmap, 0xff, PAGE_SIZE);
@@ -6680,14 +6689,16 @@  static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 	int err;
-	struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	struct vcpu_vmx *vmx;
 	unsigned long *msr_bitmap;
 	int cpu;
 
+	vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
 	if (!vmx)
 		return ERR_PTR(-ENOMEM);
 
-	vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache, GFP_KERNEL);
+	vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+			GFP_KERNEL_ACCOUNT);
 	if (!vmx->vcpu.arch.guest_fpu) {
 		printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
 		err = -ENOMEM;
@@ -6709,12 +6720,12 @@  static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	 * for the guest, etc.
 	 */
 	if (enable_pml) {
-		vmx->pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
 		if (!vmx->pml_pg)
 			goto uninit_vcpu;
 	}
 
-	vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
 	BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
 		     > PAGE_SIZE);
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 02c8e095a2390..3aaa988c288aa 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3877,7 +3877,8 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		if (!lapic_in_kernel(vcpu))
 			goto out;
-		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
+				GFP_KERNEL_ACCOUNT);
 
 		r = -ENOMEM;
 		if (!u.lapic)
@@ -4064,7 +4065,7 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_XSAVE: {
-		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
 		r = -ENOMEM;
 		if (!u.xsave)
 			break;
@@ -4088,7 +4089,7 @@  long kvm_arch_vcpu_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_GET_XCRS: {
-		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
 		r = -ENOMEM;
 		if (!u.xcrs)
 			break;
@@ -9024,14 +9025,15 @@  int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		static_key_slow_inc(&kvm_no_apic_vcpu);
 
 	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
-				       GFP_KERNEL);
+				       GFP_KERNEL_ACCOUNT);
 	if (!vcpu->arch.mce_banks) {
 		r = -ENOMEM;
 		goto fail_free_lapic;
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
-	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) {
+	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+				GFP_KERNEL_ACCOUNT)) {
 		r = -ENOMEM;
 		goto fail_free_mce_banks;
 	}
@@ -9290,13 +9292,13 @@  int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 		slot->arch.rmap[i] =
 			kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
-				 GFP_KERNEL);
+				 GFP_KERNEL_ACCOUNT);
 		if (!slot->arch.rmap[i])
 			goto out_free;
 		if (i == 0)
 			continue;
 
-		linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL);
+		linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
 		if (!linfo)
 			goto out_free;
 
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 6855cce3e5287..5294abb3f1788 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -144,7 +144,8 @@  int kvm_vm_ioctl_register_coalesced_mmio(struct kvm *kvm,
 	if (zone->pio != 1 && zone->pio != 0)
 		return -EINVAL;
 
-	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev), GFP_KERNEL);
+	dev = kzalloc(sizeof(struct kvm_coalesced_mmio_dev),
+		      GFP_KERNEL_ACCOUNT);
 	if (!dev)
 		return -ENOMEM;
 
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index b20b751286fc6..4325250afd728 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -297,7 +297,7 @@  kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	if (!kvm_arch_intc_initialized(kvm))
 		return -EAGAIN;
 
-	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL);
+	irqfd = kzalloc(sizeof(*irqfd), GFP_KERNEL_ACCOUNT);
 	if (!irqfd)
 		return -ENOMEM;
 
@@ -345,7 +345,8 @@  kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 		}
 
 		if (!irqfd->resampler) {
-			resampler = kzalloc(sizeof(*resampler), GFP_KERNEL);
+			resampler = kzalloc(sizeof(*resampler),
+					    GFP_KERNEL_ACCOUNT);
 			if (!resampler) {
 				ret = -ENOMEM;
 				mutex_unlock(&kvm->irqfds.resampler_lock);
@@ -797,7 +798,7 @@  static int kvm_assign_ioeventfd_idx(struct kvm *kvm,
 	if (IS_ERR(eventfd))
 		return PTR_ERR(eventfd);
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
 	if (!p) {
 		ret = -ENOMEM;
 		goto fail;
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index b1286c4e07122..3547b0d8c91ea 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -196,7 +196,7 @@  int kvm_set_irq_routing(struct kvm *kvm,
 	nr_rt_entries += 1;
 
 	new = kzalloc(sizeof(*new) + (nr_rt_entries * sizeof(struct hlist_head)),
-		      GFP_KERNEL);
+		      GFP_KERNEL_ACCOUNT);
 
 	if (!new)
 		return -ENOMEM;
@@ -208,7 +208,7 @@  int kvm_set_irq_routing(struct kvm *kvm,
 
 	for (i = 0; i < nr; ++i) {
 		r = -ENOMEM;
-		e = kzalloc(sizeof(*e), GFP_KERNEL);
+		e = kzalloc(sizeof(*e), GFP_KERNEL_ACCOUNT);
 		if (!e)
 			goto out;
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1f888a103f788..0d041a856da0d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -525,7 +525,7 @@  static struct kvm_memslots *kvm_alloc_memslots(void)
 	int i;
 	struct kvm_memslots *slots;
 
-	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
 	if (!slots)
 		return NULL;
 
@@ -601,12 +601,12 @@  static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
 
 	kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
 					 sizeof(*kvm->debugfs_stat_data),
-					 GFP_KERNEL);
+					 GFP_KERNEL_ACCOUNT);
 	if (!kvm->debugfs_stat_data)
 		return -ENOMEM;
 
 	for (p = debugfs_entries; p->name; p++) {
-		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL);
+		stat_data = kzalloc(sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
 		if (!stat_data)
 			return -ENOMEM;
 
@@ -671,7 +671,7 @@  static struct kvm *kvm_create_vm(unsigned long type)
 		goto out_err_no_irq_srcu;
 	for (i = 0; i < KVM_NR_BUSES; i++) {
 		rcu_assign_pointer(kvm->buses[i],
-			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL));
+			kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
 		if (!kvm->buses[i])
 			goto out_err;
 	}
@@ -789,7 +789,7 @@  static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
 	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);
 
-	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL);
+	memslot->dirty_bitmap = kvzalloc(dirty_bytes, GFP_KERNEL_ACCOUNT);
 	if (!memslot->dirty_bitmap)
 		return -ENOMEM;
 
@@ -1018,7 +1018,7 @@  int __kvm_set_memory_region(struct kvm *kvm,
 			goto out_free;
 	}
 
-	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+	slots = kvzalloc(sizeof(struct kvm_memslots), GFP_KERNEL_ACCOUNT);
 	if (!slots)
 		goto out_free;
 	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));
@@ -2975,7 +2975,7 @@  static int kvm_ioctl_create_device(struct kvm *kvm,
 	if (test)
 		return 0;
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL_ACCOUNT);
 	if (!dev)
 		return -ENOMEM;
 
@@ -3709,7 +3709,7 @@  int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		return -ENOSPC;
 
 	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
-			  sizeof(struct kvm_io_range)), GFP_KERNEL);
+			  sizeof(struct kvm_io_range)), GFP_KERNEL_ACCOUNT);
 	if (!new_bus)
 		return -ENOMEM;
 
@@ -3755,7 +3755,7 @@  void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 		return;
 
 	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
-			  sizeof(struct kvm_io_range)), GFP_KERNEL);
+			  sizeof(struct kvm_io_range)), GFP_KERNEL_ACCOUNT);
 	if (!new_bus)  {
 		pr_err("kvm: failed to shrink bus, removing it completely\n");
 		goto broken;
diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c
index d99850c462a18..524cbd20379fb 100644
--- a/virt/kvm/vfio.c
+++ b/virt/kvm/vfio.c
@@ -219,7 +219,7 @@  static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg)
 			}
 		}
 
-		kvg = kzalloc(sizeof(*kvg), GFP_KERNEL);
+		kvg = kzalloc(sizeof(*kvg), GFP_KERNEL_ACCOUNT);
 		if (!kvg) {
 			mutex_unlock(&kv->lock);
 			kvm_vfio_group_put_external_user(vfio_group);
@@ -405,7 +405,7 @@  static int kvm_vfio_create(struct kvm_device *dev, u32 type)
 		if (tmp->ops == &kvm_vfio_ops)
 			return -EBUSY;
 
-	kv = kzalloc(sizeof(*kv), GFP_KERNEL);
+	kv = kzalloc(sizeof(*kv), GFP_KERNEL_ACCOUNT);
 	if (!kv)
 		return -ENOMEM;