Message ID | 1453361977-19589-4-git-send-email-aik@ozlabs.ru (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Thu, Jan 21, 2016 at 06:39:34PM +1100, Alexey Kardashevskiy wrote: > At the moment pages used for TCE tables (in addition to pages addressed > by TCEs) are not counted in locked_vm counter so a malicious userspace > tool can call ioctl(KVM_CREATE_SPAPR_TCE) as many times as RLIMIT_NOFILE and > lock a lot of memory. > > This adds counting for pages used for TCE tables. > > This counts the number of pages required for a table plus pages for > the kvmppc_spapr_tce_table struct (TCE table descriptor) itself. > > This changes release_spapr_tce_table() to store @npages on stack to > avoid calling kvmppc_stt_npages() in the loop (tiny optimization, > probably). > > This does not change the amount of (de)allocated memory. > > Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> > --- > Changes: > v2: > * switched from long to unsigned long types > * added WARN_ON_ONCE() in locked_vm decrement case > --- > arch/powerpc/kvm/book3s_64_vio.c | 55 +++++++++++++++++++++++++++++++++++++--- > 1 file changed, 52 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c > index 9526c34..ea498b4 100644 > --- a/arch/powerpc/kvm/book3s_64_vio.c > +++ b/arch/powerpc/kvm/book3s_64_vio.c > @@ -39,19 +39,62 @@ > > #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) > > -static long kvmppc_stt_npages(unsigned long window_size) > +static unsigned long kvmppc_stt_npages(unsigned long window_size) > { > return ALIGN((window_size >> SPAPR_TCE_SHIFT) > * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; > } > > +static long kvmppc_account_memlimit(unsigned long npages, bool inc) > +{ > + long ret = 0; > + const unsigned long bytes = sizeof(struct kvmppc_spapr_tce_table) + > + (npages * sizeof(struct page *)); > + const unsigned long stt_pages = ALIGN(bytes, PAGE_SIZE) / PAGE_SIZE; Urgh, this is made pretty hard to follow by the fact that in some places npages / stt_pages refers to the number of pages occupied by the actual TCE tables, and in other places 
to the number of pages occupied by the overhead data structures. Please use different (and consistent) variables for the two things to make this clearer. It also seems odd the calculation of the overhead pages is done here, but the base number of pages is calculated in the caller, even though both quantities come from the stt structure itself. > + if (!current || !current->mm) > + return ret; /* process exited */ > + > + npages += stt_pages; > + > + down_write(&current->mm->mmap_sem); > + > + if (inc) { > + unsigned long locked, lock_limit; > + > + locked = current->mm->locked_vm + npages; > + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; > + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) > + ret = -ENOMEM; > + else > + current->mm->locked_vm += npages; > + } else { > + if (WARN_ON_ONCE(npages > current->mm->locked_vm)) > + npages = current->mm->locked_vm; > + > + current->mm->locked_vm -= npages; > + } > + > + pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid, > + inc ? '+' : '-', > + npages << PAGE_SHIFT, > + current->mm->locked_vm << PAGE_SHIFT, > + rlimit(RLIMIT_MEMLOCK), > + ret ? 
" - exceeded" : ""); > + > + up_write(&current->mm->mmap_sem); > + > + return ret; > +} > + > static void release_spapr_tce_table(struct rcu_head *head) > { > struct kvmppc_spapr_tce_table *stt = container_of(head, > struct kvmppc_spapr_tce_table, rcu); > int i; > + unsigned long npages = kvmppc_stt_npages(stt->window_size); > > - for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) > + for (i = 0; i < npages; i++) > __free_page(stt->pages[i]); > > kfree(stt); > @@ -89,6 +132,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) > > kvm_put_kvm(stt->kvm); > > + kvmppc_account_memlimit(kvmppc_stt_npages(stt->window_size), false); > call_rcu(&stt->rcu, release_spapr_tce_table); > > return 0; > @@ -103,7 +147,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > struct kvm_create_spapr_tce *args) > { > struct kvmppc_spapr_tce_table *stt = NULL; > - long npages; > + unsigned long npages; > int ret = -ENOMEM; > int i; > > @@ -114,6 +158,11 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > } > > npages = kvmppc_stt_npages(args->window_size); > + ret = kvmppc_account_memlimit(npages, true); > + if (ret) { > + stt = NULL; > + goto fail; > + } > > stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), > GFP_KERNEL);
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 9526c34..ea498b4 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -39,19 +39,62 @@ #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) -static long kvmppc_stt_npages(unsigned long window_size) +static unsigned long kvmppc_stt_npages(unsigned long window_size) { return ALIGN((window_size >> SPAPR_TCE_SHIFT) * sizeof(u64), PAGE_SIZE) / PAGE_SIZE; } +static long kvmppc_account_memlimit(unsigned long npages, bool inc) +{ + long ret = 0; + const unsigned long bytes = sizeof(struct kvmppc_spapr_tce_table) + + (npages * sizeof(struct page *)); + const unsigned long stt_pages = ALIGN(bytes, PAGE_SIZE) / PAGE_SIZE; + + if (!current || !current->mm) + return ret; /* process exited */ + + npages += stt_pages; + + down_write(&current->mm->mmap_sem); + + if (inc) { + unsigned long locked, lock_limit; + + locked = current->mm->locked_vm + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + ret = -ENOMEM; + else + current->mm->locked_vm += npages; + } else { + if (WARN_ON_ONCE(npages > current->mm->locked_vm)) + npages = current->mm->locked_vm; + + current->mm->locked_vm -= npages; + } + + pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid, + inc ? '+' : '-', + npages << PAGE_SHIFT, + current->mm->locked_vm << PAGE_SHIFT, + rlimit(RLIMIT_MEMLOCK), + ret ? 
" - exceeded" : ""); + + up_write(&current->mm->mmap_sem); + + return ret; +} + static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, struct kvmppc_spapr_tce_table, rcu); int i; + unsigned long npages = kvmppc_stt_npages(stt->window_size); - for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++) + for (i = 0; i < npages; i++) __free_page(stt->pages[i]); kfree(stt); @@ -89,6 +132,7 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) kvm_put_kvm(stt->kvm); + kvmppc_account_memlimit(kvmppc_stt_npages(stt->window_size), false); call_rcu(&stt->rcu, release_spapr_tce_table); return 0; @@ -103,7 +147,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args) { struct kvmppc_spapr_tce_table *stt = NULL; - long npages; + unsigned long npages; int ret = -ENOMEM; int i; @@ -114,6 +158,11 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, } npages = kvmppc_stt_npages(args->window_size); + ret = kvmppc_account_memlimit(npages, true); + if (ret) { + stt = NULL; + goto fail; + } stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL);
At the moment pages used for TCE tables (in addition to pages addressed by TCEs) are not counted in locked_vm counter so a malicious userspace tool can call ioctl(KVM_CREATE_SPAPR_TCE) as many times as RLIMIT_NOFILE and lock a lot of memory. This adds counting for pages used for TCE tables. This counts the number of pages required for a table plus pages for the kvmppc_spapr_tce_table struct (TCE table descriptor) itself. This changes release_spapr_tce_table() to store @npages on stack to avoid calling kvmppc_stt_npages() in the loop (tiny optimization, probably). This does not change the amount of (de)allocated memory. Signed-off-by: Alexey Kardashevskiy <aik@ozlabs.ru> --- Changes: v2: * switched from long to unsigned long types * added WARN_ON_ONCE() in locked_vm decrement case --- arch/powerpc/kvm/book3s_64_vio.c | 55 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-)