From patchwork Thu Jul 29 16:25:09 2010
X-Patchwork-Submitter: Tom Tucker
X-Patchwork-Id: 115245
From: Tom Tucker
Subject: [RFC PATCH 2/4] uverbs: Add common ib_iomem_get service
To: rdreier@cisco.com
Cc: linux-rdma@vger.kernel.org, brandt@sandia.gov, tom@ogc.us, swise@ogc.us
Date: Thu, 29 Jul 2010 11:25:09 -0500
Message-ID: <20100729162509.14674.34237.stgit@build.ogc.int>
In-Reply-To: <20100729162339.14674.15788.stgit@build.ogc.int>
References: <20100729162339.14674.15788.stgit@build.ogc.int>
User-Agent: StGIT/0.14.3
X-Mailing-List: linux-rdma@vger.kernel.org

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 415e186..f103956 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -52,16 +52,18 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
 	int i;
 
 	list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
-		ib_dma_unmap_sg(dev, chunk->page_list,
-				chunk->nents, DMA_BIDIRECTIONAL);
-		for (i = 0; i < chunk->nents; ++i) {
-			struct page *page = sg_page(&chunk->page_list[i]);
-
-			if (umem->writable && dirty)
-				set_page_dirty_lock(page);
-			put_page(page);
-		}
+		if (umem->type == IB_UMEM_MEM_MAP) {
+			ib_dma_unmap_sg(dev, chunk->page_list,
+					chunk->nents, DMA_BIDIRECTIONAL);
+			for (i = 0; i < chunk->nents; ++i) {
+				struct page *page =
+					sg_page(&chunk->page_list[i]);
+				if (umem->writable && dirty)
+					set_page_dirty_lock(page);
+				put_page(page);
+			}
+		}
 		kfree(chunk);
 	}
 }
@@ -150,7 +152,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 	while (npages) {
 		ret = get_user_pages(current, current->mm, cur_base,
 				     min_t(unsigned long, npages,
-					   PAGE_SIZE / sizeof (struct page *)),
+					   PAGE_SIZE / sizeof(struct page *)),
 				     1, !umem->writable, page_list, vma_list);
 
 		if (ret < 0)
@@ -162,7 +164,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 		off = 0;
 
 		while (ret) {
-			chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
+			chunk = kmalloc(sizeof *chunk +
+					sizeof(struct scatterlist) *
 					min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
 					GFP_KERNEL);
 			if (!chunk) {
@@ -292,3 +295,226 @@ int ib_umem_page_count(struct ib_umem *umem)
 	return n;
 }
 EXPORT_SYMBOL(ib_umem_page_count);
+/*
+ * Return the PFN for the specified address in the vma. This only
+ * works for a vma that is VM_PFNMAP.
+ */
+static unsigned long follow_io_pfn(struct vm_area_struct *vma,
+				   unsigned long address, int write)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	unsigned long pfn;
+	struct mm_struct *mm = vma->vm_mm;
+
+	BUG_ON(0 == (vma->vm_flags & VM_PFNMAP));
+
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		return 0;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud))
+		return 0;
+	if (unlikely(pud_bad(*pud)))
+		return 0;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd))
+		return 0;
+	if (unlikely(pmd_bad(*pmd)))
+		return 0;
+
+	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+	pte = *ptep;
+	if (!pte_present(pte))
+		goto bad;
+	if (write && !pte_write(pte))
+		goto bad;
+
+	pfn = pte_pfn(pte);
+	pte_unmap_unlock(ptep, ptl);
+	return pfn;
+ bad:
+	pte_unmap_unlock(ptep, ptl);
+	return 0;
+}
+
+int ib_get_io_pfn(struct task_struct *tsk, struct mm_struct *mm,
+		  unsigned long start, int len, int write, int force,
+		  unsigned long *pfn_list, struct vm_area_struct **vmas)
+{
+	unsigned long pfn;
+	int i;
+	if (len <= 0)
+		return 0;
+
+	i = 0;
+	do {
+		struct vm_area_struct *vma;
+
+		vma = find_vma(mm, start);
+		if (0 == (vma->vm_flags & VM_PFNMAP))
+			return -EINVAL;
+
+		if (0 == (vma->vm_flags & VM_IO))
+			return -EFAULT;
+
+		if (is_vm_hugetlb_page(vma))
+			return -EFAULT;
+
+		do {
+			cond_resched();
+			pfn = follow_io_pfn(vma, start, write);
+			if (!pfn)
+				return -EFAULT;
+			if (pfn_list)
+				pfn_list[i] = pfn;
+			if (vmas)
+				vmas[i] = vma;
+			i++;
+			start += PAGE_SIZE;
+			len--;
+		} while (len && start < vma->vm_end);
+	} while (len);
+	return i;
+}
+
+/**
+ * ib_iomem_get - DMA map a userspace map of IO memory.
+ * @context: userspace context to map memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to map
+ * @access: IB_ACCESS_xxx flags for memory being mapped
+ * @dmasync: flush in-flight DMA when the memory region is written
+ */
+struct ib_umem *ib_iomem_get(struct ib_ucontext *context, unsigned long addr,
+			     size_t size, int access, int dmasync)
+{
+	struct ib_umem *umem;
+	unsigned long *pfn_list;
+	struct ib_umem_chunk *chunk;
+	unsigned long locked;
+	unsigned long lock_limit;
+	unsigned long cur_base;
+	unsigned long npages;
+	int ret;
+	int off;
+	int i;
+	DEFINE_DMA_ATTRS(attrs);
+
+	if (dmasync)
+		dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+
+	if (!can_do_mlock())
+		return ERR_PTR(-EPERM);
+
+	umem = kmalloc(sizeof *umem, GFP_KERNEL);
+	if (!umem)
+		return ERR_PTR(-ENOMEM);
+
+	umem->type = IB_UMEM_IO_MAP;
+	umem->context = context;
+	umem->length = size;
+	umem->offset = addr & ~PAGE_MASK;
+	umem->page_size = PAGE_SIZE;
+	/*
+	 * We ask for writable memory if any access flags other than
+	 * "remote read" are set. "Local write" and "remote write"
+	 * obviously require write access. "Remote atomic" can do
+	 * things like fetch and add, which will modify memory, and
+	 * "MW bind" can change permissions by binding a window.
+	 */
+	umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
+
+	/* IO memory is not hugetlb memory */
+	umem->hugetlb = 0;
+
+	INIT_LIST_HEAD(&umem->chunk_list);
+
+	pfn_list = (unsigned long *) __get_free_page(GFP_KERNEL);
+	if (!pfn_list) {
+		kfree(umem);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;
+
+	down_write(&current->mm->mmap_sem);
+
+	locked = npages + current->mm->locked_vm;
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur
+		>> PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	cur_base = addr & PAGE_MASK;
+
+	ret = 0;
+	while (npages) {
+		ret = ib_get_io_pfn(current, current->mm, cur_base,
+				    min_t(unsigned long, npages,
+					  PAGE_SIZE / sizeof(unsigned long *)),
+				    umem->writable,
+				    !umem->writable, pfn_list, NULL);
+
+		if (ret < 0)
+			goto out;
+
+		cur_base += ret * PAGE_SIZE;
+		npages -= ret;
+
+		off = 0;
+
+		while (ret) {
+			chunk = kmalloc(sizeof *chunk +
+					sizeof(struct scatterlist) *
+					min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
+					GFP_KERNEL);
+			if (!chunk) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
+			sg_init_table(chunk->page_list, chunk->nents);
+			/* The pfn_list we built is a set of Page
+			 * Frame Numbers (PFN) whose physical address
+			 * is PFN << PAGE_SHIFT. The SG DMA mapping
+			 * services expect page addresses, not PFN,
+			 * therefore, we have to do the dma mapping
+			 * ourselves here. */
+			for (i = 0; i < chunk->nents; ++i) {
+				sg_set_page(&chunk->page_list[i], 0,
+					    PAGE_SIZE, 0);
+				chunk->page_list[i].dma_address =
+					(pfn_list[i] << PAGE_SHIFT);
+				chunk->page_list[i].dma_length = PAGE_SIZE;
+			}
+			chunk->nmap = chunk->nents;
+			ret -= chunk->nents;
+			off += chunk->nents;
+			list_add_tail(&chunk->list, &umem->chunk_list);
+		}
+
+		ret = 0;
+	}
+
+out:
+	if (ret < 0) {
+		__ib_umem_release(context->device, umem, 0);
+		kfree(umem);
+	} else
+		current->mm->locked_vm = locked;
+	up_write(&current->mm->mmap_sem);
+	free_page((unsigned long) pfn_list);
+
+	return ret < 0 ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_iomem_get);
+
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h
index 9ee0d2e..2c64d82 100644
--- a/include/rdma/ib_umem.h
+++ b/include/rdma/ib_umem.h
@@ -39,8 +39,14 @@
 
 struct ib_ucontext;
 
+enum ib_umem_type {
+	IB_UMEM_MEM_MAP = 0,
+	IB_UMEM_IO_MAP = 1
+};
+
 struct ib_umem {
 	struct ib_ucontext     *context;
+	enum ib_umem_type	type;
 	size_t			length;
 	int			offset;
 	int			page_size;
@@ -61,6 +67,8 @@ struct ib_umem_chunk {
 
 #ifdef CONFIG_INFINIBAND_USER_MEM
 
+struct ib_umem *ib_iomem_get(struct ib_ucontext *context, unsigned long addr,
+			     size_t size, int access, int dmasync);
 struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
 			    size_t size, int access, int dmasync);
 void ib_umem_release(struct ib_umem *umem);
@@ -70,6 +78,12 @@ int ib_umem_page_count(struct ib_umem *umem);
 
 #include <linux/err.h>
 
+static inline struct ib_umem *ib_iomem_get(struct ib_ucontext *context,
+					   unsigned long addr, size_t size,
+					   int access, int dmasync) {
+	return ERR_PTR(-EINVAL);
+}
+
 static inline struct ib_umem *ib_umem_get(struct ib_ucontext *context,
 					  unsigned long addr, size_t size,
 					  int access, int dmasync) {
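
For reviewers who want to see how a consumer might use the new service,
here is a minimal sketch. It is illustrative only and not part of the
patch; build_mr_page_list() and the io_region flag are made-up names.
It shows a driver walking the chunk list of a umem to collect bus
addresses, e.g. to program an MR's page table. Because ib_iomem_get()
fills in dma_address/dma_length directly from the PFNs, the same walk
works for both IB_UMEM_MEM_MAP and IB_UMEM_IO_MAP regions.

#include <rdma/ib_umem.h>

/* Hypothetical helper: copy the DMA addresses of a umem into a
 * caller-supplied array.  Returns the number of entries filled in,
 * or -EINVAL if the array is too small. */
static int build_mr_page_list(struct ib_umem *umem, u64 *pages,
			      int max_pages)
{
	struct ib_umem_chunk *chunk;
	int n = 0;
	int i;

	list_for_each_entry(chunk, &umem->chunk_list, list) {
		for (i = 0; i < chunk->nmap; ++i) {
			if (n >= max_pages)
				return -EINVAL;
			/* For IB_UMEM_MEM_MAP this address comes from
			 * ib_dma_map_sg(); for IB_UMEM_IO_MAP it was
			 * set to pfn << PAGE_SHIFT by ib_iomem_get(). */
			pages[n++] = sg_dma_address(&chunk->page_list[i]);
		}
	}
	return n;
}

A caller registering memory over an IO mapping (io_region true in this
hypothetical snippet) would pick the mapping service by region type and
otherwise treat the two umem flavors identically:

	umem = io_region ?
		ib_iomem_get(context, start, length, access, 0) :
		ib_umem_get(context, start, length, access, 0);
	if (IS_ERR(umem))
		return PTR_ERR(umem);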