From patchwork Fri May 7 10:19:36 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alexander Schmidt X-Patchwork-Id: 97714 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.3/8.14.3) with ESMTP id o47AHOot022521 for ; Fri, 7 May 2010 10:19:42 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1752798Ab0EGKTl (ORCPT ); Fri, 7 May 2010 06:19:41 -0400 Received: from mtagate7.de.ibm.com ([195.212.17.167]:38759 "EHLO mtagate7.de.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751421Ab0EGKTk (ORCPT ); Fri, 7 May 2010 06:19:40 -0400 Received: from d12nrmr1607.megacenter.de.ibm.com (d12nrmr1607.megacenter.de.ibm.com [9.149.167.49]) by mtagate7.de.ibm.com (8.13.1/8.13.1) with ESMTP id o47AJdEA021731 for ; Fri, 7 May 2010 10:19:39 GMT Received: from d12av01.megacenter.de.ibm.com (d12av01.megacenter.de.ibm.com [9.149.165.212]) by d12nrmr1607.megacenter.de.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id o47AJdcW1564854 for ; Fri, 7 May 2010 12:19:39 +0200 Received: from d12av01.megacenter.de.ibm.com (loopback [127.0.0.1]) by d12av01.megacenter.de.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id o47AJclm026239 for ; Fri, 7 May 2010 12:19:38 +0200 Received: from alex-laptop (dyn-9-152-241-49.boeblingen.de.ibm.com [9.152.241.49]) by d12av01.megacenter.de.ibm.com (8.12.11.20060308/8.12.11) with ESMTP id o47AJcj6026236; Fri, 7 May 2010 12:19:38 +0200 Date: Fri, 7 May 2010 12:19:36 +0200 From: Alexander Schmidt To: Roland Dreier Cc: of-ewg , Linux RDMA , Hoang-Nam Nguyen , Stefan Roscher , Joachim Fenkes , Christoph Raisch , Alex Vainman Subject: Re: [RFC] libibverbs: ibv_fork_init() and libhugetlbfs Message-ID: <20100507121936.283a18c6@alex-laptop> In-Reply-To: References: <20100506093949.55916ab0@alex-laptop> X-Mailer: Claws Mail 3.6.1 (GTK+ 2.16.1; i486-pc-linux-gnu) Mime-Version: 1.0 Sender: 
linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter.kernel.org [140.211.167.41]); Fri, 07 May 2010 10:20:50 +0000 (UTC)

--- libibverbs-1.1.2.orig/src/memory.c
+++ libibverbs-1.1.2/src/memory.c
@@ -40,6 +40,9 @@
 #include
 #include
 #include
+#include
+#include
+#include <limits.h>
 
 #include "ibverbs.h"
 
@@ -68,12 +71,56 @@ struct ibv_mem_node {
 static struct ibv_mem_node *mm_root;
 static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
 static int page_size;
+static int huge_page_size;
 static int too_late;
 
+/*
+ * Return the system huge page size in bytes, taken from the
+ * "Hugepagesize:" line of /proc/meminfo.  Returns -1 if the file cannot
+ * be opened, the line is missing, or the size does not fit in an int.
+ */
+static int get_huge_page_size(void)
+{
+	int ret = -1;
+	FILE *file;
+	const char *path = "/proc/meminfo";
+	char buf[1024], type[128];
+
+	file = fopen(path, "r");
+	if (!file)
+		goto out;
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		int n;
+		unsigned long size;
+
+		/* type is an array: pass it directly, not its address */
+		n = sscanf(buf, "%127s %lu %*s", type, &size);
+
+		if (n < 2)
+			continue;
+
+		if (!strcmp(type, "Hugepagesize:")) {
+			/*
+			 * The size is printed in kB.  Keep ret at -1 rather
+			 * than truncating when the byte count overflows an
+			 * int (e.g. 16 GB huge pages).
+			 */
+			if (size <= (unsigned long) INT_MAX / 1024)
+				ret = size * 1024;
+			break;
+		}
+	}
+	fclose(file);
+
+out:
+	return ret;
+}
+
 int ibv_fork_init(void)
 {
 	void *tmp;
-	int ret;
+	int ret, size;
 
 	if (mm_root)
 		return 0;
@@ -85,11 +132,18 @@ int ibv_fork_init(void)
 	if (page_size < 0)
 		return errno;
 
-	if (posix_memalign(&tmp, page_size, page_size))
+	huge_page_size = get_huge_page_size();
+
+	if (huge_page_size > page_size)
+		size = huge_page_size;
+	else
+		size = page_size;
+
+	if (posix_memalign(&tmp, size, size))
 		return ENOMEM;
 
-	ret = madvise(tmp, page_size, MADV_DONTFORK) ||
-	      madvise(tmp, page_size, MADV_DOFORK);
+	ret = madvise(tmp, size, MADV_DONTFORK) ||
+	      madvise(tmp, size, MADV_DOFORK);
 
 	free(tmp);
 
@@ -446,11 +500,58 @@ static struct ibv_mem_node *__mm_find_st
 	return node;
 }
 
+/*
+ * Return 1 if base lies inside a mapping listed in /proc/<pid>/maps whose
+ * path name contains "libhugetlbfs", 0 otherwise (including when the maps
+ * file cannot be opened).
+ */
+static int is_huge_page(void *base)
+{
+	int ret = 0;
+	pid_t pid;
+	FILE *file;
+	char buf[1024], lib[128];
+
+	pid = getpid();
+	snprintf(buf, sizeof(buf), "/proc/%d/maps", (int) pid);
+
+	file = fopen(buf, "r");
+	if (!file)
+		goto out;
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		int n;
+		char *substr;
+		/* unsigned long to match the %lx conversions below */
+		unsigned long range_start, range_end;
+
+		n = sscanf(buf, "%lx-%lx %*s %*x %*s %*u %127s",
+			   &range_start, &range_end, lib);
+
+		/* mappings without a path name yield fewer than 3 fields */
+		if (n < 3)
+			continue;
+
+		substr = strstr(lib, "libhugetlbfs");
+		if (substr) {
+			if ((uintptr_t) base >= range_start &&
+			    (uintptr_t) base < range_end) {
+				ret = 1;
+				break;
+			}
+		}
+	}
+	fclose(file);
+
+out:
+	return ret;
+}
+
 static int ibv_madvise_range(void *base, size_t size, int advice)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
-	int inc;
+	int inc, range_page_size;
 	int ret = 0;
 
 	if (!size)
@@ -458,9 +559,14 @@ static int ibv_madvise_range(void *base,
 
 	inc = advice == MADV_DONTFORK ? 1 : -1;
 
-	start = (uintptr_t) base & ~(page_size - 1);
-	end = ((uintptr_t) (base + size + page_size - 1) &
-	       ~(page_size - 1)) - 1;
+	if (huge_page_size > page_size && is_huge_page(base))
+		range_page_size = huge_page_size;
+	else
+		range_page_size = page_size;
+
+	start = (uintptr_t) base & ~(range_page_size - 1);
+	end = ((uintptr_t) (base + size + range_page_size - 1) &
+	       ~(range_page_size - 1)) - 1;
 
 	pthread_mutex_lock(&mm_mutex);
 