From patchwork Mon May 31 09:13:59 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alexander Schmidt X-Patchwork-Id: 103237 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.3/8.14.3) with ESMTP id o4V9DoKa016019 for ; Mon, 31 May 2010 09:14:08 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1756063Ab0EaJOH (ORCPT ); Mon, 31 May 2010 05:14:07 -0400 Received: from mtagate5.uk.ibm.com ([194.196.100.165]:56824 "EHLO mtagate5.uk.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1756071Ab0EaJOH (ORCPT ); Mon, 31 May 2010 05:14:07 -0400 Received: from d06nrmr1407.portsmouth.uk.ibm.com (d06nrmr1407.portsmouth.uk.ibm.com [9.149.38.185]) by mtagate5.uk.ibm.com (8.13.1/8.13.1) with ESMTP id o4V9E3cM030531 for ; Mon, 31 May 2010 09:14:03 GMT Received: from d06av04.portsmouth.uk.ibm.com (d06av04.portsmouth.uk.ibm.com [9.149.37.216]) by d06nrmr1407.portsmouth.uk.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id o4V9E34Y688170 for ; Mon, 31 May 2010 10:14:03 +0100 Received: from d06av04.portsmouth.uk.ibm.com (loopback [127.0.0.1]) by d06av04.portsmouth.uk.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id o4V9E2So026254 for ; Mon, 31 May 2010 10:14:03 +0100 Received: from alex-laptop (dyn-9-152-241-49.boeblingen.de.ibm.com [9.152.241.49]) by d06av04.portsmouth.uk.ibm.com (8.12.11.20060308/8.12.11) with ESMTP id o4V9E2BX026162; Mon, 31 May 2010 10:14:02 +0100 Date: Mon, 31 May 2010 11:13:59 +0200 From: Alexander Schmidt To: Roland Dreier Cc: Stefan Roscher , Christoph Raisch , of-ewg , Linux RDMA , Alex Vainman Subject: [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs Message-ID: <20100531111359.4c0696ab@alex-laptop> X-Mailer: Claws Mail 3.6.1 (GTK+ 2.16.1; i486-pc-linux-gnu) Mime-Version: 1.0 Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org X-Greylist: 
IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter.kernel.org [140.211.167.41]); Mon, 31 May 2010 09:14:08 +0000 (UTC)

--- libibverbs-1.1.2.orig/src/memory.c
+++ libibverbs-1.1.2/src/memory.c
@@ -40,6 +40,10 @@
 #include
 #include
 #include
+#include
+#include
+#include
+#include
 
 #include "ibverbs.h"
 
@@ -68,12 +72,133 @@ struct ibv_mem_node {
 static struct ibv_mem_node *mm_root;
 static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
 static int page_size;
+static int huge_page_enabled;
 static int too_late;
 
+/*
+ * Scan /sys/kernel/mm/hugepages/ and return 1 if any hugepages-* pool
+ * reports a non-zero nr_hugepages, 0 otherwise (including when the
+ * directory cannot be opened).
+ */
+static int is_huge_page_enabled(void)
+{
+	int n, ret = 0;
+	char *bufp;
+	DIR *dir;
+	struct dirent *entry;
+	FILE *file;
+	unsigned long nr_hugepages;
+	char buf[1024];
+
+	dir = opendir("/sys/kernel/mm/hugepages/");
+	if (!dir)
+		return 0;
+
+	while ((entry = readdir(dir))) {
+		if (strncmp(entry->d_name, "hugepages-", 10))
+			continue;
+
+		snprintf(buf, sizeof(buf), "/sys/kernel/mm/hugepages/%s/nr_hugepages",
+			 entry->d_name);
+
+		file = fopen(buf, "r");
+		if (!file)
+			continue;
+
+		bufp = fgets(buf, sizeof(buf), file);
+		fclose(file);
+		if (!bufp)
+			continue;
+
+		n = sscanf(buf, "%lu", &nr_hugepages);
+		if (n < 1)
+			continue;
+
+		if (nr_hugepages) {
+			ret = 1;
+			goto out;
+		}
+	}
+
+out:
+	closedir(dir);
+
+	return ret;
+}
+
+/*
+ * Read forward in an already-open smaps stream until a "KernelPageSize:"
+ * line is parsed and return that size in bytes.  Falls back to the
+ * base page_size if no such line is found before EOF.
+ */
+static unsigned long smaps_page_size(FILE *file)
+{
+	int n;
+	unsigned long size = page_size;
+	char buf[1024];
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		if (!strstr(buf, "KernelPageSize:"))
+			continue;
+
+		n = sscanf(buf, "%*s %lu", &size);
+		if (n < 1)
+			continue;
+
+		/* page size is printed in Kb */
+		size = size * 1024;
+
+		break;
+	}
+
+	return size;
+}
+
+/*
+ * Return the kernel page size (in bytes) backing the mapping that
+ * contains base, looked up in /proc/<pid>/smaps.  Returns the base
+ * page_size if smaps cannot be opened or no mapping matches.
+ */
+static unsigned long get_page_size(void *base)
+{
+	unsigned long ret = page_size;
+	pid_t pid;
+	FILE *file;
+	char buf[1024];
+
+	pid = getpid();
+	snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid);
+
+	file = fopen(buf, "r");
+	if (!file)
+		goto out;
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		int n;
+		/* unsigned long, to match the %lx conversions below */
+		unsigned long range_start, range_end;
+
+		n = sscanf(buf, "%lx-%lx", &range_start, &range_end);
+
+		if (n < 2)
+			continue;
+
+		if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) {
+			ret = smaps_page_size(file);
+			break;
+		}
+	}
+	fclose(file);
+
+out:
+	return ret;
+}
+
 int ibv_fork_init(void)
 {
-	void *tmp;
+	void *tmp, *tmp_aligned;
 	int ret;
+	unsigned long size;
 
 	if (mm_root)
 		return 0;
@@ -88,8 +213,19 @@ int ibv_fork_init(void)
 	if (posix_memalign(&tmp, page_size, page_size))
 		return ENOMEM;
 
-	ret = madvise(tmp, page_size, MADV_DONTFORK) ||
-	      madvise(tmp, page_size, MADV_DOFORK);
+	huge_page_enabled = is_huge_page_enabled();
+
+	if (huge_page_enabled) {
+		/* probe one whole page of the size backing tmp's mapping */
+		size = get_page_size(tmp);
+		tmp_aligned = (void *)((uintptr_t)tmp & ~(size - 1));
+	} else {
+		size = page_size;
+		tmp_aligned = tmp;
+	}
+
+	ret = madvise(tmp_aligned, size, MADV_DONTFORK) ||
+	      madvise(tmp_aligned, size, MADV_DOFORK);
 
 	free(tmp);
 
@@ -452,15 +588,22 @@ static int ibv_madvise_range(void *base,
 	struct ibv_mem_node *node, *tmp;
 	int inc;
 	int ret = 0;
+	unsigned long range_page_size;
 
 	if (!size)
 		return 0;
 
 	inc = advice == MADV_DONTFORK ? 1 : -1;
 
-	start = (uintptr_t) base & ~(page_size - 1);
-	end   = ((uintptr_t) (base + size + page_size - 1) &
-		 ~(page_size - 1)) - 1;
+	/* align to the page size of the mapping that contains base */
+	if (huge_page_enabled)
+		range_page_size = get_page_size(base);
+	else
+		range_page_size = page_size;
+
+	start = (uintptr_t) base & ~(range_page_size - 1);
+	end   = ((uintptr_t) (base + size + range_page_size - 1) &
+		 ~(range_page_size - 1)) - 1;
 
 	pthread_mutex_lock(&mm_mutex);
 