From patchwork Mon Jul 19 11:34:29 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alexander Schmidt X-Patchwork-Id: 112643 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.4/8.14.3) with ESMTP id o6JBYZMi022579 for ; Mon, 19 Jul 2010 11:34:48 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1760576Ab0GSLee (ORCPT ); Mon, 19 Jul 2010 07:34:34 -0400 Received: from mtagate2.de.ibm.com ([195.212.17.162]:47134 "EHLO mtagate2.de.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1760572Ab0GSLed (ORCPT ); Mon, 19 Jul 2010 07:34:33 -0400 Received: from d12nrmr1607.megacenter.de.ibm.com (d12nrmr1607.megacenter.de.ibm.com [9.149.167.49]) by mtagate2.de.ibm.com (8.13.1/8.13.1) with ESMTP id o6JBYWU4023569 for ; Mon, 19 Jul 2010 11:34:32 GMT Received: from d12av03.megacenter.de.ibm.com (d12av03.megacenter.de.ibm.com [9.149.165.213]) by d12nrmr1607.megacenter.de.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id o6JBYVZ71830946 for ; Mon, 19 Jul 2010 13:34:31 +0200 Received: from d12av03.megacenter.de.ibm.com (loopback [127.0.0.1]) by d12av03.megacenter.de.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id o6JBYVnP027850 for ; Mon, 19 Jul 2010 13:34:31 +0200 Received: from alex-laptop (dyn-9-152-241-62.boeblingen.de.ibm.com [9.152.241.62]) by d12av03.megacenter.de.ibm.com (8.12.11.20060308/8.12.11) with ESMTP id o6JBYUUI027835; Mon, 19 Jul 2010 13:34:30 +0200 Date: Mon, 19 Jul 2010 13:34:29 +0200 From: Alexander Schmidt To: Linux RDMA , Roland Dreier Cc: Christoph Raisch , Stefan Roscher , Alex Vainman , of-ewg Subject: [PATCH v4] libibverbs: ibv_fork_init() and huge pages Message-ID: <20100719133429.60aeb3c9@alex-laptop> X-Mailer: Claws Mail 3.6.1 (GTK+ 2.16.1; i486-pc-linux-gnu) Mime-Version: 1.0 Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk List-ID: X-Mailing-List: linux-rdma@vger.kernel.org X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter.kernel.org [140.211.167.41]); Mon, 19 Jul 2010 11:34:49 +0000 (UTC) --- libibverbs.git.orig/src/memory.c +++ libibverbs.git/src/memory.c @@ -40,6 +40,10 @@ #include #include #include +#include +#include +#include +#include #include "ibverbs.h" @@ -68,12 +72,71 @@ struct ibv_mem_node { static struct ibv_mem_node *mm_root; static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER; static int page_size; +static int huge_page_enabled; static int too_late; +static unsigned long smaps_page_size(FILE *file) +{ + int n; + unsigned long size = page_size; + char buf[1024]; + + while (fgets(buf, sizeof(buf), file) != NULL) { + if (!strstr(buf, "KernelPageSize:")) + continue; + + n = sscanf(buf, "%*s %lu", &size); + if (n < 1) + continue; + + /* page size is printed in Kb */ + size = size * 1024; + + break; + } + + return size; +} + +static unsigned long get_page_size(void *base) +{ + unsigned long ret = page_size; + pid_t pid; + FILE *file; + char buf[1024]; + + pid = getpid(); + snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid); + + file = fopen(buf, "r"); + if (!file) + goto out; + + while (fgets(buf, sizeof(buf), file) != NULL) { + int n; + uintptr_t range_start, range_end; + + n = sscanf(buf, "%lx-%lx", &range_start, &range_end); + + if (n < 2) + continue; + + if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) { + ret = smaps_page_size(file); + break; + } + } + fclose(file); + +out: + return ret; +} + int ibv_fork_init(void) { - void *tmp; + void *tmp, *tmp_aligned; int ret; + unsigned long size; if (mm_root) return 0; @@ -88,8 +151,21 @@ int ibv_fork_init(void) if (posix_memalign(&tmp, page_size, page_size)) return ENOMEM; - ret = madvise(tmp, page_size, MADV_DONTFORK) || - madvise(tmp, page_size, MADV_DOFORK); + if (getenv("RDMAV_HUGEPAGES_SAFE")) + huge_page_enabled = 1; + else + huge_page_enabled = 0; + + if (huge_page_enabled) { + size = get_page_size(tmp); + tmp_aligned = (void *)((uintptr_t)tmp & ~(size - 1)); + } else { + size = page_size; + tmp_aligned = tmp; + } + + ret = madvise(tmp_aligned, size, MADV_DONTFORK) || + madvise(tmp_aligned, size, MADV_DOFORK); free(tmp); @@ -529,13 +605,19 @@ static int ibv_madvise_range(void *base, int inc; int rolling_back = 0; int ret = 0; + unsigned long range_page_size; if (!size) return 0; - start = (uintptr_t) base & ~(page_size - 1); - end = ((uintptr_t) (base + size + page_size - 1) & - ~(page_size - 1)) - 1; + if (huge_page_enabled) + range_page_size = get_page_size(base); + else + range_page_size = page_size; + + start = (uintptr_t) base & ~(range_page_size - 1); + end = ((uintptr_t) (base + size + range_page_size - 1) & + ~(range_page_size - 1)) - 1; pthread_mutex_lock(&mm_mutex); again: