From patchwork Wed Jun 9 09:47:50 2010 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit X-Patchwork-Submitter: Alexander Schmidt X-Patchwork-Id: 105088 Received: from vger.kernel.org (vger.kernel.org [209.132.180.67]) by demeter.kernel.org (8.14.3/8.14.3) with ESMTP id o599ltHE005524 for ; Wed, 9 Jun 2010 09:47:55 GMT Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1755012Ab0FIJry (ORCPT ); Wed, 9 Jun 2010 05:47:54 -0400 Received: from mtagate6.de.ibm.com ([195.212.17.166]:51903 "EHLO mtagate6.de.ibm.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1754882Ab0FIJry (ORCPT ); Wed, 9 Jun 2010 05:47:54 -0400 Received: from d12nrmr1607.megacenter.de.ibm.com (d12nrmr1607.megacenter.de.ibm.com [9.149.167.49]) by mtagate6.de.ibm.com (8.13.1/8.13.1) with ESMTP id o599lqH2030668 for ; Wed, 9 Jun 2010 09:47:52 GMT Received: from d12av02.megacenter.de.ibm.com (d12av02.megacenter.de.ibm.com [9.149.165.228]) by d12nrmr1607.megacenter.de.ibm.com (8.13.8/8.13.8/NCO v10.0) with ESMTP id o599lq2P995442 for ; Wed, 9 Jun 2010 11:47:52 +0200 Received: from d12av02.megacenter.de.ibm.com (loopback [127.0.0.1]) by d12av02.megacenter.de.ibm.com (8.12.11.20060308/8.13.3) with ESMTP id o599lpxp025710 for ; Wed, 9 Jun 2010 11:47:52 +0200 Received: from alex-laptop (dyn-9-152-241-49.boeblingen.de.ibm.com [9.152.241.49]) by d12av02.megacenter.de.ibm.com (8.12.11.20060308/8.12.11) with ESMTP id o599lp0u025707; Wed, 9 Jun 2010 11:47:51 +0200 Date: Wed, 9 Jun 2010 11:47:50 +0200 From: Alexander Schmidt To: Roland Dreier Cc: Stefan Roscher , Christoph Raisch , of-ewg , Linux RDMA , Alex Vainman Subject: Re: [PATCH v2] libibverbs: ibv_fork_init() and libhugetlbfs Message-ID: <20100609114750.0798c664@alex-laptop> In-Reply-To: References: <20100531111359.4c0696ab@alex-laptop> X-Mailer: Claws Mail 3.6.1 (GTK+ 2.16.1; i486-pc-linux-gnu) Mime-Version: 1.0 Sender: linux-rdma-owner@vger.kernel.org Precedence: bulk 
List-ID:
X-Mailing-List: linux-rdma@vger.kernel.org
X-Greylist: IP, sender and recipient auto-whitelisted, not delayed by milter-greylist-4.2.3 (demeter.kernel.org [140.211.167.41]); Wed, 09 Jun 2010 09:47:55 +0000 (UTC)

--- libibverbs.git.orig/src/memory.c
+++ libibverbs.git/src/memory.c
@@ -40,6 +40,8 @@
 #include
 #include
 #include
+#include <stdio.h>
+#include <string.h>
 
 #include "ibverbs.h"
 
@@ -70,10 +72,75 @@ static pthread_mutex_t mm_mutex = PTHREA
 static int page_size;
 static int too_late;
 
+/*
+ * Scan forward in an open smaps stream for the next "KernelPageSize:"
+ * line and return its value converted from kB to bytes.  Returns 0 if
+ * no parsable line is found before end of file.
+ */
+static unsigned long smaps_page_size(FILE *file)
+{
+	int n;
+	unsigned long size = 0;
+	char buf[1024];
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		if (!strstr(buf, "KernelPageSize:"))
+			continue;
+
+		n = sscanf(buf, "%*s %lu", &size);
+		if (n < 1)
+			continue;
+
+		/* page size is printed in kB */
+		size = size * 1024;
+
+		break;
+	}
+
+	return size;
+}
+
+/*
+ * Return the page size in bytes of the mapping that contains base, as
+ * reported by /proc/self/smaps, or 0 if the file cannot be opened or no
+ * mapping containing base is found.
+ */
+static unsigned long get_page_size(void *base)
+{
+	unsigned long ret = 0;
+	FILE *file;
+	char buf[1024];
+
+	file = fopen("/proc/self/smaps", "r");
+	if (!file)
+		goto out;
+
+	while (fgets(buf, sizeof(buf), file) != NULL) {
+		int n;
+		/* %lx stores through unsigned long *, so do not use uintptr_t */
+		unsigned long range_start, range_end;
+
+		n = sscanf(buf, "%lx-%lx", &range_start, &range_end);
+
+		if (n < 2)
+			continue;
+
+		if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) {
+			ret = smaps_page_size(file);
+			break;
+		}
+	}
+	fclose(file);
+
+out:
+	return ret;
+}
+
 int ibv_fork_init(void)
 {
-	void *tmp;
+	void *tmp, *tmp_aligned;
 	int ret;
+	unsigned long size;
 
 	if (mm_root)
 		return 0;
@@ -88,8 +155,22 @@ int ibv_fork_init(void)
 	if (posix_memalign(&tmp, page_size, page_size))
 		return ENOMEM;
 
-	ret = madvise(tmp, page_size, MADV_DONTFORK) ||
-	      madvise(tmp, page_size, MADV_DOFORK);
+	size = get_page_size(tmp);
+
+	/*
+	 * If a page size was found for the mapping holding tmp, round tmp
+	 * down to that size and madvise one such page; otherwise fall back
+	 * to the default page_size and the unmodified pointer.
+	 */
+	if (size)
+		tmp_aligned = (void *)((uintptr_t)tmp & ~(size - 1));
+	else {
+		size = page_size;
+		tmp_aligned = tmp;
+	}
+
+	ret = madvise(tmp_aligned, size, MADV_DONTFORK) ||
+	      madvise(tmp_aligned, size, MADV_DOFORK);
 
 	free(tmp);
 
@@ -522,7 +603,8 @@ static struct ibv_mem_node *undo_node(st
 	return node;
 }
 
-static int ibv_madvise_range(void *base, size_t size, int advice)
+static int ibv_madvise_range(void *base, size_t size, int advice,
+			     unsigned long page_size)
 {
 	uintptr_t start, end;
 	struct ibv_mem_node *node, *tmp;
@@ -612,10 +694,33 @@ out:
 	return ret;
 }
 
+/*
+ * Apply advice to the range using the default page size; if that fails
+ * with EINVAL, look up the page size of the mapping containing base and
+ * retry once with it.  Returns the result of the last attempt made.
+ */
+static int ibv_fork_range(void *base, size_t size, int advice)
+{
+	int ret;
+	unsigned long range_page_size;
+
+	ret = ibv_madvise_range(base, size, advice, page_size);
+
+	if (ret == -1 && errno == EINVAL) {
+		range_page_size = get_page_size(base);
+
+		if (range_page_size)
+			ret = ibv_madvise_range(base, size, advice,
+						range_page_size);
+	}
+
+	return ret;
+}
+
 int ibv_dontfork_range(void *base, size_t size)
 {
 	if (mm_root)
-		return ibv_madvise_range(base, size, MADV_DONTFORK);
+		return ibv_fork_range(base, size, MADV_DONTFORK);
 	else {
 		too_late = 1;
 		return 0;
@@ -625,7 +730,7 @@ int ibv_dontfork_range(void *base, size_
 int ibv_dofork_range(void *base, size_t size)
 {
 	if (mm_root)
-		return ibv_madvise_range(base, size, MADV_DOFORK);
+		return ibv_fork_range(base, size, MADV_DOFORK);
 	else {
 		too_late = 1;
 		return 0;