diff mbox

[1/3] buffer: add an aligned buffer with less alignment than a page

Message ID 1410796508-28711-1-git-send-email-j@jannau.net (mailing list archive)
State New, archived
Headers show

Commit Message

Janne Grunau Sept. 15, 2014, 3:55 p.m. UTC
SIMD optimized erasure code computation needs aligned memory. Buffers
aligned to a page boundary are wasted on it though. The buffers used
for the erasure code computation are typical smaller than a page.

Currently an alignement of 16 bytes is enough for the used
SIMD instruction sets (SSE4 and NEON).

Signed-off-by: Janne Grunau <j@jannau.net>
---
 configure.ac         |   9 +++++
 src/common/buffer.cc | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++
 src/include/buffer.h |  10 ++++++
 3 files changed, 119 insertions(+)

Comments

Loic Dachary Sept. 15, 2014, 4:46 p.m. UTC | #1
Look great !

Running on git builder under the branch wip-9408-buffer-alignment at http://ceph.com/gitbuilder.cgi

On 15/09/2014 17:55, Janne Grunau wrote:
> SIMD optimized erasure code computation needs aligned memory. Buffers
> aligned to a page boundary are wasted on it though. The buffers used
> for the erasure code computation are typical smaller than a page.
> 
> Currently an alignement of 16 bytes is enough for the used
> SIMD instruction sets (SSE4 and NEON).
> 
> Signed-off-by: Janne Grunau <j@jannau.net>
> ---
>  configure.ac         |   9 +++++
>  src/common/buffer.cc | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  src/include/buffer.h |  10 ++++++
>  3 files changed, 119 insertions(+)
> 
> diff --git a/configure.ac b/configure.ac
> index cccf2d9..1bb27c4 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -793,6 +793,15 @@ AC_MSG_RESULT([no])
>  ])
>  
>  #
> +# Check for functions to provide aligned memory
> +#
> +AC_CHECK_HEADERS([malloc.h])
> +AC_CHECK_FUNCS([posix_memalign _aligned_malloc memalign aligned_malloc],
> +               [found_memalign=yes; break])
> +
> +AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
> +
> +#
>  # Check for pthread spinlock (depends on ACX_PTHREAD)
>  #
>  saved_LIBS="$LIBS"
> diff --git a/src/common/buffer.cc b/src/common/buffer.cc
> index b141759..acc221f 100644
> --- a/src/common/buffer.cc
> +++ b/src/common/buffer.cc
> @@ -30,6 +30,10 @@
>  #include <sys/uio.h>
>  #include <limits.h>
>  
> +#ifdef HAVE_MALLOC_H
> +#include <malloc.h>
> +#endif
> +
>  namespace ceph {
>  
>  #ifdef BUFFER_DEBUG
> @@ -155,9 +159,15 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      virtual int zero_copy_to_fd(int fd, loff_t *offset) {
>        return -ENOTSUP;
>      }
> +    virtual bool is_aligned() {
> +      return ((long)data & ~CEPH_ALIGN_MASK) == 0;
> +    }
>      virtual bool is_page_aligned() {
>        return ((long)data & ~CEPH_PAGE_MASK) == 0;
>      }
> +    bool is_n_align_sized() {
> +      return (len & ~CEPH_ALIGN_MASK) == 0;
> +    }
>      bool is_n_page_sized() {
>        return (len & ~CEPH_PAGE_MASK) == 0;
>      }
> @@ -209,6 +219,41 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      }
>    };
>  
> +  class buffer::raw_aligned : public buffer::raw {
> +  public:
> +    raw_aligned(unsigned l) : raw(l) {
> +      if (len) {
> +#if HAVE_POSIX_MEMALIGN
> +        if (posix_memalign((void **) &data, CEPH_ALIGN, len))
> +          data = 0;
> +#elif HAVE__ALIGNED_MALLOC
> +        data = _aligned_malloc(len, CEPH_ALIGN);
> +#elif HAVE_MEMALIGN
> +        data = memalign(CEPH_ALIGN, len);
> +#elif HAVE_ALIGNED_MALLOC
> +        data = aligned_malloc((len + CEPH_ALIGN - 1) & ~CEPH_ALIGN_MASK,
> +                              CEPH_ALIGN);
> +#else
> +        data = malloc(len);
> +#endif
> +        if (!data)
> +          throw bad_alloc();
> +      } else {
> +        data = 0;
> +      }
> +      inc_total_alloc(len);
> +      bdout << "raw_aligned " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
> +    }
> +    ~raw_aligned() {
> +      free(data);
> +      dec_total_alloc(len);
> +      bdout << "raw_aligned " << this << " free " << (void *)data << " " << buffer::get_total_alloc() << bendl;
> +    }
> +    raw* clone_empty() {
> +      return new raw_aligned(len);
> +    }
> +  };
> +
>  #ifndef __CYGWIN__
>    class buffer::raw_mmap_pages : public buffer::raw {
>    public:
> @@ -334,6 +379,10 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>        return true;
>      }
>  
> +    bool is_aligned() {
> +      return false;
> +    }
> +
>      bool is_page_aligned() {
>        return false;
>      }
> @@ -520,6 +569,9 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>    buffer::raw* buffer::create_static(unsigned len, char *buf) {
>      return new raw_static(buf, len);
>    }
> +  buffer::raw* buffer::create_aligned(unsigned len) {
> +    return new raw_aligned(len);
> +  }
>    buffer::raw* buffer::create_page_aligned(unsigned len) {
>  #ifndef __CYGWIN__
>      //return new raw_mmap_pages(len);
> @@ -1013,6 +1065,16 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      return true;
>    }
>  
> +  bool buffer::list::is_aligned() const
> +  {
> +    for (std::list<ptr>::const_iterator it = _buffers.begin();
> +         it != _buffers.end();
> +         ++it)
> +      if (!it->is_aligned())
> +        return false;
> +    return true;
> +  }
> +
>    bool buffer::list::is_page_aligned() const
>    {
>      for (std::list<ptr>::const_iterator it = _buffers.begin();
> @@ -1101,6 +1163,44 @@ static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
>      _buffers.push_back(nb);
>    }
>  
> +void buffer::list::rebuild_aligned()
> +{
> +  std::list<ptr>::iterator p = _buffers.begin();
> +  while (p != _buffers.end()) {
> +    // keep anything that's already page sized+aligned
> +    if (p->is_aligned() && p->is_n_align_sized()) {
> +      /*cout << " segment " << (void*)p->c_str()
> +             << " offset " << ((unsigned long)p->c_str() & ~CEPH_ALIGN_MASK)
> +             << " length " << p->length()
> +             << " " << (p->length() & ~CEPH_ALIGN_MASK) << " ok" << std::endl;
> +      */
> +      ++p;
> +      continue;
> +    }
> +
> +    // consolidate unaligned items, until we get something that is sized+aligned
> +    list unaligned;
> +    unsigned offset = 0;
> +    do {
> +      /*cout << " segment " << (void*)p->c_str()
> +             << " offset " << ((unsigned long)p->c_str() & ~CEPH_ALIGN_MASK)
> +             << " length " << p->length() << " " << (p->length() & ~CEPH_ALIGN_MASK)
> +             << " overall offset " << offset << " " << (offset & ~CEPH_ALIGN_MASK)
> +             << " not ok" << std::endl;
> +      */
> +      offset += p->length();
> +      unaligned.push_back(*p);
> +      _buffers.erase(p++);
> +    } while (p != _buffers.end() &&
> +	     (!p->is_aligned() ||
> +	      !p->is_n_align_sized() ||
> +	      (offset & ~CEPH_ALIGN_MASK)));
> +    ptr nb(buffer::create_aligned(unaligned._len));
> +    unaligned.rebuild(nb);
> +    _buffers.insert(p, unaligned._buffers.front());
> +  }
> +}
> +
>  void buffer::list::rebuild_page_aligned()
>  {
>    std::list<ptr>::iterator p = _buffers.begin();
> diff --git a/src/include/buffer.h b/src/include/buffer.h
> index e5c1b50..ea362e7 100644
> --- a/src/include/buffer.h
> +++ b/src/include/buffer.h
> @@ -56,6 +56,9 @@
>  # include <assert.h>
>  #endif
>  
> +#define CEPH_ALIGN 16
> +#define CEPH_ALIGN_MASK (~(CEPH_ALIGN - 1LLU))
> +
>  namespace ceph {
>  
>  class buffer {
> @@ -124,6 +127,7 @@ private:
>     */
>    class raw;
>    class raw_malloc;
> +  class raw_aligned;
>    class raw_static;
>    class raw_mmap_pages;
>    class raw_posix_aligned;
> @@ -144,6 +148,7 @@ public:
>    static raw* create_malloc(unsigned len);
>    static raw* claim_malloc(unsigned len, char *buf);
>    static raw* create_static(unsigned len, char *buf);
> +  static raw* create_aligned(unsigned len);
>    static raw* create_page_aligned(unsigned len);
>    static raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
>  
> @@ -177,7 +182,9 @@ public:
>      bool at_buffer_head() const { return _off == 0; }
>      bool at_buffer_tail() const;
>  
> +    bool is_aligned() const { return ((long)c_str() & ~CEPH_ALIGN_MASK) == 0; }
>      bool is_page_aligned() const { return ((long)c_str() & ~CEPH_PAGE_MASK) == 0; }
> +    bool is_n_align_sized() const { return (length() & ~CEPH_ALIGN_MASK) == 0; }
>      bool is_n_page_sized() const { return (length() & ~CEPH_PAGE_MASK) == 0; }
>  
>      // accessors
> @@ -344,7 +351,9 @@ public:
>      bool contents_equal(buffer::list& other);
>  
>      bool can_zero_copy() const;
> +    bool is_aligned() const;
>      bool is_page_aligned() const;
> +    bool is_n_align_sized() const;
>      bool is_n_page_sized() const;
>  
>      bool is_zero() const;
> @@ -382,6 +391,7 @@ public:
>      bool is_contiguous();
>      void rebuild();
>      void rebuild(ptr& nb);
> +    void rebuild_aligned();
>      void rebuild_page_aligned();
>  
>      // sort-of-like-assignment-op
>
Janne Grunau Sept. 18, 2014, 10:33 a.m. UTC | #2
Hi,

following a is an updated patchset. It passes now make check in src

It has following changes:
 * use 32-byte alignment since the isa plugin use AVX2
   (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
   but I can't see a reason why it would need more than 32-bytes
 * ErasureCode::encode_prepare() handles more than one chunk with padding

cheers

Janne
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Joachim Peters Sept. 18, 2014, 12:18 p.m. UTC | #3
Hi Janne, 
=> (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers

I should update the README since it is misleading ... it should say 8*k or 16*k byte aligned chunk size depending on the compiler/platform used, it is not the alignment of the allocated buffer addresses.The get_alignment in the plug-in function is used to compute the chunk size for the encoding (as I said not the start address alignment). 

If you pass k buffers for decoding each buffer should be aligned at least to 16 or as you pointed out better 32 bytes. 

For encoding there is normally a single buffer split 'virtually' into k pieces. To make all pieces starting at an aligned address one needs to align the chunk size to e.g. 16*k. For the best possible performance on all platforms we should change the get_alignment function in the ISA plug-in to return 32*k if there are no other objections ?!?!
 
Cheers Andreas.
Andreas Joachim Peters Sept. 18, 2014, 12:34 p.m. UTC | #4
Hi Janne/Loic, 
there is more confusion atleast on my side ...

I had now a look at the jerasure plug-in and I am now slightly confused why you have two ways to return in get_alignment ... one is as I assume and another one is "per_chunk_alignment" ... what should the function return Loic?

Cheers Andreas.
Janne Grunau Sept. 18, 2014, 12:40 p.m. UTC | #5
Hi,

On 2014-09-18 12:18:59 +0000, Andreas Joachim Peters wrote:
> 
> => (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
> 
> I should update the README since it is misleading ... it should say 
> 8*k or 16*k byte aligned chunk size depending on the compiler/platform 
> used, it is not the alignment of the allocated buffer addresses.The 
> get_alignment in the plug-in function is used to compute the chunk 
> size for the encoding (as I said not the start address alignment). 

I've seen that

> If you pass k buffers for decoding each buffer should be aligned at 
> least to 16 or as you pointed out better 32 bytes. 

ok, that makes sense

> For encoding there is normally a single buffer split 'virtually' into 
> k pieces. To make all pieces starting at an aligned address one needs 
> to align the chunk size to e.g. 16*k.

I don't get that. How is the buffer splitted? into k (+ m) chunk size 
parts? As long as the start and the length are both 16 (or 32) byte 
aligned all parts are properly aligned too. I don't see where the k 
comes into play.

cheers

Janne
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Janne Grunau Sept. 18, 2014, 12:53 p.m. UTC | #6
Hi,

On 2014-09-18 12:34:49 +0000, Andreas Joachim Peters wrote:
>
> there is more confusion atleast on my side ...
> 
> I had now a look at the jerasure plug-in and I am now slightly 
> confused why you have two ways to return in get_alignment ... one is 
> as I assume and another one is "per_chunk_alignment" ... what should 
> the function return Loic?

the per_chunk_alignment is just a bool which says that each chunk has to
start at an aligned address.

get_alignement() seems to be used to align the chunk size.

It might come from gf-complete' strange alignment requirements. Instead 
of requiring aligned buffers it requires that src and dst buffer have 
the same remainder when divided by 16. The best way to archieve that is 
to align the length to 16 and use a single buffer.

I agree it's convoluted.

Janne
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Joachim Peters Sept. 18, 2014, 1:01 p.m. UTC | #7
Hi Janne, 

>> For encoding there is normally a single buffer split 'virtually' into
>> k pieces. To make all pieces starting at an aligned address one needs
>> to align the chunk size to e.g. 16*k.

>I don't get that. How is the buffer splitted? into k (+ m) chunk size
>parts? As long as the start and the length are both 16 (or 32) byte
>aligned all parts are properly aligned too. I don't see where the k
>comes into play.

The original data block to encode has to be split into k equally long pieces. Each piece is given as one of the k input buffers to the erasure code algorithm producing m output buffers and each piece has to have an aligned starting address and length.

If you deal with 128 byte data input buffers for k=4 it splits like

offset=00 len=32 as chunk1
offset=32 len=32 as chunk2
offset=64 len=32 as chunk3
offset=96 len=32 as chunk4

If the desired IO size would be 196 bytes the 32 byte alignment requirement blows this buffer up to 256 bytes:

offset=00 len=64 as chunk1
offset=64 len=64 as chunk2
offset=128 len=64 as chunk3
offset=196 len=64 as chunk4

For the typical 4kb only k=2,4,8,16,32,64,128 do not increase the buffer. If someone configures e.g. k=10 the buffer is increased from 4096 to 4160 bytes and it creates 1.5% storage volume overhead.

Cheers Andreas.
Janne Grunau Sept. 18, 2014, 1:23 p.m. UTC | #8
On 2014-09-18 13:01:03 +0000, Andreas Joachim Peters wrote:
> 
> >> For encoding there is normally a single buffer split 'virtually' 
> >> into
> >> k pieces. To make all pieces starting at an aligned address one 
> >> needs
> >> to align the chunk size to e.g. 16*k.
> 
> >I don't get that. How is the buffer splitted? into k (+ m) chunk size
> >parts? As long as the start and the length are both 16 (or 32) byte
> >aligned all parts are properly aligned too. I don't see where the k
> >comes into play.
> 
> The original data block to encode has to be split into k equally long 
> pieces. Each piece is given as one of the k input buffers to the 
> erasure code algorithm producing m output buffers and each piece has 
> to have an aligned starting address and length.
> 
> If you deal with 128 byte data input buffers for k=4 it splits like
> 
> offset=00 len=32 as chunk1
> offset=32 len=32 as chunk2
> offset=64 len=32 as chunk3
> offset=96 len=32 as chunk4
> 
> If the desired IO size would be 196 bytes the 32 byte alignment 
> requirement blows this buffer up to 256 bytes:
> 
> offset=00 len=64 as chunk1
> offset=64 len=64 as chunk2
> offset=128 len=64 as chunk3
> offset=196 len=64 as chunk4

I fail to see how the 32 * k is related to alignment. It's only used for 
to pad the total size so it becomes a mulitple of k * 32. That is ok 
since we want k 32-byte aligned chunks. The alignment for each chunk is 
just 32-bytes.

Janne
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andreas Joachim Peters Sept. 18, 2014, 2:47 p.m. UTC | #9
> I fail to see how the 32 * k is related to alignment. It's only used for
> to pad the total size so it becomes a mulitple of k * 32. That is ok
> since we want k 32-byte aligned chunks. The alignment for each chunk is
> just 32-bytes.

Yes, agreed! The alignment for each chunk should be 32 bytes. 

And the implementation is most efficient if the given encoding buffer is already padded to k*32 bytes, it avoids an additional buffer allocation and copy.

Cheers Andreas.--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Loic Dachary Sept. 19, 2014, 9:18 a.m. UTC | #10
Hi Andreas,

The per_chunk_alignment addresses a backward compatible change in the way they are calculated. The problem was that the initial calculation lead to oversized chunks. The long explanation is at https://github.com/ceph/ceph/commit/c7daaaf5e63d0bd1d444385e62611fe276f6ce29

Please let me know if you see something wrong :-)

Cheers

On 18/09/2014 14:34, Andreas Joachim Peters wrote:
> Hi Janne/Loic, 
> there is more confusion atleast on my side ...
> 
> I had now a look at the jerasure plug-in and I am now slightly confused why you have two ways to return in get_alignment ... one is as I assume and another one is "per_chunk_alignment" ... what should the function return Loic?
> 
> Cheers Andreas.
> ________________________________________
> From: ceph-devel-owner@vger.kernel.org [ceph-devel-owner@vger.kernel.org] on behalf of Andreas Joachim Peters [Andreas.Joachim.Peters@cern.ch]
> Sent: 18 September 2014 14:18
> To: Janne Grunau; ceph-devel@vger.kernel.org
> Subject: RE: v2 aligned buffer changes for erasure codes
> 
> Hi Janne,
> => (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
> 
> I should update the README since it is misleading ... it should say 8*k or 16*k byte aligned chunk size depending on the compiler/platform used, it is not the alignment of the allocated buffer addresses.The get_alignment in the plug-in function is used to compute the chunk size for the encoding (as I said not the start address alignment).
> 
> If you pass k buffers for decoding each buffer should be aligned at least to 16 or as you pointed out better 32 bytes.
> 
> For encoding there is normally a single buffer split 'virtually' into k pieces. To make all pieces starting at an aligned address one needs to align the chunk size to e.g. 16*k. For the best possible performance on all platforms we should change the get_alignment function in the ISA plug-in to return 32*k if there are no other objections ?!?!
> 
> Cheers Andreas.
> ________________________________________
> From: ceph-devel-owner@vger.kernel.org [ceph-devel-owner@vger.kernel.org] on behalf of Janne Grunau [j@jannau.net]
> Sent: 18 September 2014 12:33
> To: ceph-devel@vger.kernel.org
> Subject: v2 aligned buffer changes for erasure codes
> 
> Hi,
> 
> following a is an updated patchset. It passes now make check in src
> 
> It has following changes:
>  * use 32-byte alignment since the isa plugin use AVX2
>    (src/erasure-code/isa/README claims it needs 16*k byte aligned buffers
>    but I can't see a reason why it would need more than 32-bytes
>  * ErasureCode::encode_prepare() handles more than one chunk with padding
> 
> cheers
> 
> Janne
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Janne Grunau Sept. 29, 2014, 12:34 p.m. UTC | #11
Hi,

reworked patchset to address the comments in https://github.com/ceph/ceph/pull/2558

variable alignment instead of the hardcoded 32-byte alignment
fixed copy and paste error in a comment
change buffer alignment for decoding too (much simpler than the encoding
changes)

I'll do a github pull request once the last make check run finishes
locally.

Also available from http://git.jannau.net/ceph.git/log/?h=buffer_align

regards

Janne

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Loic Dachary Sept. 29, 2014, 1:15 p.m. UTC | #12
Hi,

On 29/09/2014 14:34, Janne Grunau wrote:
> Hi,
> 
> reworked patchset to address the comments in https://github.com/ceph/ceph/pull/2558
> 
> variable alignment instead of the hardcoded 32-byte alignment
> fixed copy and paste error in a comment
> change buffer alignment for decoding too (much simpler than the encoding
> changes)
> 
> I'll do a github pull request once the last make check run finishes
> locally.

I was too quick it seems : https://github.com/ceph/ceph/pull/2595

Looks great !

Cheers

> 
> Also available from http://git.jannau.net/ceph.git/log/?h=buffer_align
> 
> regards
> 
> Janne
> 
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Milosz Tanski Sept. 29, 2014, 3:18 p.m. UTC | #13
Janne, I have a couple questions and forgive me if I missed something
from the past.

Did you consider using std::aligned_storage? That way you don't have
to worry as much about portability yourself. Also, there is a
boost::aligned_storage as well if Ceph can't build against C++11
goodness.

A second more general Ceph question is somewhat off-topic. What about
C++11 use in the Ceph code base (like in this case)? It's not
explicitly prohibited by the coding style document, but I imagine the
goal is to build on as many systems as possible and quite a few
supported distros have pretty old versions of GCC. I'm asking this
because I imagine some of the performance work that's about to happen
will want to use things like lockless queues, and then you get into
C++11 memory model and std::atomic... etc.

- Milosz

On Mon, Sep 29, 2014 at 8:34 AM, Janne Grunau <j@jannau.net> wrote:
> Hi,
>
> reworked patchset to address the comments in https://github.com/ceph/ceph/pull/2558
>
> variable alignment instead of the hardcoded 32-byte alignment
> fixed copy and paste error in a comment
> change buffer alignment for decoding too (much simpler than the encoding
> changes)
>
> I'll do a github pull request once the last make check run finishes
> locally.
>
> Also available from http://git.jannau.net/ceph.git/log/?h=buffer_align
>
> regards
>
> Janne
>
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
Sage Weil Sept. 29, 2014, 3:24 p.m. UTC | #14
On Mon, 29 Sep 2014, Milosz Tanski wrote:
> A second more general Ceph question is somewhat off-topic. What about
> C++11 use in the Ceph code base (like in this case)? It's not
> explicitly prohibited by the coding style document, but I imagine the
> goal is to build on as many systems as possible and quite a few
> supported distros have pretty old versions of GCC. I'm asking this
> because I imagine some of the performance work that's about to happen
> will want to use things like lockless queues, and then you get into
> C++11 memory model and std::atomic... etc.

We are all very eager to move to C++11.  The challenge is that we still 
need to build packages for the target distros.  That doesn't necessarily 
mean that the compilers on those distros need to support c++11 as long as 
the runtime does... if we can make the build enviroment sane.

I'm really curious what other projects do here...

sage
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Milosz Tanski Sept. 29, 2014, 3:44 p.m. UTC | #15
On Mon, Sep 29, 2014 at 11:24 AM, Sage Weil <sweil@redhat.com> wrote:
> On Mon, 29 Sep 2014, Milosz Tanski wrote:
>> A second more general Ceph question is somewhat off-topic. What about
>> C++11 use in the Ceph code base (like in this case)? It's not
>> explicitly prohibited by the coding style document, but I imagine the
>> goal is to build on as many systems as possible and quite a few
>> supported distros have pretty old versions of GCC. I'm asking this
>> because I imagine some of the performance work that's about to happen
>> will want to use things like lockless queues, and then you get into
>> C++11 memory model and std::atomic... etc.
>
> We are all very eager to move to C++11.  The challenge is that we still
> need to build packages for the target distros.  That doesn't necessarily
> mean that the compilers on those distros need to support c++11 as long as
> the runtime does... if we can make the build enviroment sane.
>
> I'm really curious what other projects do here...
>
> sage

I know that Ubuntu (12.04) and RHEL (6) provide updated compilers. I
think you can get 4.8.x in ubuntu and 4.7 in RHEL6. One problem you're
going to into is that the system (eg. old) version boost will be build
whatever is the default compiler (4.6 for ubuntu, 4.4 for RHEL6)
without C++11 features. Your new runtime vector becomes very large.
Wido den Hollander Sept. 29, 2014, 5:56 p.m. UTC | #16
On 29-09-14 17:24, Sage Weil wrote:
> On Mon, 29 Sep 2014, Milosz Tanski wrote:
>> A second more general Ceph question is somewhat off-topic. What about
>> C++11 use in the Ceph code base (like in this case)? It's not
>> explicitly prohibited by the coding style document, but I imagine the
>> goal is to build on as many systems as possible and quite a few
>> supported distros have pretty old versions of GCC. I'm asking this
>> because I imagine some of the performance work that's about to happen
>> will want to use things like lockless queues, and then you get into
>> C++11 memory model and std::atomic... etc.
> 
> We are all very eager to move to C++11.  The challenge is that we still 
> need to build packages for the target distros.  That doesn't necessarily 
> mean that the compilers on those distros need to support c++11 as long as 
> the runtime does... if we can make the build enviroment sane.
> 
> I'm really curious what other projects do here...
> 

At the CloudStack project we recently switched from Java 6 to Java 7 and
we said that from version X we required Java 7 on the system.

For Ceph, what keeps us from saying that version H (after Giant)
requires a C++11 compliant compiler?

That might rule out Ubuntu 12.04 and CentOS6/RHEL6, but does that really
matter that much for something 'new' like Ceph?

> sage
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Janne Grunau Oct. 2, 2014, 12:15 p.m. UTC | #17
Hi,

On 2014-09-29 11:18:03 -0400, Milosz Tanski wrote:
> Janne, I have a couple questions and forgive me if I missed something
> from the past.
> 
> Did you consider using std::aligned_storage? That way you don't have
> to worry as much about portability yourself. Also, there is a
> boost::aligned_storage as well if Ceph can't build against C++11
> goodness.

no. I'm more familiar with C than C++ so those are not in my active 
C/C++ vocabulary.

Janne
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/configure.ac b/configure.ac
index cccf2d9..1bb27c4 100644
--- a/configure.ac
+++ b/configure.ac
@@ -793,6 +793,15 @@  AC_MSG_RESULT([no])
 ])
 
 #
+# Check for functions to provide aligned memory
+#
+AC_CHECK_HEADERS([malloc.h])
+AC_CHECK_FUNCS([posix_memalign _aligned_malloc memalign aligned_malloc],
+               [found_memalign=yes; break])
+
+AS_IF([test "x$found_memalign" != "xyes"], [AC_MSG_WARN([No function for aligned memory allocation found])])
+
+#
 # Check for pthread spinlock (depends on ACX_PTHREAD)
 #
 saved_LIBS="$LIBS"
diff --git a/src/common/buffer.cc b/src/common/buffer.cc
index b141759..acc221f 100644
--- a/src/common/buffer.cc
+++ b/src/common/buffer.cc
@@ -30,6 +30,10 @@ 
 #include <sys/uio.h>
 #include <limits.h>
 
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
 namespace ceph {
 
 #ifdef BUFFER_DEBUG
@@ -155,9 +159,15 @@  static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     virtual int zero_copy_to_fd(int fd, loff_t *offset) {
       return -ENOTSUP;
     }
+    virtual bool is_aligned() {
+      return ((long)data & ~CEPH_ALIGN_MASK) == 0;
+    }
     virtual bool is_page_aligned() {
       return ((long)data & ~CEPH_PAGE_MASK) == 0;
     }
+    bool is_n_align_sized() {
+      return (len & ~CEPH_ALIGN_MASK) == 0;
+    }
     bool is_n_page_sized() {
       return (len & ~CEPH_PAGE_MASK) == 0;
     }
@@ -209,6 +219,41 @@  static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     }
   };
 
+  class buffer::raw_aligned : public buffer::raw {
+  public:
+    raw_aligned(unsigned l) : raw(l) {
+      if (len) {
+#if HAVE_POSIX_MEMALIGN
+        if (posix_memalign((void **) &data, CEPH_ALIGN, len))
+          data = 0;
+#elif HAVE__ALIGNED_MALLOC
+        data = _aligned_malloc(len, CEPH_ALIGN);
+#elif HAVE_MEMALIGN
+        data = memalign(CEPH_ALIGN, len);
+#elif HAVE_ALIGNED_MALLOC
+        data = aligned_malloc((len + CEPH_ALIGN - 1) & ~CEPH_ALIGN_MASK,
+                              CEPH_ALIGN);
+#else
+        data = malloc(len);
+#endif
+        if (!data)
+          throw bad_alloc();
+      } else {
+        data = 0;
+      }
+      inc_total_alloc(len);
+      bdout << "raw_aligned " << this << " alloc " << (void *)data << " " << l << " " << buffer::get_total_alloc() << bendl;
+    }
+    ~raw_aligned() {
+      free(data);
+      dec_total_alloc(len);
+      bdout << "raw_aligned " << this << " free " << (void *)data << " " << buffer::get_total_alloc() << bendl;
+    }
+    raw* clone_empty() {
+      return new raw_aligned(len);
+    }
+  };
+
 #ifndef __CYGWIN__
   class buffer::raw_mmap_pages : public buffer::raw {
   public:
@@ -334,6 +379,10 @@  static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
       return true;
     }
 
+    bool is_aligned() {
+      return false;
+    }
+
     bool is_page_aligned() {
       return false;
     }
@@ -520,6 +569,9 @@  static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
   buffer::raw* buffer::create_static(unsigned len, char *buf) {
     return new raw_static(buf, len);
   }
+  buffer::raw* buffer::create_aligned(unsigned len) {
+    return new raw_aligned(len);
+  }
   buffer::raw* buffer::create_page_aligned(unsigned len) {
 #ifndef __CYGWIN__
     //return new raw_mmap_pages(len);
@@ -1013,6 +1065,16 @@  static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     return true;
   }
 
+  bool buffer::list::is_aligned() const
+  {
+    for (std::list<ptr>::const_iterator it = _buffers.begin();
+         it != _buffers.end();
+         ++it)
+      if (!it->is_aligned())
+        return false;
+    return true;
+  }
+
   bool buffer::list::is_page_aligned() const
   {
     for (std::list<ptr>::const_iterator it = _buffers.begin();
@@ -1101,6 +1163,44 @@  static simple_spinlock_t buffer_debug_lock = SIMPLE_SPINLOCK_INITIALIZER;
     _buffers.push_back(nb);
   }
 
+void buffer::list::rebuild_aligned()
+{
+  std::list<ptr>::iterator p = _buffers.begin();
+  while (p != _buffers.end()) {
+    // keep anything that's already page sized+aligned
+    if (p->is_aligned() && p->is_n_align_sized()) {
+      /*cout << " segment " << (void*)p->c_str()
+             << " offset " << ((unsigned long)p->c_str() & ~CEPH_ALIGN_MASK)
+             << " length " << p->length()
+             << " " << (p->length() & ~CEPH_ALIGN_MASK) << " ok" << std::endl;
+      */
+      ++p;
+      continue;
+    }
+
+    // consolidate unaligned items, until we get something that is sized+aligned
+    list unaligned;
+    unsigned offset = 0;
+    do {
+      /*cout << " segment " << (void*)p->c_str()
+             << " offset " << ((unsigned long)p->c_str() & ~CEPH_ALIGN_MASK)
+             << " length " << p->length() << " " << (p->length() & ~CEPH_ALIGN_MASK)
+             << " overall offset " << offset << " " << (offset & ~CEPH_ALIGN_MASK)
+             << " not ok" << std::endl;
+      */
+      offset += p->length();
+      unaligned.push_back(*p);
+      _buffers.erase(p++);
+    } while (p != _buffers.end() &&
+	     (!p->is_aligned() ||
+	      !p->is_n_align_sized() ||
+	      (offset & ~CEPH_ALIGN_MASK)));
+    ptr nb(buffer::create_aligned(unaligned._len));
+    unaligned.rebuild(nb);
+    _buffers.insert(p, unaligned._buffers.front());
+  }
+}
+
 void buffer::list::rebuild_page_aligned()
 {
   std::list<ptr>::iterator p = _buffers.begin();
diff --git a/src/include/buffer.h b/src/include/buffer.h
index e5c1b50..ea362e7 100644
--- a/src/include/buffer.h
+++ b/src/include/buffer.h
@@ -56,6 +56,9 @@ 
 # include <assert.h>
 #endif
 
+#define CEPH_ALIGN 16
+#define CEPH_ALIGN_MASK (~(CEPH_ALIGN - 1LLU))
+
 namespace ceph {
 
 class buffer {
@@ -124,6 +127,7 @@  private:
    */
   class raw;
   class raw_malloc;
+  class raw_aligned;
   class raw_static;
   class raw_mmap_pages;
   class raw_posix_aligned;
@@ -144,6 +148,7 @@  public:
   static raw* create_malloc(unsigned len);
   static raw* claim_malloc(unsigned len, char *buf);
   static raw* create_static(unsigned len, char *buf);
+  static raw* create_aligned(unsigned len);
   static raw* create_page_aligned(unsigned len);
   static raw* create_zero_copy(unsigned len, int fd, int64_t *offset);
 
@@ -177,7 +182,9 @@  public:
     bool at_buffer_head() const { return _off == 0; }
     bool at_buffer_tail() const;
 
+    bool is_aligned() const { return ((long)c_str() & ~CEPH_ALIGN_MASK) == 0; }
     bool is_page_aligned() const { return ((long)c_str() & ~CEPH_PAGE_MASK) == 0; }
+    bool is_n_align_sized() const { return (length() & ~CEPH_ALIGN_MASK) == 0; }
     bool is_n_page_sized() const { return (length() & ~CEPH_PAGE_MASK) == 0; }
 
     // accessors
@@ -344,7 +351,9 @@  public:
     bool contents_equal(buffer::list& other);
 
     bool can_zero_copy() const;
+    bool is_aligned() const;
     bool is_page_aligned() const;
+    bool is_n_align_sized() const;
     bool is_n_page_sized() const;
 
     bool is_zero() const;
@@ -382,6 +391,7 @@  public:
     bool is_contiguous();
     void rebuild();
     void rebuild(ptr& nb);
+    void rebuild_aligned();
     void rebuild_page_aligned();
 
     // sort-of-like-assignment-op