diff mbox

inconsistent rados semantics between automake/cmake builds

Message ID CAB2gnbUxzE7z6ec39=cPg2sqNOtMc6cS8_GN9ybxxdvB7zXW+g@mail.gmail.com (mailing list archive)
State New, archived
Headers show

Commit Message

Noah Watkins Jan. 21, 2016, 3:53 p.m. UTC
I'm seeing an anomaly of semantics between Ceph built with Automake
and with CMake. I've not been lucky so far tracking this down so I
thought I'd toss it out here.

Here is the basic test description to reproduce (patch below, and
available at mtime branch: github.com/noahdesu/ceph mtime):

0. create object class method to return object mtime via stat
1. from librados client, create an object with write_full
2. print mtime1 returned from ioctx.stat
3. print mtime2 returned from object class method via exec (step [0])
4. compare mtime1,2: values should be equal (step [3] didn't modify anything)

The first valid question: is assertion [4] true?

In either case, what is weird is that this test works (i.e. both mtime
values are equal) when I build with automake, and does not work when I
build with cmake. The times have a large difference, and the mtime on
the OSD seems to travel back in time w.r.t. the time on the client.
Tests are run on the same machine.

Automake Steps:

./autogen.sh
./configure
make
cd src
vstart.sh -x -n -l
./ceph_test_cls_hello --gtest_filter=ClsHello.ReturnMtime

[ RUN      ] ClsHello.ReturnMtime
from client: 1453391360
   from osd: 1453391360 (EQUAL)

CMake Steps:

mkdir build
cd build
cmake ..
make
./src/vstart.sh -x -n -l
./src/test/test_cls_hello --gtest_filter=ClsHello.ReturnMtime

[ RUN      ] ClsHello.ReturnMtime
from client: 1453528535
   from osd: 1453388425 (OLDER)

-----

--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Yehuda Sadeh Jan. 21, 2016, 4:03 p.m. UTC | #1
On Thu, Jan 21, 2016 at 7:53 AM, Noah Watkins <noahwatkins@gmail.com> wrote:
> I'm seeing an anomaly of semantics between Ceph built with Automake
> and with CMake. I've not been lucky so far tracking this down so I
> thought I'd toss it out here.
>
> Here is the basic test description to reproduce (patch below, and
> available at mtime branch: github.com/noahdesu/ceph mtime):
>
> 0. create object class method to return object mtime via stat
> 1. from librados client, create an object with write_full
> 2. print mtime1 returned from ioctx.stat
> 3. print mtime2 returned from object class method via exec (step [0])
> 4. compare mtime1,2: values should be equal (step [3] didn't modify anything)
>
> The first valid question is assertion [4] true?
>
> In either case, what is weird, is that this test works (ie both mtime
> values are equal) when I build with automake, and does not work when I
> build with cmake. The times have a large difference, and the mtime on
> the OSD seems to travel back in time w.r.t. to the time on the client.
> Tests are the same machine.


This sounds somewhat like an issue Casey and Orit had seen, one that I
wasn't able to replicate (both of them are using cmake, I'm using
automake). I think they tracked it to Adam's c++11 time change.

Yehuda

>
> Automake Steps:
>
> ./autogen.sh
> ./configure
> make
> cd src
> vstart.sh -x -n -l
> ./ceph_test_cls_hello --gtest_filter=ClsHello.ReturnMtime
>
> [ RUN      ] ClsHello.ReturnMtime
> from client: 1453391360
>    from osd: 1453391360 (EQUAL)
>
> CMake Steps:
>
> mkdir build
> cd build
> cmake ..
> make
> ./src/vstart.sh -x -n -l
> ./src/test/test_cls_hello --gtest_filter=ClsHello.ReturnMtime
>
> [ RUN      ] ClsHello.ReturnMtime
> from client: 1453528535
>    from osd: 1453388425 (OLDER)
>
> -----
>
> diff --git a/src/cls/hello/cls_hello.cc b/src/cls/hello/cls_hello.cc
> index 878130f..1c2d00c 100644
> --- a/src/cls/hello/cls_hello.cc
> +++ b/src/cls/hello/cls_hello.cc
> @@ -47,6 +47,7 @@ cls_method_handle_t h_writes_dont_return_data;
>  cls_method_handle_t h_turn_it_to_11;
>  cls_method_handle_t h_bad_reader;
>  cls_method_handle_t h_bad_writer;
> +cls_method_handle_t h_return_mtime;
>
>  /**
>   * say hello - a "read" method that does not depend on the object
> @@ -291,6 +292,22 @@ PGLSFilter *hello_filter()
>    return new PGLSHelloFilter();
>  }
>
> +static int return_mtime(cls_method_context_t hctx, bufferlist *in,
> bufferlist *out)
> +{
> +  uint64_t size;
> +  time_t mtime;
> +
> +  int ret = cls_cxx_stat(hctx, &size, &mtime);
> +  if (ret) {
> +    CLS_ERR("ERROR: return_mtime %d", ret);
> +    return ret;
> +  }
> +
> +  ::encode(mtime, *out);
> +
> +  return 0;
> +}
> +
>
>  /**
>   * initialize class
> @@ -339,6 +356,11 @@ void __cls_init()
>    cls_register_cxx_method(h_class, "bad_writer", CLS_METHOD_RD,
>    bad_writer, &h_bad_writer);
>
> +  // mtime test
> +  cls_register_cxx_method(h_class, "return_mtime",
> +      CLS_METHOD_RD | CLS_METHOD_WR, return_mtime,
> +      &h_return_mtime);
> +
>    // A PGLS filter
>    cls_register_cxx_filter(h_class, "hello", hello_filter);
>  }
> diff --git a/src/test/cls_hello/test_cls_hello.cc
> b/src/test/cls_hello/test_cls_hello.cc
> index efd9fc7..70364d0 100644
> --- a/src/test/cls_hello/test_cls_hello.cc
> +++ b/src/test/cls_hello/test_cls_hello.cc
> @@ -183,3 +183,32 @@ TEST(ClsHello, Filter) {
>    ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
>  }
>
> +TEST(ClsHello, ReturnMtime) {
> +  Rados cluster;
> +  std::string pool_name = get_temp_pool_name();
> +  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
> +  IoCtx ioctx;
> +  cluster.ioctx_create(pool_name.c_str(), ioctx);
> +
> +  char buf[128];
> +  memset(buf, 0xcc, sizeof(buf));
> +  bufferlist obj_content;
> +  obj_content.append(buf, sizeof(buf));
> +
> +  ASSERT_EQ(0, ioctx.write_full("oid", obj_content));
> +
> +  uint64_t size;
> +  time_t mtime;
> +
> +  ASSERT_EQ(0, ioctx.stat("oid", &size, &mtime));
> +  std::cout << "from client: " << mtime << std::endl;
> +
> +  bufferlist in, out;
> +  ASSERT_EQ(0, ioctx.exec("oid", "hello", "return_mtime", in, out));
> +
> +  bufferlist::iterator it = out.begin();
> +  ::decode(mtime, it);
> +  std::cout << "   from osd: " << mtime << std::endl;
> +
> +  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
> +}
> --
> To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Adam C. Emerson Jan. 21, 2016, 5:03 p.m. UTC | #2
On 21/01/2016, Yehuda Sadeh-Weinraub wrote:
[snip]
> This sounds somewhat like an issue Casey and Orit had seen, one that I
> wasn't able to replicate (both of them are using cmake, I'm using
> automake). I think they tracked it to Adam's c++11 time change.
[snip]

I was thinking about this the other night. There doesn't seem to be any problem
with the actual /read/ that I can find. (There'd be failures in stat all over
the place.)

But I recalled that Objecter allows you to specify the mtime for operations and
thought perhaps that might be where the problem is. If we know what operation is
creating/last touching these objects, can someone who's able to reproduce the
problem run a cycle or two of it through valgrind? It might be feeding in a
not-properly-initialized time value, and if this shows up under one build system
but not another that feels like it could be a touch more likely.
Noah Watkins Jan. 21, 2016, 5:13 p.m. UTC | #3
It's easy for me to reproduce. What would you like me to run?

On Thu, Jan 21, 2016 at 9:03 AM, Adam C. Emerson <aemerson@redhat.com> wrote:
> On 21/01/2016, Yehuda Sadeh-Weinraub wrote:
> [snip]
>> This sounds somewhat like an issue Casey and Orit had seen, one that I
>> wasn't able to replicate (both of them are using cmake, I'm using
>> automake). I think they tracked it to Adam's c++11 time change.
> [snip]
>
> I was thinking about this the other night. There doesn't seem to be any problem
> with the actual /read/ that I can find. (There'd be failures in stat all over
> the place.)
>
> But I recalled that Objecter allows you to specify the mtime for operations and
> thought perhaps that might be where the problem is. If we know what operation is
> creating/last touching these objects, can someone who's able to reproduce the
> problem run a cycle or two of it through valgrind? It might be feeding in a
> not-properly-initialized time value, and if this shows up under one build system
> but not another that feels like it could be a touch more likely.
>
> --
> Senior Software Engineer           Red Hat Storage, Ann Arbor, MI, US
> IRC: Aemerson@{RedHat, OFTC, Freenode}
> 0x80F7544B90EDBFB9 E707 86BA 0C1B 62CC 152C  7C12 80F7 544B 90ED BFB9
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Adam C. Emerson Jan. 21, 2016, 5:48 p.m. UTC | #4
On 21/01/2016, Noah Watkins wrote:
> It's easy for me to reproduce. What would you like me to run?

Sorry about that, I had misread the message I responded to and thought
it was still bound up with RadosGW. I'm running your test case, I should
have a patch sometime soon to fix it.
Adam C. Emerson Jan. 22, 2016, 2:49 a.m. UTC | #5
On 21/01/2016, Noah Watkins wrote:
> I'm seeing an anomaly of semantics between Ceph built with Automake
> and with CMake. I've not been lucky so far tracking this down so I
> thought I'd toss it out here.

All right.  Commit 9717714cf62eeb7db979b0e6b509ef305cb757eb in 
github.com/cohortfsllc/cohortfs-ceph should have this. It's the top entry on the
mtime branch in that repository. If you like things web-style,
https://github.com/cohortfsllc/cohortfs-ceph/commit/9717714cf62eeb7db979b0e6b509ef305cb757eb

Casey chased this down to the encode/decode functions and once I took a look at
them I realized the cast was screwy. It now passes your unit test reliably (at
least it worked a few times) under cmake.
Casey Bodley Jan. 29, 2016, 3:55 p.m. UTC | #6
Hi Noah,

We were tracking this bug in rgw for a while, and couldn't find a
simple reproducer. Your test case made it so much easier to fix,
thank you!

A pull request with the fix is pending at:
https://github.com/ceph/ceph/pull/7328

Casey

----- Original Message -----
> On 21/01/2016, Noah Watkins wrote:
> > I'm seeing an anomaly of semantics between Ceph built with Automake
> > and with CMake. I've not been lucky so far tracking this down so I
> > thought I'd toss it out here.
> 
> All right.  Commit 9717714cf62eeb7db979b0e6b509ef305cb757eb in
> github.com/cohortfsllc/cohortfs-ceph should have this. It's the top entry on
> the
> mtime branch in that repository. If you like things web-style,
> https://github.com/cohortfsllc/cohortfs-ceph/commit/9717714cf62eeb7db979b0e6b509ef305cb757eb
> 
> Casey chased this down to the encode/decode functions and once I took a look
> at
> them I realized the cast was screwy. It now passes your unit test reliably
> (at
> least it worked a few times) under cmake.
> 
> --
> Senior Software Engineer           Red Hat Storage, Ann Arbor, MI, US
> IRC: Aemerson@{RedHat, OFTC, Freenode}
> 0x80F7544B90EDBFB9 E707 86BA 0C1B 62CC 152C  7C12 80F7 544B 90ED BFB9
> 
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/src/cls/hello/cls_hello.cc b/src/cls/hello/cls_hello.cc
index 878130f..1c2d00c 100644
--- a/src/cls/hello/cls_hello.cc
+++ b/src/cls/hello/cls_hello.cc
@@ -47,6 +47,7 @@  cls_method_handle_t h_writes_dont_return_data;
 cls_method_handle_t h_turn_it_to_11;
 cls_method_handle_t h_bad_reader;
 cls_method_handle_t h_bad_writer;
+cls_method_handle_t h_return_mtime;

 /**
  * say hello - a "read" method that does not depend on the object
@@ -291,6 +292,22 @@  PGLSFilter *hello_filter()
   return new PGLSHelloFilter();
 }

+static int return_mtime(cls_method_context_t hctx, bufferlist *in,
bufferlist *out)
+{
+  uint64_t size;
+  time_t mtime;
+
+  int ret = cls_cxx_stat(hctx, &size, &mtime);
+  if (ret) {
+    CLS_ERR("ERROR: return_mtime %d", ret);
+    return ret;
+  }
+
+  ::encode(mtime, *out);
+
+  return 0;
+}
+

 /**
  * initialize class
@@ -339,6 +356,11 @@  void __cls_init()
   cls_register_cxx_method(h_class, "bad_writer", CLS_METHOD_RD,
   bad_writer, &h_bad_writer);

+  // mtime test
+  cls_register_cxx_method(h_class, "return_mtime",
+      CLS_METHOD_RD | CLS_METHOD_WR, return_mtime,
+      &h_return_mtime);
+
   // A PGLS filter
   cls_register_cxx_filter(h_class, "hello", hello_filter);
 }
diff --git a/src/test/cls_hello/test_cls_hello.cc
b/src/test/cls_hello/test_cls_hello.cc
index efd9fc7..70364d0 100644
--- a/src/test/cls_hello/test_cls_hello.cc
+++ b/src/test/cls_hello/test_cls_hello.cc
@@ -183,3 +183,32 @@  TEST(ClsHello, Filter) {
   ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
 }

+TEST(ClsHello, ReturnMtime) {
+  Rados cluster;
+  std::string pool_name = get_temp_pool_name();
+  ASSERT_EQ("", create_one_pool_pp(pool_name, cluster));
+  IoCtx ioctx;
+  cluster.ioctx_create(pool_name.c_str(), ioctx);
+
+  char buf[128];
+  memset(buf, 0xcc, sizeof(buf));
+  bufferlist obj_content;
+  obj_content.append(buf, sizeof(buf));
+
+  ASSERT_EQ(0, ioctx.write_full("oid", obj_content));
+
+  uint64_t size;
+  time_t mtime;
+
+  ASSERT_EQ(0, ioctx.stat("oid", &size, &mtime));
+  std::cout << "from client: " << mtime << std::endl;
+
+  bufferlist in, out;
+  ASSERT_EQ(0, ioctx.exec("oid", "hello", "return_mtime", in, out));
+
+  bufferlist::iterator it = out.begin();
+  ::decode(mtime, it);
+  std::cout << "   from osd: " << mtime << std::endl;
+
+  ASSERT_EQ(0, destroy_one_pool_pp(pool_name, cluster));
+}