Orangefs ABI documentation
diff mbox

Message ID CAOg9mSS11uSvfBkNYPeNBw8xMebvHAM3vzh82BH=W273-7oNyg@mail.gmail.com
State New
Headers show

Commit Message

Mike Marshall Feb. 18, 2016, 6:58 p.m. UTC
Still busted, exactly the same, I think. The doomed op gets a good
return code from is_daemon_in_service in service_operation but
gets EAGAIN from wait_for_matching_downcall... an edge case kind of
problem.

Here's the raw (well, slightly edited for readability) logs showing
the doomed op and subsequent failed op that uses the bogus handle
and fsid from the doomed op.



Alloced OP (ffff880012898000: 10889 OP_CREATE)
service_operation: orangefs_create op:ffff880012898000:



wait_for_matching_downcall: operation purged (tag 10889, ffff880012898000, att 0
service_operation: wait_for_matching_downcall returned -11 for ffff880012898000
Interrupted: Removed op ffff880012898000 from htable_ops_in_progress
tag 10889 (orangefs_create) -- operation to be retried (1 attempt)
service_operation: orangefs_create op:ffff880012898000:
service_operation:client core is NOT in service, ffff880012898000



service_operation: wait_for_matching_downcall returned 0 for ffff880012898000
service_operation orangefs_create returning: 0 for ffff880012898000
orangefs_create: PPTOOLS1.PPA:
handle:00000000-0000-0000-0000-000000000000: fsid:0:
new_op:ffff880012898000: ret:0:



Alloced OP (ffff880012888000: 10958 OP_GETATTR)
service_operation: orangefs_inode_getattr op:ffff880012888000:
service_operation: wait_for_matching_downcall returned 0 for ffff880012888000
service_operation orangefs_inode_getattr returning: -22 for ffff880012888000
Releasing OP (ffff880012888000: 10958
orangefs_create: Failed to allocate inode for file :PPTOOLS1.PPA:
Releasing OP (ffff880012898000: 10889




What I'm testing with differs from what is at kernel.org#for-next by
  - diffs from Al's most recent email
  - 1 souped up gossip message
  - changed 0 to OP_VFS_STATE_UNKNOWN one place in service_operation
  - reinit_completion(&op->waitq) in orangefs_clean_up_interrupted_operation



      "Interrupted: Removed op %p from request_list\n",
@@ -225,24 +231,18 @@ static void
orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s
  /* op must be removed from the in progress htable */
  spin_unlock(&op->lock);
  spin_lock(&htable_ops_in_progress_lock);
- list_del(&op->list);
+ list_del_init(&op->list);
  spin_unlock(&htable_ops_in_progress_lock);
  gossip_debug(GOSSIP_WAIT_DEBUG,
      "Interrupted: Removed op %p"
      " from htable_ops_in_progress\n",
      op);
- } else if (!op_state_serviced(op)) {
+ } else {
  spin_unlock(&op->lock);
  gossip_err("interrupted operation is in a weird state 0x%x\n",
    op->op_state);
- } else {
- /*
- * It is not intended for execution to flow here,
- * but having this unlock here makes sparse happy.
- */
- gossip_err("%s: can't get here.\n", __func__);
- spin_unlock(&op->lock);
  }
+ reinit_completion(&op->waitq);
 }

 /*

On Thu, Feb 18, 2016 at 6:11 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Feb 18, 2016 at 12:04:39AM +0000, Al Viro wrote:
>> Looks like the right approach is to have orangefs_clean_... hitting the
>> sucker being copied to/from daemon to wait until that's finished (and
>> discarded).  That, BTW, would have an extra benefit of making life simpler
>> for refcounting.
>>
>> So...  We need to have them marked as "being copied" for the duration, instead
>> of bumping the refcount.  That setting and dropping that flag should happen
>> under op->lock.  Setting it should happen only if it's not given up (that would
>> be interpreted as "not found").  Cleaning, OTOH, would recheck the "given up"
>> and do complete(&op->waitq) in case it's been given up...
>>
>> How about this (instead of the previous variant, includes a fix for
>> errno bogosity spotted a bit upthread; if it works, it'll need a bit of
>> splitup)
>
> Better yet, let's use list_del_init() on op->list instead of those list_del().
> Then, seeing that ..._clean_interrupted_... can't be called in case of
> serviced (we hadn't dropped op->lock since the time we'd checked it), we
> can use list_empty(&op->list) as a test for "given up while copying to/from
> daemon", so there's no need for separate flag that way:
>         * we never pick given up op from list/hash
>         * daemon read/write_iter never modifies op->list after op has
> been given up
>         * if op is given up while copying to/from userland in daemon
> read/write_iter, it will call complete(&op->waitq) once it finds that,
> so giveup side can wait for completion if it finds op it's about to give
> up not on any list.
>
> Should be equivalent to the previous variant, but IMO it's cleaner that
> way...
>
> diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
> index b27ed1c..f7914f5 100644
> --- a/fs/orangefs/devorangefs-req.c
> +++ b/fs/orangefs/devorangefs-req.c
> @@ -58,9 +58,9 @@ static struct orangefs_kernel_op_s *orangefs_devreq_remove_op(__u64 tag)
>                                  next,
>                                  &htable_ops_in_progress[index],
>                                  list) {
> -               if (op->tag == tag && !op_state_purged(op)) {
> +               if (op->tag == tag && !op_state_purged(op) &&
> +                   !op_state_given_up(op)) {
>                         list_del_init(&op->list);
> -                       get_op(op); /* increase ref count. */
>                         spin_unlock(&htable_ops_in_progress_lock);
>                         return op;
>                 }
> @@ -133,7 +133,7 @@ restart:
>                 __s32 fsid;
>                 /* This lock is held past the end of the loop when we break. */
>                 spin_lock(&op->lock);
> -               if (unlikely(op_state_purged(op))) {
> +               if (unlikely(op_state_purged(op) || op_state_given_up(op))) {
>                         spin_unlock(&op->lock);
>                         continue;
>                 }
> @@ -199,13 +199,12 @@ restart:
>          */
>         if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
>                 gossip_err("orangefs: ERROR: Current op already queued.\n");
> -               list_del(&cur_op->list);
> +               list_del_init(&cur_op->list);
>                 spin_unlock(&cur_op->lock);
>                 spin_unlock(&orangefs_request_list_lock);
>                 return -EAGAIN;
>         }
>         list_del_init(&cur_op->list);
> -       get_op(op);
>         spin_unlock(&orangefs_request_list_lock);
>
>         spin_unlock(&cur_op->lock);
> @@ -230,7 +229,7 @@ restart:
>         if (unlikely(op_state_given_up(cur_op))) {
>                 spin_unlock(&cur_op->lock);
>                 spin_unlock(&htable_ops_in_progress_lock);
> -               op_release(cur_op);
> +               complete(&cur_op->waitq);
>                 goto restart;
>         }
>
> @@ -242,7 +241,6 @@ restart:
>         orangefs_devreq_add_op(cur_op);
>         spin_unlock(&cur_op->lock);
>         spin_unlock(&htable_ops_in_progress_lock);
> -       op_release(cur_op);
>
>         /* The client only asks to read one size buffer. */
>         return MAX_DEV_REQ_UPSIZE;
> @@ -258,10 +256,12 @@ error:
>         if (likely(!op_state_given_up(cur_op))) {
>                 set_op_state_waiting(cur_op);
>                 list_add(&cur_op->list, &orangefs_request_list);
> +               spin_unlock(&cur_op->lock);
> +       } else {
> +               spin_unlock(&cur_op->lock);
> +               complete(&cur_op->waitq);
>         }
> -       spin_unlock(&cur_op->lock);
>         spin_unlock(&orangefs_request_list_lock);
> -       op_release(cur_op);
>         return -EFAULT;
>  }
>
> @@ -333,8 +333,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
>         n = copy_from_iter(&op->downcall, downcall_size, iter);
>         if (n != downcall_size) {
>                 gossip_err("%s: failed to copy downcall.\n", __func__);
> -               ret = -EFAULT;
> -               goto Broken;
> +               goto Efault;
>         }
>
>         if (op->downcall.status)
> @@ -354,8 +353,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
>                            downcall_size,
>                            op->downcall.trailer_size,
>                            total);
> -               ret = -EFAULT;
> -               goto Broken;
> +               goto Efault;
>         }
>
>         /* Only READDIR operations should have trailers. */
> @@ -364,8 +362,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
>                 gossip_err("%s: %x operation with trailer.",
>                            __func__,
>                            op->downcall.type);
> -               ret = -EFAULT;
> -               goto Broken;
> +               goto Efault;
>         }
>
>         /* READDIR operations should always have trailers. */
> @@ -374,8 +371,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
>                 gossip_err("%s: %x operation with no trailer.",
>                            __func__,
>                            op->downcall.type);
> -               ret = -EFAULT;
> -               goto Broken;
> +               goto Efault;
>         }
>
>         if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
> @@ -386,8 +382,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
>         if (op->downcall.trailer_buf == NULL) {
>                 gossip_err("%s: failed trailer vmalloc.\n",
>                            __func__);
> -               ret = -ENOMEM;
> -               goto Broken;
> +               goto Enomem;
>         }
>         memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
>         n = copy_from_iter(op->downcall.trailer_buf,
> @@ -396,8 +391,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb,
>         if (n != op->downcall.trailer_size) {
>                 gossip_err("%s: failed to copy trailer.\n", __func__);
>                 vfree(op->downcall.trailer_buf);
> -               ret = -EFAULT;
> -               goto Broken;
> +               goto Efault;
>         }
>
>  wakeup:
> @@ -406,38 +400,27 @@ wakeup:
>          * that this op is done
>          */
>         spin_lock(&op->lock);
> -       if (unlikely(op_state_given_up(op))) {
> +       if (unlikely(op_is_cancel(op))) {
>                 spin_unlock(&op->lock);
> -               goto out;
> -       }
> -       set_op_state_serviced(op);
> -       spin_unlock(&op->lock);
> -
> -       /*
> -        * If this operation is an I/O operation we need to wait
> -        * for all data to be copied before we can return to avoid
> -        * buffer corruption and races that can pull the buffers
> -        * out from under us.
> -        *
> -        * Essentially we're synchronizing with other parts of the
> -        * vfs implicitly by not allowing the user space
> -        * application reading/writing this device to return until
> -        * the buffers are done being used.
> -        */
> -out:
> -       if (unlikely(op_is_cancel(op)))
>                 put_cancel(op);
> -       op_release(op);
> -       return ret;
> -
> -Broken:
> -       spin_lock(&op->lock);
> -       if (!op_state_given_up(op)) {
> -               op->downcall.status = ret;
> +       } else if (unlikely(op_state_given_up(op))) {
> +               spin_unlock(&op->lock);
> +               complete(&op->waitq);
> +       } else {
>                 set_op_state_serviced(op);
> +               spin_unlock(&op->lock);
>         }
> -       spin_unlock(&op->lock);
> -       goto out;
> +       return ret;
> +
> +Efault:
> +       op->downcall.status = -(ORANGEFS_ERROR_BIT | 9);
> +       ret = -EFAULT;
> +       goto wakeup;
> +
> +Enomem:
> +       op->downcall.status = -(ORANGEFS_ERROR_BIT | 8);
> +       ret = -ENOMEM;
> +       goto wakeup;
>  }
>
>  /* Returns whether any FS are still pending remounted */
> diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
> index 817092a..900a2e3 100644
> --- a/fs/orangefs/orangefs-cache.c
> +++ b/fs/orangefs/orangefs-cache.c
> @@ -120,8 +120,6 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type)
>                 spin_lock_init(&new_op->lock);
>                 init_completion(&new_op->waitq);
>
> -               atomic_set(&new_op->ref_count, 1);
> -
>                 new_op->upcall.type = ORANGEFS_VFS_OP_INVALID;
>                 new_op->downcall.type = ORANGEFS_VFS_OP_INVALID;
>                 new_op->downcall.status = -1;
> @@ -149,7 +147,7 @@ struct orangefs_kernel_op_s *op_alloc(__s32 type)
>         return new_op;
>  }
>
> -void __op_release(struct orangefs_kernel_op_s *orangefs_op)
> +void op_release(struct orangefs_kernel_op_s *orangefs_op)
>  {
>         if (orangefs_op) {
>                 gossip_debug(GOSSIP_CACHE_DEBUG,
> diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
> index 1f8310c..e387d3c 100644
> --- a/fs/orangefs/orangefs-kernel.h
> +++ b/fs/orangefs/orangefs-kernel.h
> @@ -205,8 +205,6 @@ struct orangefs_kernel_op_s {
>         struct completion waitq;
>         spinlock_t lock;
>
> -       atomic_t ref_count;
> -
>         /* VFS aio fields */
>
>         int attempts;
> @@ -230,23 +228,7 @@ static inline void set_op_state_serviced(struct orangefs_kernel_op_s *op)
>  #define op_state_given_up(op)    ((op)->op_state & OP_VFS_STATE_GIVEN_UP)
>  #define op_is_cancel(op)         ((op)->downcall.type == ORANGEFS_VFS_OP_CANCEL)
>
> -static inline void get_op(struct orangefs_kernel_op_s *op)
> -{
> -       atomic_inc(&op->ref_count);
> -       gossip_debug(GOSSIP_DEV_DEBUG,
> -                       "(get) Alloced OP (%p:%llu)\n", op, llu(op->tag));
> -}
> -
> -void __op_release(struct orangefs_kernel_op_s *op);
> -
> -static inline void op_release(struct orangefs_kernel_op_s *op)
> -{
> -       if (atomic_dec_and_test(&op->ref_count)) {
> -               gossip_debug(GOSSIP_DEV_DEBUG,
> -                       "(put) Releasing OP (%p:%llu)\n", op, llu((op)->tag));
> -               __op_release(op);
> -       }
> -}
> +void op_release(struct orangefs_kernel_op_s *op);
>
>  extern void orangefs_bufmap_put(int);
>  static inline void put_cancel(struct orangefs_kernel_op_s *op)
> @@ -259,7 +241,7 @@ static inline void set_op_state_purged(struct orangefs_kernel_op_s *op)
>  {
>         spin_lock(&op->lock);
>         if (unlikely(op_is_cancel(op))) {
> -               list_del(&op->list);
> +               list_del_init(&op->list);
>                 spin_unlock(&op->lock);
>                 put_cancel(op);
>         } else {
> diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
> index d980240..3f9e430 100644
> --- a/fs/orangefs/waitqueue.c
> +++ b/fs/orangefs/waitqueue.c
> @@ -208,15 +208,20 @@ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s
>          * Called with op->lock held.
>          */
>         op->op_state |= OP_VFS_STATE_GIVEN_UP;
> -
> -       if (op_state_waiting(op)) {
> +       /* from that point on it can't be moved by anybody else */
> +       if (list_empty(&op->list)) {
> +               /* caught copying to/from daemon */
> +               BUG_ON(op_state_serviced(op));
> +               spin_unlock(&op->lock);
> +               wait_for_completion(&op->waitq);
> +       } else if (op_state_waiting(op)) {
>                 /*
>                  * upcall hasn't been read; remove op from upcall request
>                  * list.
>                  */
>                 spin_unlock(&op->lock);
>                 spin_lock(&orangefs_request_list_lock);
> -               list_del(&op->list);
> +               list_del_init(&op->list);
>                 spin_unlock(&orangefs_request_list_lock);
>                 gossip_debug(GOSSIP_WAIT_DEBUG,
>                              "Interrupted: Removed op %p from request_list\n",
> @@ -225,23 +230,16 @@ static void orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s
>                 /* op must be removed from the in progress htable */
>                 spin_unlock(&op->lock);
>                 spin_lock(&htable_ops_in_progress_lock);
> -               list_del(&op->list);
> +               list_del_init(&op->list);
>                 spin_unlock(&htable_ops_in_progress_lock);
>                 gossip_debug(GOSSIP_WAIT_DEBUG,
>                              "Interrupted: Removed op %p"
>                              " from htable_ops_in_progress\n",
>                              op);
> -       } else if (!op_state_serviced(op)) {
> +       } else {
>                 spin_unlock(&op->lock);
>                 gossip_err("interrupted operation is in a weird state 0x%x\n",
>                            op->op_state);
> -       } else {
> -               /*
> -                * It is not intended for execution to flow here,
> -                * but having this unlock here makes sparse happy.
> -                */
> -               gossip_err("%s: can't get here.\n", __func__);
> -               spin_unlock(&op->lock);
>         }
>         reinit_completion(&op->waitq);
>  }
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Al Viro Feb. 18, 2016, 7:20 p.m. UTC | #1
On Thu, Feb 18, 2016 at 01:58:52PM -0500, Mike Marshall wrote:

> wait_for_matching_downcall: operation purged (tag 10889, ffff880012898000, att 0
> service_operation: wait_for_matching_downcall returned -11 for ffff880012898000
> Interrupted: Removed op ffff880012898000 from htable_ops_in_progress

state is "in progress"

> tag 10889 (orangefs_create) -- operation to be retried (1 attempt)
> service_operation: orangefs_create op:ffff880012898000:

moved to "waiting"

> service_operation:client core is NOT in service, ffff880012898000
> 
> 
> 
> service_operation: wait_for_matching_downcall returned 0 for ffff880012898000
> service_operation orangefs_create returning: 0 for ffff880012898000

... and we've got to "serviced" somehow.

IDGI...  Are you sure that it's not a daemon replying with zero fsid?

Could you slap
        gossip_debug(GOSSIP_WAIT_DEBUG,
                     "%s: %s op:%p: process:%s state -> %d\n",
                     __func__,
                     op_name,
                     op,
                     current->comm,
		     op->op_state);
after assignments to ->op_state in set_op_state_purged() and
set_op_state_serviced() as well as after the calls of set_op_state_waiting()
(in service_operation() and orangefs_devreq_read()) and
set_op_state_inprogress() (in orangefs_devreq_read()).

Another thing: in orangefs_devreq_write_iter(), just before the
set_op_state_serviced() add
	WARN_ON(op->upcall.type == ORANGEFS_OP_VFS_CREATE &&
		!op->downcall.create.refn.fs_id);
to make sure that this crap isn't coming from the daemon.

While we are at it -
#define op_is_cancel(op)         ((op)->downcall.type == ORANGEFS_VFS_OP_CANCEL)is checking the wrong thing; should be
#define op_is_cancel(op)         ((op)->upcall.type == ORANGEFS_VFS_OP_CANCEL)

Shouldn't be worse than a leak, though, so I doubt that it could be causing
this problem...
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin Brandenburg Feb. 18, 2016, 7:49 p.m. UTC | #2
On Thu, 18 Feb 2016, Mike Marshall wrote:

> Still busted, exactly the same, I think. The doomed op gets a good
> return code from is_daemon_in_service in service_operation but
> gets EAGAIN from wait_for_matching_downcall... an edge case kind of
> problem.
> 
> Here's the raw (well, slightly edited for readability) logs showing
> the doomed op and subsequent failed op that uses the bogus handle
> and fsid from the doomed op.
> 
> 
> 
> Alloced OP (ffff880012898000: 10889 OP_CREATE)
> service_operation: orangefs_create op:ffff880012898000:
> 
> 
> 
> wait_for_matching_downcall: operation purged (tag 10889, ffff880012898000, att 0
> service_operation: wait_for_matching_downcall returned -11 for ffff880012898000
> Interrupted: Removed op ffff880012898000 from htable_ops_in_progress
> tag 10889 (orangefs_create) -- operation to be retried (1 attempt)
> service_operation: orangefs_create op:ffff880012898000:
> service_operation:client core is NOT in service, ffff880012898000
> 
> 
> 
> service_operation: wait_for_matching_downcall returned 0 for ffff880012898000
> service_operation orangefs_create returning: 0 for ffff880012898000
> orangefs_create: PPTOOLS1.PPA:
> handle:00000000-0000-0000-0000-000000000000: fsid:0:
> new_op:ffff880012898000: ret:0:
> 
> 
> 
> Alloced OP (ffff880012888000: 10958 OP_GETATTR)
> service_operation: orangefs_inode_getattr op:ffff880012888000:
> service_operation: wait_for_matching_downcall returned 0 for ffff880012888000
> service_operation orangefs_inode_getattr returning: -22 for ffff880012888000
> Releasing OP (ffff880012888000: 10958
> orangefs_create: Failed to allocate inode for file :PPTOOLS1.PPA:
> Releasing OP (ffff880012898000: 10889
> 
> 
> 
> 
> What I'm testing with differs from what is at kernel.org#for-next by
>   - diffs from Al's most recent email
>   - 1 souped up gossip message
>   - changed 0 to OP_VFS_STATE_UNKNOWN one place in service_operation
>   - reinit_completion(&op->waitq) in orangefs_clean_up_interrupted_operation
> 
> 
> 

Mike,

what error do you get from userspace (i.e. from dbench)?

open("./clients/client0/~dmtmp/EXCEL/5D7C0000", O_RDWR|O_CREAT, 0600) = -1 ENODEV (No such device)

An interesting note is that I can't reproduce at all
with only one dbench process. It seems there's not
enough load.

I don't see how the kernel could return ENODEV at all.
This may be coming from our client-core.

-- Martin
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Marshall Feb. 18, 2016, 8:08 p.m. UTC | #3
I haven't been trussing it... it reports EINVAL to stderr... I find
the ops to look
at in the debug output by looking for the -22...

(373) open ./clients/client8/~dmtmp/PARADOX/STUDENTS.DB failed for
handle 9981 (Invalid argument)

I just got the whacky code <g> from Al's last message to compile, I'll
have results from that soon...

-Mike

On Thu, Feb 18, 2016 at 2:49 PM, Martin Brandenburg <martin@omnibond.com> wrote:
> On Thu, 18 Feb 2016, Mike Marshall wrote:
>
>> Still busted, exactly the same, I think. The doomed op gets a good
>> return code from is_daemon_in_service in service_operation but
>> gets EAGAIN from wait_for_matching_downcall... an edge case kind of
>> problem.
>>
>> Here's the raw (well, slightly edited for readability) logs showing
>> the doomed op and subsequent failed op that uses the bogus handle
>> and fsid from the doomed op.
>>
>>
>>
>> Alloced OP (ffff880012898000: 10889 OP_CREATE)
>> service_operation: orangefs_create op:ffff880012898000:
>>
>>
>>
>> wait_for_matching_downcall: operation purged (tag 10889, ffff880012898000, att 0
>> service_operation: wait_for_matching_downcall returned -11 for ffff880012898000
>> Interrupted: Removed op ffff880012898000 from htable_ops_in_progress
>> tag 10889 (orangefs_create) -- operation to be retried (1 attempt)
>> service_operation: orangefs_create op:ffff880012898000:
>> service_operation:client core is NOT in service, ffff880012898000
>>
>>
>>
>> service_operation: wait_for_matching_downcall returned 0 for ffff880012898000
>> service_operation orangefs_create returning: 0 for ffff880012898000
>> orangefs_create: PPTOOLS1.PPA:
>> handle:00000000-0000-0000-0000-000000000000: fsid:0:
>> new_op:ffff880012898000: ret:0:
>>
>>
>>
>> Alloced OP (ffff880012888000: 10958 OP_GETATTR)
>> service_operation: orangefs_inode_getattr op:ffff880012888000:
>> service_operation: wait_for_matching_downcall returned 0 for ffff880012888000
>> service_operation orangefs_inode_getattr returning: -22 for ffff880012888000
>> Releasing OP (ffff880012888000: 10958
>> orangefs_create: Failed to allocate inode for file :PPTOOLS1.PPA:
>> Releasing OP (ffff880012898000: 10889
>>
>>
>>
>>
>> What I'm testing with differs from what is at kernel.org#for-next by
>>   - diffs from Al's most recent email
>>   - 1 souped up gossip message
>>   - changed 0 to OP_VFS_STATE_UNKNOWN one place in service_operation
>>   - reinit_completion(&op->waitq) in orangefs_clean_up_interrupted_operation
>>
>>
>>
>
> Mike,
>
> what error do you get from userspace (i.e. from dbench)?
>
> open("./clients/client0/~dmtmp/EXCEL/5D7C0000", O_RDWR|O_CREAT, 0600) = -1 ENODEV (No such device)
>
> An interesting note is that I can't reproduce at all
> with only one dbench process. It seems there's not
> enough load.
>
> I don't see how the kernel could return ENODEV at all.
> This may be coming from our client-core.
>
> -- Martin
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Marshall Feb. 18, 2016, 8:22 p.m. UTC | #4
I haven't edited up a list of how the debug output looked,
but most importantly: the WARN_ON is hit... it appears that
the client-core is sending over fsid:0:

-Mike

On Thu, Feb 18, 2016 at 3:08 PM, Mike Marshall <hubcap@omnibond.com> wrote:
> I haven't been trussing it... it reports EINVAL to stderr... I find
> the ops to look
> at in the debug output by looking for the -22...
>
> (373) open ./clients/client8/~dmtmp/PARADOX/STUDENTS.DB failed for
> handle 9981 (Invalid argument)
>
> I just got the whacky code <g> from Al's last message to compile, I'll
> have results from that soon...
>
> -Mike
>
> On Thu, Feb 18, 2016 at 2:49 PM, Martin Brandenburg <martin@omnibond.com> wrote:
>> On Thu, 18 Feb 2016, Mike Marshall wrote:
>>
>>> Still busted, exactly the same, I think. The doomed op gets a good
>>> return code from is_daemon_in_service in service_operation but
>>> gets EAGAIN from wait_for_matching_downcall... an edge case kind of
>>> problem.
>>>
>>> Here's the raw (well, slightly edited for readability) logs showing
>>> the doomed op and subsequent failed op that uses the bogus handle
>>> and fsid from the doomed op.
>>>
>>>
>>>
>>> Alloced OP (ffff880012898000: 10889 OP_CREATE)
>>> service_operation: orangefs_create op:ffff880012898000:
>>>
>>>
>>>
>>> wait_for_matching_downcall: operation purged (tag 10889, ffff880012898000, att 0
>>> service_operation: wait_for_matching_downcall returned -11 for ffff880012898000
>>> Interrupted: Removed op ffff880012898000 from htable_ops_in_progress
>>> tag 10889 (orangefs_create) -- operation to be retried (1 attempt)
>>> service_operation: orangefs_create op:ffff880012898000:
>>> service_operation:client core is NOT in service, ffff880012898000
>>>
>>>
>>>
>>> service_operation: wait_for_matching_downcall returned 0 for ffff880012898000
>>> service_operation orangefs_create returning: 0 for ffff880012898000
>>> orangefs_create: PPTOOLS1.PPA:
>>> handle:00000000-0000-0000-0000-000000000000: fsid:0:
>>> new_op:ffff880012898000: ret:0:
>>>
>>>
>>>
>>> Alloced OP (ffff880012888000: 10958 OP_GETATTR)
>>> service_operation: orangefs_inode_getattr op:ffff880012888000:
>>> service_operation: wait_for_matching_downcall returned 0 for ffff880012888000
>>> service_operation orangefs_inode_getattr returning: -22 for ffff880012888000
>>> Releasing OP (ffff880012888000: 10958
>>> orangefs_create: Failed to allocate inode for file :PPTOOLS1.PPA:
>>> Releasing OP (ffff880012898000: 10889
>>>
>>>
>>>
>>>
>>> What I'm testing with differs from what is at kernel.org#for-next by
>>>   - diffs from Al's most recent email
>>>   - 1 souped up gossip message
>>>   - changed 0 to OP_VFS_STATE_UNKNOWN one place in service_operation
>>>   - reinit_completion(&op->waitq) in orangefs_clean_up_interrupted_operation
>>>
>>>
>>>
>>
>> Mike,
>>
>> what error do you get from userspace (i.e. from dbench)?
>>
>> open("./clients/client0/~dmtmp/EXCEL/5D7C0000", O_RDWR|O_CREAT, 0600) = -1 ENODEV (No such device)
>>
>> An interesting note is that I can't reproduce at all
>> with only one dbench process. It seems there's not
>> enough load.
>>
>> I don't see how the kernel could return ENODEV at all.
>> This may be coming from our client-core.
>>
>> -- Martin
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Marshall Feb. 18, 2016, 8:38 p.m. UTC | #5
Yeah, it looks like the fault is entirely with the client-core...

orangefs-kernel.h:      OP_VFS_STATE_UNKNOWN = 0,
orangefs-kernel.h:      OP_VFS_STATE_WAITING = 1,
orangefs-kernel.h:      OP_VFS_STATE_INPROGR = 2,
orangefs-kernel.h:      OP_VFS_STATE_SERVICED = 4,
orangefs-kernel.h:      OP_VFS_STATE_PURGED = 8,
orangefs-kernel.h:      OP_VFS_STATE_GIVEN_UP = 16,


Alloced OP (ffff880011078000: 20210 OP_CREATE)
service_operation: orangefs_create op:ffff880011078000:
service_op: orangefs_create op:ffff880011078000: process:dbench state -> 1

orangefs_devreq_read: op:ffff880011078000: process:pvfs2-client-co state -> 2

set_op_state_purged: op:ffff880011078000: process:pvfs2-client-co state -> 10

wait_for_matching_downcall: operation purged (tag 20210, ffff880011078000, att 0
service_operation: wait_for_matching_downcall returned -11 for ffff880011078000
Interrupted: Removed op ffff880011078000 from htable_ops_in_progress
tag 20210 (orangefs_create) -- operation to be retried (1 attempt)
service_operation: orangefs_create op:ffff880011078000:
process:dbench: pid:1171service_op: orangefs_create
op:ffff880011078000: process:dbench state -> 1
service_operation:client core is NOT in service, ffff880011078000

orangefs_devreq_read: op:ffff880011078000: process:pvfs2-client-co state -> 2

WARNING: CPU: 0 PID: 1216 at fs/orangefs/devorangefs-req.c:423
set_op_state_serviced: op:ffff880011078000: process:pvfs2-client-co state -> 4
service_operation: wait_for_matching_downcall returned 0 for ffff880011078000
service_operation orangefs_create returning: 0 for ffff880011078000
orangefs_create: BENCHS.LWP:
handle:00000000-0000-0000-0000-000000000000: fsid:0:
new_op:ffff880011078000: ret:0:

-Mike

On Thu, Feb 18, 2016 at 3:22 PM, Mike Marshall <hubcap@omnibond.com> wrote:
> I haven't edited up a list of how the debug output looked,
> but most importantly: the WARN_ON is hit... it appears that
> the client-core is sending over fsid:0:
>
> -Mike
>
> On Thu, Feb 18, 2016 at 3:08 PM, Mike Marshall <hubcap@omnibond.com> wrote:
>> I haven't been trussing it... it reports EINVAL to stderr... I find
>> the ops to look
>> at in the debug output by looking for the -22...
>>
>> (373) open ./clients/client8/~dmtmp/PARADOX/STUDENTS.DB failed for
>> handle 9981 (Invalid argument)
>>
>> I just got the whacky code <g> from Al's last message to compile, I'll
>> have results from that soon...
>>
>> -Mike
>>
>> On Thu, Feb 18, 2016 at 2:49 PM, Martin Brandenburg <martin@omnibond.com> wrote:
>>> On Thu, 18 Feb 2016, Mike Marshall wrote:
>>>
>>>> Still busted, exactly the same, I think. The doomed op gets a good
>>>> return code from is_daemon_in_service in service_operation but
>>>> gets EAGAIN from wait_for_matching_downcall... an edge case kind of
>>>> problem.
>>>>
>>>> Here's the raw (well, slightly edited for readability) logs showing
>>>> the doomed op and subsequent failed op that uses the bogus handle
>>>> and fsid from the doomed op.
>>>>
>>>>
>>>>
>>>> Alloced OP (ffff880012898000: 10889 OP_CREATE)
>>>> service_operation: orangefs_create op:ffff880012898000:
>>>>
>>>>
>>>>
>>>> wait_for_matching_downcall: operation purged (tag 10889, ffff880012898000, att 0
>>>> service_operation: wait_for_matching_downcall returned -11 for ffff880012898000
>>>> Interrupted: Removed op ffff880012898000 from htable_ops_in_progress
>>>> tag 10889 (orangefs_create) -- operation to be retried (1 attempt)
>>>> service_operation: orangefs_create op:ffff880012898000:
>>>> service_operation:client core is NOT in service, ffff880012898000
>>>>
>>>>
>>>>
>>>> service_operation: wait_for_matching_downcall returned 0 for ffff880012898000
>>>> service_operation orangefs_create returning: 0 for ffff880012898000
>>>> orangefs_create: PPTOOLS1.PPA:
>>>> handle:00000000-0000-0000-0000-000000000000: fsid:0:
>>>> new_op:ffff880012898000: ret:0:
>>>>
>>>>
>>>>
>>>> Alloced OP (ffff880012888000: 10958 OP_GETATTR)
>>>> service_operation: orangefs_inode_getattr op:ffff880012888000:
>>>> service_operation: wait_for_matching_downcall returned 0 for ffff880012888000
>>>> service_operation orangefs_inode_getattr returning: -22 for ffff880012888000
>>>> Releasing OP (ffff880012888000: 10958
>>>> orangefs_create: Failed to allocate inode for file :PPTOOLS1.PPA:
>>>> Releasing OP (ffff880012898000: 10889
>>>>
>>>>
>>>>
>>>>
>>>> What I'm testing with differs from what is at kernel.org#for-next by
>>>>   - diffs from Al's most recent email
>>>>   - 1 souped up gossip message
>>>>   - changed 0 to OP_VFS_STATE_UNKNOWN one place in service_operation
>>>>   - reinit_completion(&op->waitq) in orangefs_clean_up_interrupted_operation
>>>>
>>>>
>>>>
>>>
>>> Mike,
>>>
>>> what error do you get from userspace (i.e. from dbench)?
>>>
>>> open("./clients/client0/~dmtmp/EXCEL/5D7C0000", O_RDWR|O_CREAT, 0600) = -1 ENODEV (No such device)
>>>
>>> An interesting note is that I can't reproduce at all
>>> with only one dbench process. It seems there's not
>>> enough load.
>>>
>>> I don't see how the kernel could return ENODEV at all.
>>> This may be coming from our client-core.
>>>
>>> -- Martin
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 18, 2016, 8:49 p.m. UTC | #6
On Thu, Feb 18, 2016 at 03:22:33PM -0500, Mike Marshall wrote:
> I haven't edited up a list of how the debug output looked,
> but most importantly: the WARN_ON is hit... it appears that
> the client-core is sending over fsid:0:

OK, that's a bit of relief...  The next question, of course, is whether it's
a genuine reply or buggered attempt to copy it from userland and/or something
stomping on that memory.

It should've come from package_downcall_members(), right?  And there you
have this:
                if (*error_code == -PVFS_EEXIST)
                {
                    PVFS_hint hints;
                    PVFS_credential *credential;

                    fill_hints(&hints, vfs_request);

                    credential = lookup_credential(
                        vfs_request->in_upcall.uid,
                        vfs_request->in_upcall.gid);

                    /* compat */
                    refn1.handle =
                     pvfs2_khandle_to_ino(
                      &(vfs_request->in_upcall.req.create.parent_refn.khandle));
                    refn1.fs_id =
                      vfs_request->in_upcall.req.create.parent_refn.fs_id;
                    refn1.__pad1 =
                      vfs_request->in_upcall.req.create.parent_refn.__pad1;


//hubcap            vfs_request->out_downcall.resp.create.refn =
                    refn2 =
                      perform_lookup_on_create_error(

And AFAICS nothing in there sets resp.create.refn.  Is it actually set
earlier?
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 18, 2016, 8:52 p.m. UTC | #7
On Thu, Feb 18, 2016 at 03:38:26PM -0500, Mike Marshall wrote:
> WARNING: CPU: 0 PID: 1216 at fs/orangefs/devorangefs-req.c:423
> set_op_state_serviced: op:ffff880011078000: process:pvfs2-client-co state -> 4
> service_operation: wait_for_matching_downcall returned 0 for ffff880011078000
> service_operation orangefs_create returning: 0 for ffff880011078000
> orangefs_create: BENCHS.LWP:
> handle:00000000-0000-0000-0000-000000000000: fsid:0:
> new_op:ffff880011078000: ret:0:

Smells like retry hitting EEXIST and package_downcall_members() treatment of
that case doesn't set create.refn at all - used to, but that code is commented
out.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Marshall Feb. 18, 2016, 9:50 p.m. UTC | #8
As part of the attempt to go upstream, this "hubcap" guy you see
in the comments worked on a thing that changes 64bit userspace handles
back and forth into 128bit kernel handles... we did this because
one day, when we have orangefs3, we will be using 128bit uuid-derived
handles, and we believe it is our responsibility to not break the
upstream kernel module.

Anywho, I bet you are right Al, he messed up this part of it...
I'll look and see if that is really so, and get it fixed.

-Mike "hubcap"

On Thu, Feb 18, 2016 at 3:52 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Feb 18, 2016 at 03:38:26PM -0500, Mike Marshall wrote:
>> WARNING: CPU: 0 PID: 1216 at fs/orangefs/devorangefs-req.c:423
>> set_op_state_serviced: op:ffff880011078000: process:pvfs2-client-co state -> 4
>> service_operation: wait_for_matching_downcall returned 0 for ffff880011078000
>> service_operation orangefs_create returning: 0 for ffff880011078000
>> orangefs_create: BENCHS.LWP:
>> handle:00000000-0000-0000-0000-000000000000: fsid:0:
>> new_op:ffff880011078000: ret:0:
>
> Smells like retry hitting EEXIST and package_downcall_members() treatment of
> that case doesn't set create.refn at all - used to, but that code is commented
> out.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 19, 2016, 12:25 a.m. UTC | #9
On Thu, Feb 18, 2016 at 04:50:11PM -0500, Mike Marshall wrote:
> As part of the attempt to go upstream, this "hubcap" guy you see
> in the comments worked on a thing that changes 64bit userspace handles
> back and forth into 128bit kernel handles... we did this because
> one day, when we have orangefs3, we will be using 128bit uuid-derived
> handles, and we believe it is our responsibility to not break the
> upstream kernel module.
> 
> Anywho, I bet you are right Al, he messed up this part of it...
> I'll look and see if that is really so, and get it fixed.
> 
> -Mike "hubcap"

OK...  I'll fold the trivial braino fix (op_is_cancel() checking the wrong
thing) into "orangefs: delay freeing slot until cancel completes" where it
had been introduced, but the rest of it is probably too far and will have
to be a couple of commits on top of that queue.  Had it been just my tree,
I probably would still reorder and fold, but I know that my habits in that
respect are rather extreme.

FWIW, the scenario spotted by Martin wouldn't cause any real problems, but
only because by the time we ended copying to/from daemon service_operation()
couldn't have reached resubmit - it only happens if there had been a purge
and that can't happen while somebody is inside a control device method.

So the original code had been correct, but it was more brittle than
I'd like *and* making sure that nobody else sees an op by the time
orangefs_clean_interrupted_operation() returns is a good thing.

New logics gives that, and avoids the need to play with refcounts on ops.

I've pushed that into #orangefs-untested; if that works, please switch your
for-next to it.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Marshall Feb. 19, 2016, 10:11 p.m. UTC | #10
Yay! The problem is fixed.

Boo! Now a new problem is uncovered, I don't have a handle on it yet.
Now it is possible to create a broken file on the orangefs server
across a restart of the client-core.

dbench:
(808) open ./clients/client0/~dmtmp/PWRPNT/PPTC112.TMP failed for
handle 10042 (No such file or directory)

ls -l /pvfsmnt/clients/client0/~dmtmp/PWRPNT
ls: cannot access /pvfsmnt/clients/client0/~dmtmp/PWRPNT/PPTC112.TMP:
No such file or directory
total 1364
-rw-------. 1 root root  85026 Feb 19 14:53 NEWPCB.PPT
-rw-------. 1 root root 260096 Feb 19 14:52 PCBENCHM.PPT
??????????? ? ?    ?         ?            ? PPTC112.TMP
-rw-------. 1 root root 260096 Feb 19 14:51 PPTOOLS1.PPA
-rw-------. 1 root root 260096 Feb 19 14:51 TIPS.PPT
-rw-------. 1 root root 260096 Feb 19 14:51 TRIDOTS.POT
-rw-------. 1 root root 260096 Feb 19 14:51 ZD16.BMP

The filename comes back from the server in the readdir buffer.

I can reproduce this, so I'll have to work the problem some more
to find more information. First place I'll look is the khandle
code <g>...

Anywho...

The fixed version of the client-core for the other problem is in
this SVN repository:

http://www.orangefs.org/svn/orangefs/branches/trunk.kernel.update/

As far as orangefs for-next is concerned... I don't see how to
update it without destroying the top few commit messages in the
commit history.

I plan to update the kernel.org orangefs for-next tree to look exactly
like the "current" branch of my github tree, unless someone says
not to:

github.com/hubcapsc/linux/tree/current           Latest commit c1223ca

-Mike

On Thu, Feb 18, 2016 at 7:25 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Thu, Feb 18, 2016 at 04:50:11PM -0500, Mike Marshall wrote:
>> As part of the attempt to go upstream, this "hubcap" guy you see
>> in the comments worked on a thing that changes 64bit userspace handles
>> back and forth into 128bit kernel handles... we did this because
>> one day, when we have orangefs3, we will be using 128bit uuid-derived
>> handles, and we believe it is our responsibility to not break the
>> upstream kernel module.
>>
>> Anywho, I bet you are right Al, he messed up this part of it...
>> I'll look and see if that is really so, and get it fixed.
>>
>> -Mike "hubcap"
>
> OK...  I'll fold the trivial braino fix (op_is_cancel() checking the wrong
> thing) into "orangefs: delay freeing slot until cancel completes" where it
> had been introduced, but the rest of it is probably too far and will have
> to be a couple of commits on top of that queue.  Had it been just my tree,
> I probably would still reorder and fold, but I know that my habits in that
> respect are rather extreme.
>
> FWIW, the scenario spotted by Martin wouldn't cause any real problems, but
> only because by the time we ended copying to/from daemon service_operation()
> couldn't have reached resubmit - it only happens if there had been a purge
> and that can't happen while somebody is inside a control device method.
>
> So the original code had been correct, but it was more brittle than
> I'd like *and* making sure that nobody else sees an op by the time
> orangefs_clean_interrupted_operation() returns is a good thing.
>
> New logics gives that, and avoids the need to play with refcounts on ops.
>
> I've pushed that into #orangefs-untested; if that works, please switch your
> for-next to it.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 19, 2016, 10:22 p.m. UTC | #11
On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:

> I plan to update the kernel.org orangefs for-next tree to look exactly
> like the "current" branch of my github tree, unless someone says
> not to:
> 
> github.com/hubcapsc/linux/tree/current           Latest commit c1223ca

$ git checkout current
$ git fetch git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git orangefs-untested
$ git diff FETCH_HEAD # should report no differences
$ git reset --hard FETCH_HEAD
$ git push --force

then push the same branch into your kernel.org (as for-next, again with -force).
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 19, 2016, 10:32 p.m. UTC | #12
On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:

> Boo! Now a new problem is uncovered, I don't have a handle on it yet.
> Now it is possible to create a broken file on the orangefs server
> across a restart of the client-core.

I suspect that it's your "getattr after create and leave dentry negative if
that getattr fails".  Might make sense to d_drop() the sucker in case of
such late failure - or mark it so that subsequent d_revalidate() would
*not* skip getattr, despite NULL ->d_inode.

Incidentally, why does your ->d_revalidate() bother with d_drop()?  Just
have it return 0 and let the caller DTRT...
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin Brandenburg Feb. 19, 2016, 10:45 p.m. UTC | #13
On Fri, 19 Feb 2016, Al Viro wrote:

> On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:
> 
> > Boo! Now a new problem is uncovered, I don't have a handle on it yet.
> > Now it is possible to create a broken file on the orangefs server
> > across a restart of the client-core.
> 
> I suspect that it's your "getattr after create and leave dentry negative if
> that getattr fails".  Might make sense to d_drop() the sucker in case of
> such late failure - or mark it so that subsequent d_revalidate() would
> *not* skip getattr, despite NULL ->d_inode.
> 
> Incidentally, why does your ->d_revalidate() bother with d_drop()?  Just
> have it return 0 and let the caller DTRT...
> 

Because I recently worked on it and didn't know that was
desirable. *oops*

I see what fs/namei.c does now.

I suppose you see the request I just sent out for review
of that code.

-- Martin
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Martin Brandenburg Feb. 19, 2016, 10:50 p.m. UTC | #14
On Fri, 19 Feb 2016, Al Viro wrote:

> On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:
> 
> > Boo! Now a new problem is uncovered, I don't have a handle on it yet.
> > Now it is possible to create a broken file on the orangefs server
> > across a restart of the client-core.
> 
> I suspect that it's your "getattr after create and leave dentry negative if
> that getattr fails".  Might make sense to d_drop() the sucker in case of
> such late failure - or mark it so that subsequent d_revalidate() would
> *not* skip getattr, despite NULL ->d_inode.
> 
> Incidentally, why does your ->d_revalidate() bother with d_drop()?  Just
> have it return 0 and let the caller DTRT...
> 

However I'm not so sure the kernel is at fault here. We
see with a userspace tool which just opens a socket to
the server that orangefs-readdir lists the file and
orangefs-stat says ENOENT.

Looks like the server's database is corrupt.

-- Martin
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Marshall Feb. 20, 2016, 12:14 p.m. UTC | #15
Hi Al...

There's something I'll be thinking about all weekend (while my friend
Stanley the grader helps me distribute 40 tons of gravel)...

Your orangefs-untested branch has 5625087 commits. My "current" branch
has 5625087 commits. In each all of the commit signatures match, except
for the most recent 15 commits. The last 15 commits in my "current"
branch were made from your orangefs-untested branch with "git format-patch"
and applied to my "current" branch with "git am -s". "git log -p" shows that
my most recent 15 commits differ from your most recent 15 commits by
the addition of my "sign off" line.

I will absolutely update my kernel.org for-next branch with the procedure you
outlined, because you said so.

I wish I understood it better, though... I can only guess at this point that
the procedure you outlined will do some desirable thing to git metadata...?

-Mike

On Fri, Feb 19, 2016 at 5:22 PM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Fri, Feb 19, 2016 at 05:11:29PM -0500, Mike Marshall wrote:
>
>> I plan to update the kernel.org orangefs for-next tree to look exactly
>> like the "current" branch of my github tree, unless someone says
>> not to:
>>
>> github.com/hubcapsc/linux/tree/current           Latest commit c1223ca
>
> $ git checkout current
> $ git fetch git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs.git orangefs-untested
> $ git diff FETCH_HEAD # should report no differences
> $ git reset --hard FETCH_HEAD
> $ git push --force
>
> then push the same branch into your kernel.org (as for-next, again with -force).
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Al Viro Feb. 20, 2016, 1:36 p.m. UTC | #16
On Sat, Feb 20, 2016 at 07:14:26AM -0500, Mike Marshall wrote:

> Your orangefs-untested branch has 5625087 commits. My "current" branch
> has 5625087 commits. In each all of the commit signatures match, except
> for the most recent 15 commits. The last 15 commits in my "current"
> branch were made from your orangefs-untested branch with "git format-patch"
> and applied to my "current" branch with "git am -s". "git log -p" shows that
> my most recent 15 commits differ from your most recent 15 commits by
> the addition of my "sign off" line.

*blinks*
*checks*

OK, ignore what I asked, then.  Looks like I'd screwed up checking last time.

> I will absolutely update my kernel.org for-next branch with the procedure you
> outlined, because you said so.
> 
> I wish I understood it better, though... I can only guess at this point that
> the procedure you outlined will do some desirable thing to git metadata...?

None whatsoever, ignore it.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Mike Marshall Feb. 22, 2016, 4:20 p.m. UTC | #17
> Looks like I'd screwed up checking last time.

Probably not that <g>... my branch did diverge over the course
of the few days that we were thrashing around in the kernel trying
to fix what I had broken two years ago in userspace.

I can relate to why you were motivated to remove the thrashing
around from the git history, but your git-foo is much stronger
than mine. I wanted to try and get my branch back into line using
a methodology that I understand to keep from ending up like
this fellow:

http://myweb.clemson.edu/~hubcap/harris.jpg

I'm glad it worked out... my kernel.org for-next branch is updated now.

so, I'll keep working the problem, using your d_drop idea first off...
I'll be back with more information, and hopefully even have it fixed, soon...

-Mike

On Sat, Feb 20, 2016 at 8:36 AM, Al Viro <viro@zeniv.linux.org.uk> wrote:
> On Sat, Feb 20, 2016 at 07:14:26AM -0500, Mike Marshall wrote:
>
>> Your orangefs-untested branch has 5625087 commits. My "current" branch
>> has 5625087 commits. In each all of the commit signatures match, except
>> for the most recent 15 commits. The last 15 commits in my "current"
>> branch were made from your orangefs-untested branch with "git format-patch"
>> and applied to my "current" branch with "git am -s". "git log -p" shows that
>> my most recent 15 commits differ from your most recent 15 commits by
>> the addition of my "sign off" line.
>
> *blinks*
> *checks*
>
> OK, ignore what I asked, then.  Looks like I'd screwed up checking last time.
>
>> I will absolutely update my kernel.org for-next branch with the procedure you
>> outlined, because you said so.
>>
>> I wish I understood it better, though... I can only guess at this point that
>> the procedure you outlined will do some desirable thing to git metadata...?
>
> None whatsoever, ignore it.
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Patch
diff mbox

diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c
index b27ed1c..f7914f5 100644
--- a/fs/orangefs/devorangefs-req.c
+++ b/fs/orangefs/devorangefs-req.c
@@ -58,9 +58,9 @@  static struct orangefs_kernel_op_s
*orangefs_devreq_remove_op(__u64 tag)
  next,
  &htable_ops_in_progress[index],
  list) {
- if (op->tag == tag && !op_state_purged(op)) {
+ if (op->tag == tag && !op_state_purged(op) &&
+    !op_state_given_up(op)) {
  list_del_init(&op->list);
- get_op(op); /* increase ref count. */
  spin_unlock(&htable_ops_in_progress_lock);
  return op;
  }
@@ -133,7 +133,7 @@  restart:
  __s32 fsid;
  /* This lock is held past the end of the loop when we break. */
  spin_lock(&op->lock);
- if (unlikely(op_state_purged(op))) {
+ if (unlikely(op_state_purged(op) || op_state_given_up(op))) {
  spin_unlock(&op->lock);
  continue;
  }
@@ -199,13 +199,12 @@  restart:
  */
  if (op_state_in_progress(cur_op) || op_state_serviced(cur_op)) {
  gossip_err("orangefs: ERROR: Current op already queued.\n");
- list_del(&cur_op->list);
+ list_del_init(&cur_op->list);
  spin_unlock(&cur_op->lock);
  spin_unlock(&orangefs_request_list_lock);
  return -EAGAIN;
  }
  list_del_init(&cur_op->list);
- get_op(op);
  spin_unlock(&orangefs_request_list_lock);

  spin_unlock(&cur_op->lock);
@@ -230,7 +229,7 @@  restart:
  if (unlikely(op_state_given_up(cur_op))) {
  spin_unlock(&cur_op->lock);
  spin_unlock(&htable_ops_in_progress_lock);
- op_release(cur_op);
+ complete(&cur_op->waitq);
  goto restart;
  }

@@ -242,7 +241,6 @@  restart:
  orangefs_devreq_add_op(cur_op);
  spin_unlock(&cur_op->lock);
  spin_unlock(&htable_ops_in_progress_lock);
- op_release(cur_op);

  /* The client only asks to read one size buffer. */
  return MAX_DEV_REQ_UPSIZE;
@@ -258,10 +256,12 @@  error:
  if (likely(!op_state_given_up(cur_op))) {
  set_op_state_waiting(cur_op);
  list_add(&cur_op->list, &orangefs_request_list);
+ spin_unlock(&cur_op->lock);
+ } else {
+ spin_unlock(&cur_op->lock);
+ complete(&cur_op->waitq);
  }
- spin_unlock(&cur_op->lock);
  spin_unlock(&orangefs_request_list_lock);
- op_release(cur_op);
  return -EFAULT;
 }

@@ -333,8 +333,7 @@  static ssize_t orangefs_devreq_write_iter(struct
kiocb *iocb,
  n = copy_from_iter(&op->downcall, downcall_size, iter);
  if (n != downcall_size) {
  gossip_err("%s: failed to copy downcall.\n", __func__);
- ret = -EFAULT;
- goto Broken;
+ goto Efault;
  }

  if (op->downcall.status)
@@ -354,8 +353,7 @@  static ssize_t orangefs_devreq_write_iter(struct
kiocb *iocb,
    downcall_size,
    op->downcall.trailer_size,
    total);
- ret = -EFAULT;
- goto Broken;
+ goto Efault;
  }

  /* Only READDIR operations should have trailers. */
@@ -364,8 +362,7 @@  static ssize_t orangefs_devreq_write_iter(struct
kiocb *iocb,
  gossip_err("%s: %x operation with trailer.",
    __func__,
    op->downcall.type);
- ret = -EFAULT;
- goto Broken;
+ goto Efault;
  }

  /* READDIR operations should always have trailers. */
@@ -374,8 +371,7 @@  static ssize_t orangefs_devreq_write_iter(struct
kiocb *iocb,
  gossip_err("%s: %x operation with no trailer.",
    __func__,
    op->downcall.type);
- ret = -EFAULT;
- goto Broken;
+ goto Efault;
  }

  if (op->downcall.type != ORANGEFS_VFS_OP_READDIR)
@@ -386,8 +382,7 @@  static ssize_t orangefs_devreq_write_iter(struct
kiocb *iocb,
  if (op->downcall.trailer_buf == NULL) {
  gossip_err("%s: failed trailer vmalloc.\n",
    __func__);
- ret = -ENOMEM;
- goto Broken;
+ goto Enomem;
  }
  memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size);
  n = copy_from_iter(op->downcall.trailer_buf,
@@ -396,8 +391,7 @@  static ssize_t orangefs_devreq_write_iter(struct
kiocb *iocb,
  if (n != op->downcall.trailer_size) {
  gossip_err("%s: failed to copy trailer.\n", __func__);
  vfree(op->downcall.trailer_buf);
- ret = -EFAULT;
- goto Broken;
+ goto Efault;
  }

 wakeup:
@@ -406,38 +400,27 @@  wakeup:
  * that this op is done
  */
  spin_lock(&op->lock);
- if (unlikely(op_state_given_up(op))) {
+ if (unlikely(op_is_cancel(op))) {
  spin_unlock(&op->lock);
- goto out;
- }
- set_op_state_serviced(op);
- spin_unlock(&op->lock);
-
- /*
- * If this operation is an I/O operation we need to wait
- * for all data to be copied before we can return to avoid
- * buffer corruption and races that can pull the buffers
- * out from under us.
- *
- * Essentially we're synchronizing with other parts of the
- * vfs implicitly by not allowing the user space
- * application reading/writing this device to return until
- * the buffers are done being used.
- */
-out:
- if (unlikely(op_is_cancel(op)))
  put_cancel(op);
- op_release(op);
- return ret;
-
-Broken:
- spin_lock(&op->lock);
- if (!op_state_given_up(op)) {
- op->downcall.status = ret;
+ } else if (unlikely(op_state_given_up(op))) {
+ spin_unlock(&op->lock);
+ complete(&op->waitq);
+ } else {
  set_op_state_serviced(op);
+ spin_unlock(&op->lock);
  }
- spin_unlock(&op->lock);
- goto out;
+ return ret;
+
+Efault:
+ op->downcall.status = -(ORANGEFS_ERROR_BIT | 9);
+ ret = -EFAULT;
+ goto wakeup;
+
+Enomem:
+ op->downcall.status = -(ORANGEFS_ERROR_BIT | 8);
+ ret = -ENOMEM;
+ goto wakeup;
 }

 /* Returns whether any FS are still pending remounted */
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index 817092a..900a2e3 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -120,8 +120,6 @@  struct orangefs_kernel_op_s *op_alloc(__s32 type)
  spin_lock_init(&new_op->lock);
  init_completion(&new_op->waitq);

- atomic_set(&new_op->ref_count, 1);
-
  new_op->upcall.type = ORANGEFS_VFS_OP_INVALID;
  new_op->downcall.type = ORANGEFS_VFS_OP_INVALID;
  new_op->downcall.status = -1;
@@ -149,7 +147,7 @@  struct orangefs_kernel_op_s *op_alloc(__s32 type)
  return new_op;
 }

-void __op_release(struct orangefs_kernel_op_s *orangefs_op)
+void op_release(struct orangefs_kernel_op_s *orangefs_op)
 {
  if (orangefs_op) {
  gossip_debug(GOSSIP_CACHE_DEBUG,
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 1f8310c..e387d3c 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -205,8 +205,6 @@  struct orangefs_kernel_op_s {
  struct completion waitq;
  spinlock_t lock;

- atomic_t ref_count;
-
  /* VFS aio fields */

  int attempts;
@@ -230,23 +228,7 @@  static inline void set_op_state_serviced(struct
orangefs_kernel_op_s *op)
 #define op_state_given_up(op)    ((op)->op_state & OP_VFS_STATE_GIVEN_UP)
 #define op_is_cancel(op)         ((op)->downcall.type ==
ORANGEFS_VFS_OP_CANCEL)

-static inline void get_op(struct orangefs_kernel_op_s *op)
-{
- atomic_inc(&op->ref_count);
- gossip_debug(GOSSIP_DEV_DEBUG,
- "(get) Alloced OP (%p:%llu)\n", op, llu(op->tag));
-}
-
-void __op_release(struct orangefs_kernel_op_s *op);
-
-static inline void op_release(struct orangefs_kernel_op_s *op)
-{
- if (atomic_dec_and_test(&op->ref_count)) {
- gossip_debug(GOSSIP_DEV_DEBUG,
- "(put) Releasing OP (%p:%llu)\n", op, llu((op)->tag));
- __op_release(op);
- }
-}
+void op_release(struct orangefs_kernel_op_s *op);

 extern void orangefs_bufmap_put(int);
 static inline void put_cancel(struct orangefs_kernel_op_s *op)
@@ -259,7 +241,7 @@  static inline void set_op_state_purged(struct
orangefs_kernel_op_s *op)
 {
  spin_lock(&op->lock);
  if (unlikely(op_is_cancel(op))) {
- list_del(&op->list);
+ list_del_init(&op->list);
  spin_unlock(&op->lock);
  put_cancel(op);
  } else {
diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c
index 2528d58..0ea2741 100644
--- a/fs/orangefs/waitqueue.c
+++ b/fs/orangefs/waitqueue.c
@@ -65,7 +65,7 @@  int service_operation(struct orangefs_kernel_op_s *op,
  op->upcall.pid = current->pid;

 retry_servicing:
- op->downcall.status = 0;
+ op->downcall.status = OP_VFS_STATE_UNKNOWN;
  gossip_debug(GOSSIP_WAIT_DEBUG,
      "%s: %s op:%p: process:%s: pid:%d:\n",
      __func__,
@@ -103,8 +103,9 @@  retry_servicing:
  wake_up_interruptible(&orangefs_request_list_waitq);
  if (!__is_daemon_in_service()) {
  gossip_debug(GOSSIP_WAIT_DEBUG,
-     "%s:client core is NOT in service.\n",
-     __func__);
+     "%s:client core is NOT in service, %p.\n",
+     __func__,
+     op);
  timeout = op_timeout_secs * HZ;
  }
  spin_unlock(&orangefs_request_list_lock);
@@ -208,15 +209,20 @@  static void
orangefs_clean_up_interrupted_operation(struct orangefs_kernel_op_s
  * Called with op->lock held.
  */
  op->op_state |= OP_VFS_STATE_GIVEN_UP;
-
- if (op_state_waiting(op)) {
+ /* from that point on it can't be moved by anybody else */
+ if (list_empty(&op->list)) {
+ /* caught copying to/from daemon */
+ BUG_ON(op_state_serviced(op));
+ spin_unlock(&op->lock);
+ wait_for_completion(&op->waitq);
+ } else if (op_state_waiting(op)) {
  /*
  * upcall hasn't been read; remove op from upcall request
  * list.
  */
  spin_unlock(&op->lock);
  spin_lock(&orangefs_request_list_lock);
- list_del(&op->list);
+ list_del_init(&op->list);
  spin_unlock(&orangefs_request_list_lock);
  gossip_debug(GOSSIP_WAIT_DEBUG,