Message ID | 20210610091800.1833-5-christian.koenig@amd.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [1/7] dma-buf: some dma_fence_chain improvements | expand |
On Thu, Jun 10, 2021 at 11:17:57AM +0200, Christian König wrote: > Add some rather sophisticated lockless garbage collection > for dma_fence_chain objects. > > For this keep all initialized dma_fence_chain nodes an a > queue and trigger garbage collection before a new one is > allocated. > > Signed-off-by: Christian König <christian.koenig@amd.com> Uh hand-rolled lockless list, I'm not a fan. But the real question here is why? This is a global list, so it's going to look great on your desktop, but gpus are for servers now and those are NUMA. So just from that pov doing garbage-collection individually still feels like a much better idea. So what's the problem your trying to solve here? -Daniel > --- > drivers/dma-buf/dma-fence-chain.c | 160 +++++++++++++++++++++++++----- > include/linux/dma-fence-chain.h | 5 + > 2 files changed, 142 insertions(+), 23 deletions(-) > > diff --git a/drivers/dma-buf/dma-fence-chain.c b/drivers/dma-buf/dma-fence-chain.c > index 1b4cb3e5cec9..c2f0b69eabb7 100644 > --- a/drivers/dma-buf/dma-fence-chain.c > +++ b/drivers/dma-buf/dma-fence-chain.c > @@ -9,8 +9,53 @@ > > #include <linux/dma-fence-chain.h> > > +static struct dma_fence_chain __rcu *fifo_front; > +static struct dma_fence_chain __rcu **fifo_back = &fifo_front; > + > static bool dma_fence_chain_enable_signaling(struct dma_fence *fence); > > +/** > + * dma_fence_chain_enqueue - enqeue a chain node for garbage collection > + * @chain: the chain node to enqueue > + * > + * Add the chain node to the end of the gc fifo. 
> + */ > +static void dma_fence_chain_enqueue(struct dma_fence_chain *chain) > +{ > + struct dma_fence_chain __rcu **tmp; > + > + RCU_INIT_POINTER(chain->next, NULL); > + tmp = xchg((struct dma_fence_chain __force ***)&fifo_back, > + &chain->next); > + > + /* This is intentionally unordered since we only need the fifo for gc */ > + rcu_assign_pointer(*tmp, chain); > +} > + > +/** > + * dma_fence_chain_dequeue - deqeueue a chain node for garbage collection > + * > + * Remove the first chain node from the gc fifo while making sure that always > + * keep at least one node on the fifo for lockless fifo implementation. > + * Returns the dequeued chain node or NULL. > + */ > +static struct dma_fence_chain *dma_fence_chain_dequeue(void) > +{ > + struct dma_fence_chain *chain, *tmp; > + > + rcu_read_lock(); > + chain = rcu_dereference(fifo_front); > + /* Never dequeue the last chain node for lockless fifo */ > + if (unlikely(!chain || !rcu_access_pointer(chain->next))) { > + rcu_read_unlock(); > + return NULL; > + } > + tmp = cmpxchg((struct dma_fence_chain __force **)&fifo_front, > + chain, rcu_access_pointer(chain->next)); > + rcu_read_unlock(); > + return tmp == chain ? chain : NULL; > +} > + > /** > * dma_fence_chain_get_prev - use RCU to get a reference to the previous fence > * @chain: chain node to get the previous node from > @@ -28,6 +73,43 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) > return prev; > } > > +/** > + * dma_fence_chain_try_replace - try to replace the prev node > + * @chain: Chain node we try to replace prev. > + * @prev: the old prev node > + * > + * Try to replace the previous chain node when it or its containing fence is > + * signaled. Returns true if we tried, false if we need to wait. 
> + */ > +static bool dma_fence_chain_try_replace(struct dma_fence_chain *chain, > + struct dma_fence *prev) > +{ > + struct dma_fence *replacement, *tmp; > + struct dma_fence_chain *prev_chain; > + > + prev_chain = to_dma_fence_chain(prev); > + if (prev_chain) { > + if (!dma_fence_is_signaled(prev_chain->fence)) > + return false; > + > + replacement = dma_fence_chain_get_prev(prev_chain); > + } else { > + if (!dma_fence_is_signaled(prev)) > + return false; > + > + replacement = NULL; > + } > + > + tmp = cmpxchg((struct dma_fence __force **)&chain->prev, prev, > + replacement); > + if (tmp == prev) > + dma_fence_put(tmp); > + else > + dma_fence_put(replacement); > + > + return true; > +} > + > /** > * dma_fence_chain_walk - chain walking function > * @fence: current chain node > @@ -38,8 +120,8 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) > */ > struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) > { > - struct dma_fence_chain *chain, *prev_chain; > - struct dma_fence *prev, *replacement, *tmp; > + struct dma_fence_chain *chain; > + struct dma_fence *prev; > > chain = to_dma_fence_chain(fence); > if (!chain) { > @@ -48,26 +130,8 @@ struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) > } > > while ((prev = dma_fence_chain_get_prev(chain))) { > - > - prev_chain = to_dma_fence_chain(prev); > - if (prev_chain) { > - if (!dma_fence_is_signaled(prev_chain->fence)) > - break; > - > - replacement = dma_fence_chain_get_prev(prev_chain); > - } else { > - if (!dma_fence_is_signaled(prev)) > - break; > - > - replacement = NULL; > - } > - > - tmp = cmpxchg((struct dma_fence __force **)&chain->prev, > - prev, replacement); > - if (tmp == prev) > - dma_fence_put(tmp); > - else > - dma_fence_put(replacement); > + if (!dma_fence_chain_try_replace(chain, prev)) > + break; > dma_fence_put(prev); > } > > @@ -205,7 +269,21 @@ static void dma_fence_chain_release(struct dma_fence *fence) > dma_fence_put(prev); > > 
dma_fence_put(chain->fence); > - dma_fence_free(fence); > + WRITE_ONCE(chain->fence, NULL); > + > + /* > + * Don't garbage collect here to avoid recursion and potential stack > + * overflow. > + */ > + chain = dma_fence_chain_dequeue(); > + if (!chain) > + return; > + > + if (kref_read(&chain->base.refcount) || > + READ_ONCE(chain->fence)) > + dma_fence_chain_enqueue(chain); > + else > + dma_fence_free(&chain->base); > } > > const struct dma_fence_ops dma_fence_chain_ops = { > @@ -218,6 +296,40 @@ const struct dma_fence_ops dma_fence_chain_ops = { > }; > EXPORT_SYMBOL(dma_fence_chain_ops); > > +/** > + * dma_fence_chain_garbage_collect - cleanup chain nodes > + * > + * Do some garbage collection and try to release chain nodes. > + */ > +void dma_fence_chain_garbage_collect(void) > +{ > + struct dma_fence_chain *chain = dma_fence_chain_dequeue(); > + > + if (!chain) > + return; > + > + if (!dma_fence_get_rcu(&chain->base)) { > + /* Unused, check if it's also clean */ > + if (likely(!READ_ONCE(chain->fence))) { > + dma_fence_free(&chain->base); > + return; > + } > + > + } else { > + struct dma_fence *prev; > + > + /* Used, do some chain walk */ > + prev = dma_fence_chain_get_prev(chain); > + if (prev) { > + dma_fence_chain_try_replace(chain, prev); > + dma_fence_put(prev); > + } > + dma_fence_put(&chain->base); > + } > + dma_fence_chain_enqueue(chain); > +} > +EXPORT_SYMBOL(dma_fence_chain_garbage_collect); > + > /** > * dma_fence_chain_init - initialize a fence chain > * @chain: the chain node to initialize > @@ -254,5 +366,7 @@ void dma_fence_chain_init(struct dma_fence_chain *chain, > > dma_fence_init(&chain->base, &dma_fence_chain_ops, > &chain->lock, context, seqno); > + dma_fence_chain_enqueue(chain); > } > + > EXPORT_SYMBOL(dma_fence_chain_init); > diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h > index 5f45689a6d2e..b412b5396434 100644 > --- a/include/linux/dma-fence-chain.h > +++ b/include/linux/dma-fence-chain.h > @@ -19,6 
+19,7 @@ > * @base: fence base class > * @lock: spinlock for fence handling > * @prev: previous fence of the chain > + * @next: next chain node for garbage collection > * @prev_seqno: original previous seqno before garbage collection > * @fence: encapsulated fence > * @cb: callback structure for signaling > @@ -27,6 +28,7 @@ > struct dma_fence_chain { > struct dma_fence base; > struct dma_fence __rcu *prev; > + struct dma_fence_chain __rcu *next; > u64 prev_seqno; > struct dma_fence *fence; > union { > @@ -38,6 +40,8 @@ struct dma_fence_chain { > > extern const struct dma_fence_ops dma_fence_chain_ops; > > +void dma_fence_chain_garbage_collect(void); > + > /** > * to_dma_fence_chain - cast a fence to a dma_fence_chain > * @fence: fence to cast to a dma_fence_array > @@ -61,6 +65,7 @@ to_dma_fence_chain(struct dma_fence *fence) > */ > static inline struct dma_fence_chain *dma_fence_chain_alloc(void) > { > + dma_fence_chain_garbage_collect(); > return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL); > }; > > -- > 2.25.1 >
Am 11.06.21 um 10:58 schrieb Daniel Vetter: > On Thu, Jun 10, 2021 at 11:17:57AM +0200, Christian König wrote: >> Add some rather sophisticated lockless garbage collection >> for dma_fence_chain objects. >> >> For this keep all initialized dma_fence_chain nodes an a >> queue and trigger garbage collection before a new one is >> allocated. >> >> Signed-off-by: Christian König <christian.koenig@amd.com> > Uh hand-rolled lockless list, I'm not a fan. > > But the real question here is why? This is a global list, so it's going to > look great on your desktop, but gpus are for servers now and those are > NUMA. So just from that pov doing garbage-collection individually still > feels like a much better idea. Yeah, I was pondering on that quite a bit as well. That why I added the multi threaded alloc/free test. > So what's the problem your trying to solve here? I was not sure if the chain is garbage collected enough when used for the dma_resv exclusive object. But I think we could just drop this for now and just see how it goes. Christian. > -Daniel > >> --- >> drivers/dma-buf/dma-fence-chain.c | 160 +++++++++++++++++++++++++----- >> include/linux/dma-fence-chain.h | 5 + >> 2 files changed, 142 insertions(+), 23 deletions(-) >> >> diff --git a/drivers/dma-buf/dma-fence-chain.c b/drivers/dma-buf/dma-fence-chain.c >> index 1b4cb3e5cec9..c2f0b69eabb7 100644 >> --- a/drivers/dma-buf/dma-fence-chain.c >> +++ b/drivers/dma-buf/dma-fence-chain.c >> @@ -9,8 +9,53 @@ >> >> #include <linux/dma-fence-chain.h> >> >> +static struct dma_fence_chain __rcu *fifo_front; >> +static struct dma_fence_chain __rcu **fifo_back = &fifo_front; >> + >> static bool dma_fence_chain_enable_signaling(struct dma_fence *fence); >> >> +/** >> + * dma_fence_chain_enqueue - enqeue a chain node for garbage collection >> + * @chain: the chain node to enqueue >> + * >> + * Add the chain node to the end of the gc fifo. 
>> + */ >> +static void dma_fence_chain_enqueue(struct dma_fence_chain *chain) >> +{ >> + struct dma_fence_chain __rcu **tmp; >> + >> + RCU_INIT_POINTER(chain->next, NULL); >> + tmp = xchg((struct dma_fence_chain __force ***)&fifo_back, >> + &chain->next); >> + >> + /* This is intentionally unordered since we only need the fifo for gc */ >> + rcu_assign_pointer(*tmp, chain); >> +} >> + >> +/** >> + * dma_fence_chain_dequeue - deqeueue a chain node for garbage collection >> + * >> + * Remove the first chain node from the gc fifo while making sure that always >> + * keep at least one node on the fifo for lockless fifo implementation. >> + * Returns the dequeued chain node or NULL. >> + */ >> +static struct dma_fence_chain *dma_fence_chain_dequeue(void) >> +{ >> + struct dma_fence_chain *chain, *tmp; >> + >> + rcu_read_lock(); >> + chain = rcu_dereference(fifo_front); >> + /* Never dequeue the last chain node for lockless fifo */ >> + if (unlikely(!chain || !rcu_access_pointer(chain->next))) { >> + rcu_read_unlock(); >> + return NULL; >> + } >> + tmp = cmpxchg((struct dma_fence_chain __force **)&fifo_front, >> + chain, rcu_access_pointer(chain->next)); >> + rcu_read_unlock(); >> + return tmp == chain ? chain : NULL; >> +} >> + >> /** >> * dma_fence_chain_get_prev - use RCU to get a reference to the previous fence >> * @chain: chain node to get the previous node from >> @@ -28,6 +73,43 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) >> return prev; >> } >> >> +/** >> + * dma_fence_chain_try_replace - try to replace the prev node >> + * @chain: Chain node we try to replace prev. >> + * @prev: the old prev node >> + * >> + * Try to replace the previous chain node when it or its containing fence is >> + * signaled. Returns true if we tried, false if we need to wait. 
>> + */ >> +static bool dma_fence_chain_try_replace(struct dma_fence_chain *chain, >> + struct dma_fence *prev) >> +{ >> + struct dma_fence *replacement, *tmp; >> + struct dma_fence_chain *prev_chain; >> + >> + prev_chain = to_dma_fence_chain(prev); >> + if (prev_chain) { >> + if (!dma_fence_is_signaled(prev_chain->fence)) >> + return false; >> + >> + replacement = dma_fence_chain_get_prev(prev_chain); >> + } else { >> + if (!dma_fence_is_signaled(prev)) >> + return false; >> + >> + replacement = NULL; >> + } >> + >> + tmp = cmpxchg((struct dma_fence __force **)&chain->prev, prev, >> + replacement); >> + if (tmp == prev) >> + dma_fence_put(tmp); >> + else >> + dma_fence_put(replacement); >> + >> + return true; >> +} >> + >> /** >> * dma_fence_chain_walk - chain walking function >> * @fence: current chain node >> @@ -38,8 +120,8 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) >> */ >> struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) >> { >> - struct dma_fence_chain *chain, *prev_chain; >> - struct dma_fence *prev, *replacement, *tmp; >> + struct dma_fence_chain *chain; >> + struct dma_fence *prev; >> >> chain = to_dma_fence_chain(fence); >> if (!chain) { >> @@ -48,26 +130,8 @@ struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) >> } >> >> while ((prev = dma_fence_chain_get_prev(chain))) { >> - >> - prev_chain = to_dma_fence_chain(prev); >> - if (prev_chain) { >> - if (!dma_fence_is_signaled(prev_chain->fence)) >> - break; >> - >> - replacement = dma_fence_chain_get_prev(prev_chain); >> - } else { >> - if (!dma_fence_is_signaled(prev)) >> - break; >> - >> - replacement = NULL; >> - } >> - >> - tmp = cmpxchg((struct dma_fence __force **)&chain->prev, >> - prev, replacement); >> - if (tmp == prev) >> - dma_fence_put(tmp); >> - else >> - dma_fence_put(replacement); >> + if (!dma_fence_chain_try_replace(chain, prev)) >> + break; >> dma_fence_put(prev); >> } >> >> @@ -205,7 +269,21 @@ static void 
dma_fence_chain_release(struct dma_fence *fence) >> dma_fence_put(prev); >> >> dma_fence_put(chain->fence); >> - dma_fence_free(fence); >> + WRITE_ONCE(chain->fence, NULL); >> + >> + /* >> + * Don't garbage collect here to avoid recursion and potential stack >> + * overflow. >> + */ >> + chain = dma_fence_chain_dequeue(); >> + if (!chain) >> + return; >> + >> + if (kref_read(&chain->base.refcount) || >> + READ_ONCE(chain->fence)) >> + dma_fence_chain_enqueue(chain); >> + else >> + dma_fence_free(&chain->base); >> } >> >> const struct dma_fence_ops dma_fence_chain_ops = { >> @@ -218,6 +296,40 @@ const struct dma_fence_ops dma_fence_chain_ops = { >> }; >> EXPORT_SYMBOL(dma_fence_chain_ops); >> >> +/** >> + * dma_fence_chain_garbage_collect - cleanup chain nodes >> + * >> + * Do some garbage collection and try to release chain nodes. >> + */ >> +void dma_fence_chain_garbage_collect(void) >> +{ >> + struct dma_fence_chain *chain = dma_fence_chain_dequeue(); >> + >> + if (!chain) >> + return; >> + >> + if (!dma_fence_get_rcu(&chain->base)) { >> + /* Unused, check if it's also clean */ >> + if (likely(!READ_ONCE(chain->fence))) { >> + dma_fence_free(&chain->base); >> + return; >> + } >> + >> + } else { >> + struct dma_fence *prev; >> + >> + /* Used, do some chain walk */ >> + prev = dma_fence_chain_get_prev(chain); >> + if (prev) { >> + dma_fence_chain_try_replace(chain, prev); >> + dma_fence_put(prev); >> + } >> + dma_fence_put(&chain->base); >> + } >> + dma_fence_chain_enqueue(chain); >> +} >> +EXPORT_SYMBOL(dma_fence_chain_garbage_collect); >> + >> /** >> * dma_fence_chain_init - initialize a fence chain >> * @chain: the chain node to initialize >> @@ -254,5 +366,7 @@ void dma_fence_chain_init(struct dma_fence_chain *chain, >> >> dma_fence_init(&chain->base, &dma_fence_chain_ops, >> &chain->lock, context, seqno); >> + dma_fence_chain_enqueue(chain); >> } >> + >> EXPORT_SYMBOL(dma_fence_chain_init); >> diff --git a/include/linux/dma-fence-chain.h 
b/include/linux/dma-fence-chain.h >> index 5f45689a6d2e..b412b5396434 100644 >> --- a/include/linux/dma-fence-chain.h >> +++ b/include/linux/dma-fence-chain.h >> @@ -19,6 +19,7 @@ >> * @base: fence base class >> * @lock: spinlock for fence handling >> * @prev: previous fence of the chain >> + * @next: next chain node for garbage collection >> * @prev_seqno: original previous seqno before garbage collection >> * @fence: encapsulated fence >> * @cb: callback structure for signaling >> @@ -27,6 +28,7 @@ >> struct dma_fence_chain { >> struct dma_fence base; >> struct dma_fence __rcu *prev; >> + struct dma_fence_chain __rcu *next; >> u64 prev_seqno; >> struct dma_fence *fence; >> union { >> @@ -38,6 +40,8 @@ struct dma_fence_chain { >> >> extern const struct dma_fence_ops dma_fence_chain_ops; >> >> +void dma_fence_chain_garbage_collect(void); >> + >> /** >> * to_dma_fence_chain - cast a fence to a dma_fence_chain >> * @fence: fence to cast to a dma_fence_array >> @@ -61,6 +65,7 @@ to_dma_fence_chain(struct dma_fence *fence) >> */ >> static inline struct dma_fence_chain *dma_fence_chain_alloc(void) >> { >> + dma_fence_chain_garbage_collect(); >> return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL); >> }; >> >> -- >> 2.25.1 >>
On Fri, Jun 11, 2021 at 12:07:00PM +0200, Christian König wrote: > Am 11.06.21 um 10:58 schrieb Daniel Vetter: > > On Thu, Jun 10, 2021 at 11:17:57AM +0200, Christian König wrote: > > > Add some rather sophisticated lockless garbage collection > > > for dma_fence_chain objects. > > > > > > For this keep all initialized dma_fence_chain nodes an a > > > queue and trigger garbage collection before a new one is > > > allocated. > > > > > > Signed-off-by: Christian König <christian.koenig@amd.com> > > Uh hand-rolled lockless list, I'm not a fan. > > > > But the real question here is why? This is a global list, so it's going to > > look great on your desktop, but gpus are for servers now and those are > > NUMA. So just from that pov doing garbage-collection individually still > > feels like a much better idea. > > Yeah, I was pondering on that quite a bit as well. That why I added the > multi threaded alloc/free test. > > > So what's the problem your trying to solve here? > > I was not sure if the chain is garbage collected enough when used for the > dma_resv exclusive object. > > But I think we could just drop this for now and just see how it goes. Yeah I think fixing a perf/tuning problem when we're hitting it is much better than trying to guess what it might be and writing something really clever to anticipate that. We generally get it wrong. Also with the explaination of what it tests added the testcase makes sense. Just some documentation or comment would have helped. -Daniel > > Christian. 
> > > -Daniel > > > > > --- > > > drivers/dma-buf/dma-fence-chain.c | 160 +++++++++++++++++++++++++----- > > > include/linux/dma-fence-chain.h | 5 + > > > 2 files changed, 142 insertions(+), 23 deletions(-) > > > > > > diff --git a/drivers/dma-buf/dma-fence-chain.c b/drivers/dma-buf/dma-fence-chain.c > > > index 1b4cb3e5cec9..c2f0b69eabb7 100644 > > > --- a/drivers/dma-buf/dma-fence-chain.c > > > +++ b/drivers/dma-buf/dma-fence-chain.c > > > @@ -9,8 +9,53 @@ > > > #include <linux/dma-fence-chain.h> > > > +static struct dma_fence_chain __rcu *fifo_front; > > > +static struct dma_fence_chain __rcu **fifo_back = &fifo_front; > > > + > > > static bool dma_fence_chain_enable_signaling(struct dma_fence *fence); > > > +/** > > > + * dma_fence_chain_enqueue - enqeue a chain node for garbage collection > > > + * @chain: the chain node to enqueue > > > + * > > > + * Add the chain node to the end of the gc fifo. > > > + */ > > > +static void dma_fence_chain_enqueue(struct dma_fence_chain *chain) > > > +{ > > > + struct dma_fence_chain __rcu **tmp; > > > + > > > + RCU_INIT_POINTER(chain->next, NULL); > > > + tmp = xchg((struct dma_fence_chain __force ***)&fifo_back, > > > + &chain->next); > > > + > > > + /* This is intentionally unordered since we only need the fifo for gc */ > > > + rcu_assign_pointer(*tmp, chain); > > > +} > > > + > > > +/** > > > + * dma_fence_chain_dequeue - deqeueue a chain node for garbage collection > > > + * > > > + * Remove the first chain node from the gc fifo while making sure that always > > > + * keep at least one node on the fifo for lockless fifo implementation. > > > + * Returns the dequeued chain node or NULL. 
> > > + */ > > > +static struct dma_fence_chain *dma_fence_chain_dequeue(void) > > > +{ > > > + struct dma_fence_chain *chain, *tmp; > > > + > > > + rcu_read_lock(); > > > + chain = rcu_dereference(fifo_front); > > > + /* Never dequeue the last chain node for lockless fifo */ > > > + if (unlikely(!chain || !rcu_access_pointer(chain->next))) { > > > + rcu_read_unlock(); > > > + return NULL; > > > + } > > > + tmp = cmpxchg((struct dma_fence_chain __force **)&fifo_front, > > > + chain, rcu_access_pointer(chain->next)); > > > + rcu_read_unlock(); > > > + return tmp == chain ? chain : NULL; > > > +} > > > + > > > /** > > > * dma_fence_chain_get_prev - use RCU to get a reference to the previous fence > > > * @chain: chain node to get the previous node from > > > @@ -28,6 +73,43 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) > > > return prev; > > > } > > > +/** > > > + * dma_fence_chain_try_replace - try to replace the prev node > > > + * @chain: Chain node we try to replace prev. > > > + * @prev: the old prev node > > > + * > > > + * Try to replace the previous chain node when it or its containing fence is > > > + * signaled. Returns true if we tried, false if we need to wait. 
> > > + */ > > > +static bool dma_fence_chain_try_replace(struct dma_fence_chain *chain, > > > + struct dma_fence *prev) > > > +{ > > > + struct dma_fence *replacement, *tmp; > > > + struct dma_fence_chain *prev_chain; > > > + > > > + prev_chain = to_dma_fence_chain(prev); > > > + if (prev_chain) { > > > + if (!dma_fence_is_signaled(prev_chain->fence)) > > > + return false; > > > + > > > + replacement = dma_fence_chain_get_prev(prev_chain); > > > + } else { > > > + if (!dma_fence_is_signaled(prev)) > > > + return false; > > > + > > > + replacement = NULL; > > > + } > > > + > > > + tmp = cmpxchg((struct dma_fence __force **)&chain->prev, prev, > > > + replacement); > > > + if (tmp == prev) > > > + dma_fence_put(tmp); > > > + else > > > + dma_fence_put(replacement); > > > + > > > + return true; > > > +} > > > + > > > /** > > > * dma_fence_chain_walk - chain walking function > > > * @fence: current chain node > > > @@ -38,8 +120,8 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) > > > */ > > > struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) > > > { > > > - struct dma_fence_chain *chain, *prev_chain; > > > - struct dma_fence *prev, *replacement, *tmp; > > > + struct dma_fence_chain *chain; > > > + struct dma_fence *prev; > > > chain = to_dma_fence_chain(fence); > > > if (!chain) { > > > @@ -48,26 +130,8 @@ struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) > > > } > > > while ((prev = dma_fence_chain_get_prev(chain))) { > > > - > > > - prev_chain = to_dma_fence_chain(prev); > > > - if (prev_chain) { > > > - if (!dma_fence_is_signaled(prev_chain->fence)) > > > - break; > > > - > > > - replacement = dma_fence_chain_get_prev(prev_chain); > > > - } else { > > > - if (!dma_fence_is_signaled(prev)) > > > - break; > > > - > > > - replacement = NULL; > > > - } > > > - > > > - tmp = cmpxchg((struct dma_fence __force **)&chain->prev, > > > - prev, replacement); > > > - if (tmp == prev) > > > - dma_fence_put(tmp); > 
> > - else > > > - dma_fence_put(replacement); > > > + if (!dma_fence_chain_try_replace(chain, prev)) > > > + break; > > > dma_fence_put(prev); > > > } > > > @@ -205,7 +269,21 @@ static void dma_fence_chain_release(struct dma_fence *fence) > > > dma_fence_put(prev); > > > dma_fence_put(chain->fence); > > > - dma_fence_free(fence); > > > + WRITE_ONCE(chain->fence, NULL); > > > + > > > + /* > > > + * Don't garbage collect here to avoid recursion and potential stack > > > + * overflow. > > > + */ > > > + chain = dma_fence_chain_dequeue(); > > > + if (!chain) > > > + return; > > > + > > > + if (kref_read(&chain->base.refcount) || > > > + READ_ONCE(chain->fence)) > > > + dma_fence_chain_enqueue(chain); > > > + else > > > + dma_fence_free(&chain->base); > > > } > > > const struct dma_fence_ops dma_fence_chain_ops = { > > > @@ -218,6 +296,40 @@ const struct dma_fence_ops dma_fence_chain_ops = { > > > }; > > > EXPORT_SYMBOL(dma_fence_chain_ops); > > > +/** > > > + * dma_fence_chain_garbage_collect - cleanup chain nodes > > > + * > > > + * Do some garbage collection and try to release chain nodes. 
> > > + */ > > > +void dma_fence_chain_garbage_collect(void) > > > +{ > > > + struct dma_fence_chain *chain = dma_fence_chain_dequeue(); > > > + > > > + if (!chain) > > > + return; > > > + > > > + if (!dma_fence_get_rcu(&chain->base)) { > > > + /* Unused, check if it's also clean */ > > > + if (likely(!READ_ONCE(chain->fence))) { > > > + dma_fence_free(&chain->base); > > > + return; > > > + } > > > + > > > + } else { > > > + struct dma_fence *prev; > > > + > > > + /* Used, do some chain walk */ > > > + prev = dma_fence_chain_get_prev(chain); > > > + if (prev) { > > > + dma_fence_chain_try_replace(chain, prev); > > > + dma_fence_put(prev); > > > + } > > > + dma_fence_put(&chain->base); > > > + } > > > + dma_fence_chain_enqueue(chain); > > > +} > > > +EXPORT_SYMBOL(dma_fence_chain_garbage_collect); > > > + > > > /** > > > * dma_fence_chain_init - initialize a fence chain > > > * @chain: the chain node to initialize > > > @@ -254,5 +366,7 @@ void dma_fence_chain_init(struct dma_fence_chain *chain, > > > dma_fence_init(&chain->base, &dma_fence_chain_ops, > > > &chain->lock, context, seqno); > > > + dma_fence_chain_enqueue(chain); > > > } > > > + > > > EXPORT_SYMBOL(dma_fence_chain_init); > > > diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h > > > index 5f45689a6d2e..b412b5396434 100644 > > > --- a/include/linux/dma-fence-chain.h > > > +++ b/include/linux/dma-fence-chain.h > > > @@ -19,6 +19,7 @@ > > > * @base: fence base class > > > * @lock: spinlock for fence handling > > > * @prev: previous fence of the chain > > > + * @next: next chain node for garbage collection > > > * @prev_seqno: original previous seqno before garbage collection > > > * @fence: encapsulated fence > > > * @cb: callback structure for signaling > > > @@ -27,6 +28,7 @@ > > > struct dma_fence_chain { > > > struct dma_fence base; > > > struct dma_fence __rcu *prev; > > > + struct dma_fence_chain __rcu *next; > > > u64 prev_seqno; > > > struct dma_fence *fence; > > > 
union { > > > @@ -38,6 +40,8 @@ struct dma_fence_chain { > > > extern const struct dma_fence_ops dma_fence_chain_ops; > > > +void dma_fence_chain_garbage_collect(void); > > > + > > > /** > > > * to_dma_fence_chain - cast a fence to a dma_fence_chain > > > * @fence: fence to cast to a dma_fence_array > > > @@ -61,6 +65,7 @@ to_dma_fence_chain(struct dma_fence *fence) > > > */ > > > static inline struct dma_fence_chain *dma_fence_chain_alloc(void) > > > { > > > + dma_fence_chain_garbage_collect(); > > > return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL); > > > }; > > > -- > > > 2.25.1 > > > >
Hi "Christian, I love your patch! Perhaps something to improve: [auto build test WARNING on drm-intel/for-linux-next] [also build test WARNING on drm-exynos/exynos-drm-next tegra-drm/drm/tegra/for-next linus/master v5.13-rc6 next-20210616] [cannot apply to drm-tip/drm-tip drm/drm-next] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Christian-K-nig/dma-buf-some-dma_fence_chain-improvements/20210616-182806 base: git://anongit.freedesktop.org/drm-intel for-linux-next config: alpha-randconfig-s031-20210615 (attached as .config) compiler: alpha-linux-gcc (GCC) 9.3.0 reproduce: wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # apt-get install sparse # sparse version: v0.6.3-341-g8af24329-dirty # https://github.com/0day-ci/linux/commit/058b99a2929f84c7f2cb1516dcc8674bf3f2a661 git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Christian-K-nig/dma-buf-some-dma_fence_chain-improvements/20210616-182806 git checkout 058b99a2929f84c7f2cb1516dcc8674bf3f2a661 # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' W=1 ARCH=alpha If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot <lkp@intel.com> sparse warnings: (new ones prefixed by >>) >> drivers/dma-buf/dma-fence-chain.c:28:15: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct dma_fence_chain **_x_ @@ got struct dma_fence_chain [noderef] __rcu ** @@ drivers/dma-buf/dma-fence-chain.c:28:15: sparse: expected struct dma_fence_chain **_x_ drivers/dma-buf/dma-fence-chain.c:28:15: sparse: got struct dma_fence_chain [noderef] __rcu ** >> 
drivers/dma-buf/dma-fence-chain.c:28:13: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct dma_fence_chain [noderef] __rcu **tmp @@ got struct dma_fence_chain **[assigned] __ret @@ drivers/dma-buf/dma-fence-chain.c:28:13: sparse: expected struct dma_fence_chain [noderef] __rcu **tmp drivers/dma-buf/dma-fence-chain.c:28:13: sparse: got struct dma_fence_chain **[assigned] __ret vim +28 drivers/dma-buf/dma-fence-chain.c 16 17 /** 18 * dma_fence_chain_enqueue - enqeue a chain node for garbage collection 19 * @chain: the chain node to enqueue 20 * 21 * Add the chain node to the end of the gc fifo. 22 */ 23 static void dma_fence_chain_enqueue(struct dma_fence_chain *chain) 24 { 25 struct dma_fence_chain __rcu **tmp; 26 27 RCU_INIT_POINTER(chain->next, NULL); > 28 tmp = xchg((struct dma_fence_chain __force ***)&fifo_back, 29 &chain->next); 30 31 /* This is intentionally unordered since we only need the fifo for gc */ 32 rcu_assign_pointer(*tmp, chain); 33 } 34 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
Hi "Christian, I love your patch! Perhaps something to improve: [auto build test WARNING on drm-intel/for-linux-next] [also build test WARNING on drm-exynos/exynos-drm-next tegra-drm/drm/tegra/for-next linus/master v5.13-rc6 next-20210616] [cannot apply to drm-tip/drm-tip drm/drm-next] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Christian-K-nig/dma-buf-some-dma_fence_chain-improvements/20210616-182806 base: git://anongit.freedesktop.org/drm-intel for-linux-next config: i386-randconfig-s001-20210615 (attached as .config) compiler: gcc-9 (Debian 9.3.0-22) 9.3.0 reproduce: # apt-get install sparse # sparse version: v0.6.3-341-g8af24329-dirty # https://github.com/0day-ci/linux/commit/058b99a2929f84c7f2cb1516dcc8674bf3f2a661 git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Christian-K-nig/dma-buf-some-dma_fence_chain-improvements/20210616-182806 git checkout 058b99a2929f84c7f2cb1516dcc8674bf3f2a661 # save the attached .config to linux build tree make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' W=1 ARCH=i386 If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot <lkp@intel.com> sparse warnings: (new ones prefixed by >>) >> drivers/dma-buf/dma-fence-chain.c:28:15: sparse: sparse: incorrect type in initializer (different address spaces) @@ expected struct dma_fence_chain **__ret @@ got struct dma_fence_chain [noderef] __rcu ** @@ drivers/dma-buf/dma-fence-chain.c:28:15: sparse: expected struct dma_fence_chain **__ret drivers/dma-buf/dma-fence-chain.c:28:15: sparse: got struct dma_fence_chain [noderef] __rcu ** drivers/dma-buf/dma-fence-chain.c:28:13: sparse: sparse: incorrect type in assignment (different address spaces) @@ expected struct dma_fence_chain [noderef] __rcu **tmp @@ got struct 
dma_fence_chain **[assigned] __ret @@ drivers/dma-buf/dma-fence-chain.c:28:13: sparse: expected struct dma_fence_chain [noderef] __rcu **tmp drivers/dma-buf/dma-fence-chain.c:28:13: sparse: got struct dma_fence_chain **[assigned] __ret vim +28 drivers/dma-buf/dma-fence-chain.c 16 17 /** 18 * dma_fence_chain_enqueue - enqeue a chain node for garbage collection 19 * @chain: the chain node to enqueue 20 * 21 * Add the chain node to the end of the gc fifo. 22 */ 23 static void dma_fence_chain_enqueue(struct dma_fence_chain *chain) 24 { 25 struct dma_fence_chain __rcu **tmp; 26 27 RCU_INIT_POINTER(chain->next, NULL); > 28 tmp = xchg((struct dma_fence_chain __force ***)&fifo_back, 29 &chain->next); 30 31 /* This is intentionally unordered since we only need the fifo for gc */ 32 rcu_assign_pointer(*tmp, chain); 33 } 34 --- 0-DAY CI Kernel Test Service, Intel Corporation https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
diff --git a/drivers/dma-buf/dma-fence-chain.c b/drivers/dma-buf/dma-fence-chain.c index 1b4cb3e5cec9..c2f0b69eabb7 100644 --- a/drivers/dma-buf/dma-fence-chain.c +++ b/drivers/dma-buf/dma-fence-chain.c @@ -9,8 +9,53 @@ #include <linux/dma-fence-chain.h> +static struct dma_fence_chain __rcu *fifo_front; +static struct dma_fence_chain __rcu **fifo_back = &fifo_front; + static bool dma_fence_chain_enable_signaling(struct dma_fence *fence); +/** + * dma_fence_chain_enqueue - enqeue a chain node for garbage collection + * @chain: the chain node to enqueue + * + * Add the chain node to the end of the gc fifo. + */ +static void dma_fence_chain_enqueue(struct dma_fence_chain *chain) +{ + struct dma_fence_chain __rcu **tmp; + + RCU_INIT_POINTER(chain->next, NULL); + tmp = xchg((struct dma_fence_chain __force ***)&fifo_back, + &chain->next); + + /* This is intentionally unordered since we only need the fifo for gc */ + rcu_assign_pointer(*tmp, chain); +} + +/** + * dma_fence_chain_dequeue - deqeueue a chain node for garbage collection + * + * Remove the first chain node from the gc fifo while making sure that always + * keep at least one node on the fifo for lockless fifo implementation. + * Returns the dequeued chain node or NULL. + */ +static struct dma_fence_chain *dma_fence_chain_dequeue(void) +{ + struct dma_fence_chain *chain, *tmp; + + rcu_read_lock(); + chain = rcu_dereference(fifo_front); + /* Never dequeue the last chain node for lockless fifo */ + if (unlikely(!chain || !rcu_access_pointer(chain->next))) { + rcu_read_unlock(); + return NULL; + } + tmp = cmpxchg((struct dma_fence_chain __force **)&fifo_front, + chain, rcu_access_pointer(chain->next)); + rcu_read_unlock(); + return tmp == chain ? 
chain : NULL; +} + /** * dma_fence_chain_get_prev - use RCU to get a reference to the previous fence * @chain: chain node to get the previous node from @@ -28,6 +73,43 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) return prev; } +/** + * dma_fence_chain_try_replace - try to replace the prev node + * @chain: Chain node we try to replace prev. + * @prev: the old prev node + * + * Try to replace the previous chain node when it or its containing fence is + * signaled. Returns true if we tried, false if we need to wait. + */ +static bool dma_fence_chain_try_replace(struct dma_fence_chain *chain, + struct dma_fence *prev) +{ + struct dma_fence *replacement, *tmp; + struct dma_fence_chain *prev_chain; + + prev_chain = to_dma_fence_chain(prev); + if (prev_chain) { + if (!dma_fence_is_signaled(prev_chain->fence)) + return false; + + replacement = dma_fence_chain_get_prev(prev_chain); + } else { + if (!dma_fence_is_signaled(prev)) + return false; + + replacement = NULL; + } + + tmp = cmpxchg((struct dma_fence __force **)&chain->prev, prev, + replacement); + if (tmp == prev) + dma_fence_put(tmp); + else + dma_fence_put(replacement); + + return true; +} + /** * dma_fence_chain_walk - chain walking function * @fence: current chain node @@ -38,8 +120,8 @@ static struct dma_fence *dma_fence_chain_get_prev(struct dma_fence_chain *chain) */ struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) { - struct dma_fence_chain *chain, *prev_chain; - struct dma_fence *prev, *replacement, *tmp; + struct dma_fence_chain *chain; + struct dma_fence *prev; chain = to_dma_fence_chain(fence); if (!chain) { @@ -48,26 +130,8 @@ struct dma_fence *dma_fence_chain_walk(struct dma_fence *fence) } while ((prev = dma_fence_chain_get_prev(chain))) { - - prev_chain = to_dma_fence_chain(prev); - if (prev_chain) { - if (!dma_fence_is_signaled(prev_chain->fence)) - break; - - replacement = dma_fence_chain_get_prev(prev_chain); - } else { - if 
(!dma_fence_is_signaled(prev)) - break; - - replacement = NULL; - } - - tmp = cmpxchg((struct dma_fence __force **)&chain->prev, - prev, replacement); - if (tmp == prev) - dma_fence_put(tmp); - else - dma_fence_put(replacement); + if (!dma_fence_chain_try_replace(chain, prev)) + break; dma_fence_put(prev); } @@ -205,7 +269,21 @@ static void dma_fence_chain_release(struct dma_fence *fence) dma_fence_put(prev); dma_fence_put(chain->fence); - dma_fence_free(fence); + WRITE_ONCE(chain->fence, NULL); + + /* + * Don't garbage collect here to avoid recursion and potential stack + * overflow. + */ + chain = dma_fence_chain_dequeue(); + if (!chain) + return; + + if (kref_read(&chain->base.refcount) || + READ_ONCE(chain->fence)) + dma_fence_chain_enqueue(chain); + else + dma_fence_free(&chain->base); } const struct dma_fence_ops dma_fence_chain_ops = { @@ -218,6 +296,40 @@ const struct dma_fence_ops dma_fence_chain_ops = { }; EXPORT_SYMBOL(dma_fence_chain_ops); +/** + * dma_fence_chain_garbage_collect - cleanup chain nodes + * + * Do some garbage collection and try to release chain nodes. 
+ */ +void dma_fence_chain_garbage_collect(void) +{ + struct dma_fence_chain *chain = dma_fence_chain_dequeue(); + + if (!chain) + return; + + if (!dma_fence_get_rcu(&chain->base)) { + /* Unused, check if it's also clean */ + if (likely(!READ_ONCE(chain->fence))) { + dma_fence_free(&chain->base); + return; + } + + } else { + struct dma_fence *prev; + + /* Used, do some chain walk */ + prev = dma_fence_chain_get_prev(chain); + if (prev) { + dma_fence_chain_try_replace(chain, prev); + dma_fence_put(prev); + } + dma_fence_put(&chain->base); + } + dma_fence_chain_enqueue(chain); +} +EXPORT_SYMBOL(dma_fence_chain_garbage_collect); + /** * dma_fence_chain_init - initialize a fence chain * @chain: the chain node to initialize @@ -254,5 +366,7 @@ void dma_fence_chain_init(struct dma_fence_chain *chain, dma_fence_init(&chain->base, &dma_fence_chain_ops, &chain->lock, context, seqno); + dma_fence_chain_enqueue(chain); } + EXPORT_SYMBOL(dma_fence_chain_init); diff --git a/include/linux/dma-fence-chain.h b/include/linux/dma-fence-chain.h index 5f45689a6d2e..b412b5396434 100644 --- a/include/linux/dma-fence-chain.h +++ b/include/linux/dma-fence-chain.h @@ -19,6 +19,7 @@ * @base: fence base class * @lock: spinlock for fence handling * @prev: previous fence of the chain + * @next: next chain node for garbage collection * @prev_seqno: original previous seqno before garbage collection * @fence: encapsulated fence * @cb: callback structure for signaling @@ -27,6 +28,7 @@ struct dma_fence_chain { struct dma_fence base; struct dma_fence __rcu *prev; + struct dma_fence_chain __rcu *next; u64 prev_seqno; struct dma_fence *fence; union { @@ -38,6 +40,8 @@ struct dma_fence_chain { extern const struct dma_fence_ops dma_fence_chain_ops; +void dma_fence_chain_garbage_collect(void); + /** * to_dma_fence_chain - cast a fence to a dma_fence_chain * @fence: fence to cast to a dma_fence_array @@ -61,6 +65,7 @@ to_dma_fence_chain(struct dma_fence *fence) */ static inline struct dma_fence_chain 
*dma_fence_chain_alloc(void) { + dma_fence_chain_garbage_collect(); return kmalloc(sizeof(struct dma_fence_chain), GFP_KERNEL); };
Add some rather sophisticated lockless garbage collection for dma_fence_chain objects. For this keep all initialized dma_fence_chain nodes on a queue and trigger garbage collection before a new one is allocated. Signed-off-by: Christian König <christian.koenig@amd.com> --- drivers/dma-buf/dma-fence-chain.c | 160 +++++++++++++++++++++++++----- include/linux/dma-fence-chain.h | 5 + 2 files changed, 142 insertions(+), 23 deletions(-)