diff mbox

question about request merge

Message ID tencent_4B199E7C402C9C6F6664BDE6@qq.com (mailing list archive)
State New, archived
Headers show

Commit Message

Zhengyuan Liu April 20, 2018, 3:51 a.m. UTC
Hi, Shaohua

I found it indeed doesn't do a front merge when two threads flush their plug lists concurrently. To
reproduce this, I prepared two IO threads, named a0.io and a1.io.
Thread a1.io  uses libaio to write 5 requests : 
    sectors: 16 + 8, 40 + 8, 64 + 8, 88 + 8, 112 + 8
Thread a0.io  uses libaio to write other 5 requests : 
    sectors: 8+ 8, 32 + 8, 56 + 8, 80 + 8, 104 + 8

To meet the condition that thread a1.io flushes all its requests to the queue before thread a0.io
does, I delayed thread a1.io from running the queue until after it had added all its plugged requests to the
queue, as shown in the patch below:


With the patch I restarted "./a1.io & sleep 1 && ./a0.io", and the blktrace result is shown below:

      8,16   2      249   471.608930240  1473  Q  WS 16 + 8 [a1.io]
      8,16   2      250   471.608934940  1473  G  WS 16 + 8 [a1.io]
      8,16   2      252   471.608940680  1473  Q  WS 40 + 8 [a1.io]
      8,16   2      253   471.608942840  1473  G  WS 40 + 8 [a1.io]
      8,16   2      254   471.608946500  1473  Q  WS 64 + 8 [a1.io]
      8,16   2      255   471.608948040  1473  G  WS 64 + 8 [a1.io]
      8,16   2      256   471.608951460  1473  Q  WS 88 + 8 [a1.io]
      8,16   2      257   471.608952960  1473  G  WS 88 + 8 [a1.io]
      8,16   2      258   471.608956340  1473  Q  WS 112 + 8 [a1.io]
      8,16   2      259   471.608957780  1473  G  WS 112 + 8 [a1.io]
      8,16   2      260   471.608959880  1473  I  WS 16 + 8 [a1.io]
      8,16   2      261   471.608975880  1473  I  WS 40 + 8 [a1.io]
      8,16   2      262   471.608986420  1473  I  WS 64 + 8 [a1.io]
      8,16   2      263   471.608995480  1473  I  WS 88 + 8 [a1.io]
      8,16   2      264   471.609003720  1473  I  WS 112 + 8 [a1.io]
      8,16   3      267   471.610200680  1474  Q  WS 8 + 8 [a0.io]
      8,16   3      268   471.610204700  1474  G  WS 8 + 8 [a0.io]
      8,16   3      270   471.610210280  1474  Q  WS 32 + 8 [a0.io]
      8,16   3      271   471.610211840  1474  G  WS 32 + 8 [a0.io]
      8,16   3      272   471.610215660  1474  Q  WS 56 + 8 [a0.io]
      8,16   3      273   471.610217200  1474  G  WS 56 + 8 [a0.io]
      8,16   3      274   471.610220860  1474  Q  WS 80 + 8 [a0.io]
      8,16   3      275   471.610222300  1474  G  WS 80 + 8 [a0.io]
      8,16   3      276   471.610225720  1474  Q  WS 104 + 8 [a0.io]
      8,16   3      277   471.610227540  1474  G  WS 104 + 8 [a0.io]
      8,16   3      278   471.610229100  1474  I  WS 8 + 8 [a0.io]
      8,16   3      279   471.615291420  1474  I  WS 32 + 8 [a0.io]
      8,16   3      280   471.620311200  1474  I  WS 56 + 8 [a0.io]
      8,16   3      281   471.625327620  1474  I  WS 80 + 8 [a0.io]
      8,16   3      282   471.630343080  1474  I  WS 104 + 8 [a0.io]
      8,16   3      284   471.637881600  1474  D  WS 104 + 16 [a0.io]
      8,16   3      285   471.640429120  1474  D  WS 80 + 16 [a0.io]
      8,16   0      397   471.644573100     3  C  WS 104 + 16 [0]
      8,16   0      398   471.647159640     3  D  WS 56 + 16 [ksoftirqd/0]
      8,16   1       49   471.649825940    13  C  WS 80 + 16 [0]
      8,16   1       50   471.652391540    13  D  WS 32 + 16 [ksoftirqd/1]
      8,16   0      399   471.654446580     3  C  WS 56 + 16 [0]
      8,16   0      400   471.657000820     3  D  WS 8 + 16 [ksoftirqd/0]
      8,16   0      401   471.659445000     3  C  WS 32 + 16 [0]
      8,16   0      402   471.663946360     3  C  WS 8 + 16 [0]

Now those adjacent requests get merged.

------------------ Original ------------------
From:  "Zhengyuan Liu"<liuzhengyuan@kylinos.cn>;
Date:  Fri, Apr 13, 2018 04:16 PM
To:  "shli"<shli@fb.com>;
Subject:  question about request merge
 
Hi, Shaohua

A question came up while I was reading the block layer code, so I'm writing to you for help.

When a request from the plug list gets flushed to the queue, it tries to merge into an existing request by
calling the function elv_attempt_insert_merge before being added directly to the queue, as shown below:

          /*
         * Attempt to do an insertion back merge. Only check for the case where
         * we can append 'rq' to an existing request, so we can throw 'rq' away
         * afterwards.
         *
         * Returns true if we merged, false otherwise
         */
        bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
        {
                struct request *__rq;
                bool ret;

                if (blk_queue_nomerges(q))
                        return false;

                /*
                 * First try one-hit cache.
                 */
                if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
                        return true;

                if (blk_queue_noxmerges(q))
                        return false;

                ret = false;
                /*
                 * See if our hash lookup can find a potential backmerge.
                 */
                while (1) {
                        __rq = elv_rqhash_find(q, blk_rq_pos(rq));
                        if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
                                break;

                        /* The merged request could be merged with others, try again */
                        ret = true;
                        rq = __rq;
                }

                return ret;
        }

From the comment and code we can see it only does a backmerge. I think that's enough when there is only
one thread doing IO operations, since we have sorted the requests in the plug list before unplugging — see the
patches from Jianpeng Ma: 422765c("block: Remove should_sort judgement when flush blk_plug") and
975927b("block: Add blk_rq_pos(rq) to sort rq when plushing"). However, when it comes to multiple
threads, let's imagine the IO scenario below, where threads A and B each hold three requests in their plug lists.

    threadA: a+1, a+4, a+7
    threadB: a+2, a+5, a+8

If threadA has flushed all its requests to the queue before threadB does, then requests a+1 and a+2 and
other adjacent pairs have the chance to get merged through a backmerge; but if threadB flushes all its
requests to the queue before threadA, how could those adjacent requests get merged? Or does it not matter
at all? I know your patch bee0393("block: recursive merge requests") solved a multi-thread merge
problem, but not mine.
It's hard to simulate such an IO scenario from userspace, and I can't figure out other useful methods
to verify whether the requests in such an IO scenario really can't get merged. All my thoughts are
speculation, so I hope you can give me some answers. Any reply would be appreciated.

Zhengyuan

Comments

Jens Axboe April 20, 2018, 2:34 p.m. UTC | #1
On 4/19/18 9:51 PM, Zhengyuan Liu wrote:
> Hi, Shaohua
> 
> I found it indeed doesn't do front merge when two threads flush plug list  concurrently.   To 
> reappear , I prepared two IO threads , named a0.io and a1.io .
> Thread a1.io  uses libaio to write 5 requests : 
>     sectors: 16 + 8, 40 + 8, 64 + 8, 88 + 8, 112 + 8
> Thread a0.io  uses libaio to write other 5 requests : 
>     sectors: 8+ 8, 32 + 8, 56 + 8, 80 + 8, 104 + 8

I'm cutting some of the below.

Thanks for the detailed email. It's mostly on purpose that we don't
spend cycles and memory on maintaining a separate front merge hash,
since it's generally not something that happens very often. If you have
a thread pool doing IO and split sequential IO such that you would
benefit a lot from front merging, then I would generally claim that
you're not writing your app in the most optimal manner.

So I'm curious, what's the big interest in front merging?
diff mbox

Patch

--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3324,6 +3324,21 @@  void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
        /*
         * This drops the queue lock
         */
+
+           if (strcmp(current->comm, "a1.io") == 0)
+           {
+                   spin_unlock_irq(q->queue_lock); 
+                   mdelay(2000);
+                   spin_lock_irq(q->queue_lock);   
+           }
 
Then I used "./a1.io & sleep 1 && ./a0.io" to dispatch the IO operations and used blktrace to
trace the IO processing; the trace result is shown below:

      8,16   1      174   176.733880160  1217  Q  WS 16 + 8 [a1.io]
      8,16   1      175   176.733885300  1217  G  WS 16 + 8 [a1.io]
      8,16   1      177   176.733890820  1217  Q  WS 40 + 8 [a1.io]
      8,16   1      178   176.733892460  1217  G  WS 40 + 8 [a1.io]
      8,16   1      179   176.733896380  1217  Q  WS 64 + 8 [a1.io]
      8,16   1      180   176.733897900  1217  G  WS 64 + 8 [a1.io]
      8,16   1      181   176.733901260  1217  Q  WS 88 + 8 [a1.io]
      8,16   1      182   176.733902760  1217  G  WS 88 + 8 [a1.io]
      8,16   1      183   176.733906200  1217  Q  WS 112 + 8 [a1.io]
      8,16   1      184   176.733907640  1217  G  WS 112 + 8 [a1.io]
      8,16   1      185   176.733909560  1217  I  WS 16 + 8 [a1.io]
      8,16   1      186   176.733925480  1217  I  WS 40 + 8 [a1.io]
      8,16   1      187   176.733936560  1217  I  WS 64 + 8 [a1.io]
      8,16   1      188   176.733945920  1217  I  WS 88 + 8 [a1.io]
      8,16   1      189   176.733953880  1217  I  WS 112 + 8 [a1.io]
      8,16   2      103   176.734317080  1218  Q  WS 8 + 8 [a0.io]
      8,16   2      104   176.734320600  1218  G  WS 8 + 8 [a0.io]
      8,16   2      106   176.734325520  1218  Q  WS 32 + 8 [a0.io]
      8,16   2      107   176.734327140  1218  G  WS 32 + 8 [a0.io]
      8,16   2      108   176.734330620  1218  Q  WS 56 + 8 [a0.io]
      8,16   2      109   176.734332140  1218  G  WS 56 + 8 [a0.io]
      8,16   2      110   176.734335380  1218  Q  WS 80 + 8 [a0.io]
      8,16   2      111   176.734336760  1218  G  WS 80 + 8 [a0.io]
      8,16   2      112   176.734340300  1218  Q  WS 104 + 8 [a0.io]
      8,16   2      113   176.734341900  1218  G  WS 104 + 8 [a0.io]
      8,16   2      114   176.734343520  1218  I  WS 8 + 8 [a0.io]
      8,16   2      115   176.734357280  1218  I  WS 32 + 8 [a0.io]
      8,16   2      116   176.734366560  1218  I  WS 56 + 8 [a0.io]
      8,16   2      117   176.734374800  1218  I  WS 80 + 8 [a0.io]
      8,16   2      118   176.734382460  1218  I  WS 104 + 8 [a0.io]
      8,16   2      120   176.738012960  1218  D  WS 112 + 8 [a0.io]
      8,16   0      151   176.740923200     3  C  WS 112 + 8 [0]
      8,16   2      121   176.741804960  1218  D  WS 88 + 8 [a0.io]
      8,16   2      122   176.741826060  1218  R  WS 88 + 8 [0]
      8,16   2      123   176.741827740  1218  I  WS 88 + 8 [a0.io]
      8,16   0      152   176.745259080     3  D  WS 88 + 8 [ksoftirqd/0]
      8,16   0      153   176.747824380     3  D  WS 64 + 8 [ksoftirqd/0]
      8,16   0      154   176.748921220     3  C  WS 88 + 8 [0]
      8,16   0      155   176.751472640     3  D  WS 40 + 8 [ksoftirqd/0]
      8,16   0      156   176.753172500     3  C  WS 64 + 8 [0]
      8,16   0      157   176.755720740     3  D  WS 16 + 8 [ksoftirqd/0]
      8,16   0      158   176.757920780     3  C  WS 40 + 8 [0]
      8,16   0      159   176.760471320     3  D  WS 104 + 8 [ksoftirqd/0]
      8,16   0      160   176.763172180     3  C  WS 16 + 8 [0]
      8,16   0      161   176.765717560     3  D  WS 80 + 8 [ksoftirqd/0]
      8,16   0      162   176.766795400     3  C  WS 104 + 8 [0]
      8,16   0      163   176.769341020     3  D  WS 56 + 8 [ksoftirqd/0]
      8,16   2      124   176.770926380    18  C  WS 80 + 8 [0]
      8,16   0      164   176.774335640   924  C  WS 56 + 8 [0]
      8,16   2      125   176.774642140    18  D  WS 32 + 8 [ksoftirqd/2]
      8,16   2      126   176.774651300    18  R  WS 32 + 8 [0]
      8,16   2      127   176.774652900    18  I  WS 32 + 8 [ksoftirqd/2]
      8,16   0      165   176.778546360   924  D  WS 32 + 8 [sshd]
      8,16   0      166   176.782498120   924  D  WS 8 + 8 [sshd]
      8,16   0      167   176.785298980     3  C  WS 32 + 8 [0]
      8,16   0      168   176.788671240     3  C  WS 8 + 8 [0]

We can see from the result that the adjacent requests didn't get merged.
To make those adjacent requests get merged, I tried writing a simple patch:

diff --git a/block/elevator.c b/block/elevator.c
index c3555c9c6..40acd0e84 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -245,6 +245,7 @@  EXPORT_SYMBOL(elevator_exit);
 static inline void __elv_rqhash_del(struct request *rq)
 {
        hash_del(&rq->hash);
+       hash_del(&rq->front_hash);
        rq->cmd_flags &= ~REQ_HASHED;
 }
 
@@ -252,6 +253,7 @@  static void elv_rqhash_del(struct request_queue *q, struct request *rq)
 {
        if (ELV_ON_HASH(rq))
                __elv_rqhash_del(rq);
+
 }
 
 static void elv_rqhash_add(struct request_queue *q, struct request *rq)
@@ -260,6 +262,7 @@  static void elv_rqhash_add(struct request_queue *q, struct request *rq)
 
        BUG_ON(ELV_ON_HASH(rq));
        hash_add(e->hash, &rq->hash, rq_hash_key(rq));
+       hash_add(e->front_hash, &rq->front_hash, blk_rq_pos(rq));
        rq->cmd_flags |= REQ_HASHED;
 }
 
@@ -290,6 +293,28 @@  static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
        return NULL;
 }
 
+static struct request *elv_fronthash_find(struct request_queue *q, sector_t offset)
+{
+        struct elevator_queue *e = q->elevator;
+        struct hlist_node *next;
+        struct request *rq;
+
+        hash_for_each_possible_safe(e->front_hash, rq, next, front_hash, offset) {
+                BUG_ON(!ELV_ON_HASH(rq));
+
+                if (unlikely(!rq_mergeable(rq))) {
+                        __elv_rqhash_del(rq);
+                        continue;
+                }
+
+                if (blk_rq_pos(rq) == offset)
+                        return rq;
+        }
+
+        return NULL;
+}
+
+
 /*
  * RB-tree support functions for inserting/lookup/removal of requests
  * in a sorted RB tree.
@@ -493,6 +518,39 @@  static bool elv_attempt_insert_merge(struct request_queue *q,
 
        return ret;
 }
+/*
+ * Attempt to do an insertion front merge.
+ * Returns true if we merged, false otherwise
+ */
+static bool elv_attempt_front_merge(struct request_queue *q,
+                                    struct request *rq)
+{
+       struct request *__rq;
+       bool ret;
+
+       if (blk_queue_nomerges(q))
+               return false;
+
+       /*
+        * First try one-hit cache.
+        */
+       if (q->last_merge && blk_attempt_req_merge(q, rq, q->last_merge))
+               return true;
+
+       if (blk_queue_noxmerges(q))
+               return false;
+
+       ret = false;
+       /*
+        * See if our hash lookup can find a potential frontmerge.
+        */
+
+       __rq = elv_fronthash_find(q, rq_hash_key(rq));
+       if (__rq && blk_attempt_req_merge(q, rq, __rq)) {
+               ret = true;
+       }
+       return ret;
+}
 
 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
 {
@@ -657,6 +715,8 @@  void __elv_add_request(struct request_queue *q, struct request *rq, int where)
                 * elevator_add_req_fn.
                 */
                q->elevator->type->ops.elevator_add_req_fn(q, rq);
+               /* do a front merge after we added it to elevator */
+               elv_attempt_front_merge(q, rq);
                break;
 
        case ELEVATOR_INSERT_FLUSH:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 085a03f67..9fe25be1b 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -120,6 +120,8 @@  struct request {
                struct list_head ipi_list;
        };
 
+       struct hlist_node front_hash; /* front merge hash */
+
        /*
         * The rb_node is only used inside the io scheduler, requests
         * are pruned when moved to the dispatch queue. So let the
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 638b324f0..1567e5412 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -114,6 +114,7 @@  struct elevator_queue
        struct mutex sysfs_lock;
        unsigned int registered:1;
        DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
+       DECLARE_HASHTABLE(front_hash, ELV_HASH_BITS);
 };
 
 /*