diff mbox

multipath-tools:Prioritizer based on a time-delay algorithm

Message ID 1494215904-7564-1-git-send-email-philip.yang@huawei.com (mailing list archive)
State Not Applicable, archived
Delegated to: christophe varoqui
Headers show

Commit Message

Yang Feng May 8, 2017, 3:58 a.m. UTC
Prioritizer for device mapper multipath, where the corresponding priority
values of specific paths are provided by a time-delay algorithm. And the
time-delay algorithm is dependent on the following arguments(delay_interval,
cons_num).
The principle of the algorithm is illustrated as follows:
1. By sending a certain number "cons_num" of read IOs to the current path
   continuously, the IOs' average delay can be calculated.
2. According to the average delay of each path and the weight value
   "delay_interval", the priority "rc" of each path can be provided.

     delay_interval  delay_interval  delay_interval       delay_interval
    |---------------|---------------|---------------|	 |---------------|
    |priority rank1 |priority rank2 |priority rank3 |... |priority rank4 |
    |---------------|---------------|---------------|    |---------------|
                       Priority Rank Partitioning
---
 libmultipath/Makefile                   |   2 +-
 libmultipath/checkers/Makefile          |   7 +-
 libmultipath/checkers/emc_clariion.c    |   2 +-
 libmultipath/checkers/libsg.c           |  94 ------------
 libmultipath/checkers/libsg.h           |   9 --
 libmultipath/checkers/readsector0.c     |   2 +-
 libmultipath/libsg.c                    |  94 ++++++++++++
 libmultipath/libsg.h                    |   9 ++
 libmultipath/prioritizers/Makefile      |   6 +-
 libmultipath/prioritizers/delayedpath.c | 246 ++++++++++++++++++++++++++++++++
 libmultipath/prioritizers/delayedpath.h |  14 ++
 11 files changed, 373 insertions(+), 112 deletions(-)
 delete mode 100644 libmultipath/checkers/libsg.c
 delete mode 100644 libmultipath/checkers/libsg.h
 create mode 100644 libmultipath/libsg.c
 create mode 100644 libmultipath/libsg.h
 create mode 100644 libmultipath/prioritizers/delayedpath.c
 create mode 100644 libmultipath/prioritizers/delayedpath.h

Comments

Xose Vazquez Perez May 10, 2017, 10:36 p.m. UTC | #1
On 05/08/2017 05:58 AM, Yang Feng wrote:

> Prioritizer for device mapper multipath, where the corresponding priority
> values of specific paths are provided by a time-delay algorithm. And the
> time-delay algorithm is dependent on the following arguments(delay_interval,
> cons_num).
This new feature should be documented in multipath/multipath.conf.5

> diff --git a/libmultipath/checkers/Makefile b/libmultipath/checkers/Makefile
> index 4970fc0..7e433ca 100644
> --- a/libmultipath/checkers/Makefile
> +++ b/libmultipath/checkers/Makefile
> @@ -14,19 +14,16 @@ LIBS= \
>  	libcheckemc_clariion.so \
>  	libcheckhp_sw.so \
>  	libcheckrdac.so
> -ifneq ($(ENABLE_RADOS),0)
> -LIBS += libcheckrbd.so
> -endif

Is this removal of the libcheckrbd.so (ENABLE_RADOS) block intentional?

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
Martin Wilck May 11, 2017, 11:14 a.m. UTC | #2
Hello Yang,

thank you for your work. Please find my remarks below.

On Mon, 2017-05-08 at 11:58 +0800, Yang Feng wrote:
> Prioritizer for device mapper multipath, where the corresponding
> priority
> values of specific paths are provided by a time-delay algorithm. And
> the
> time-delay algorithm is dependent on the following
> arguments(delay_interval,
> cons_num).
> The principle of the algorithm is illustrated as follows:
> 1. By sending a certain number "cons_num" of read IOs to the current
> path
>    continuously, the IOs' average delay can be calculated.
> 2. According to the average delay of each path and the weight value
>    "delay_interval", the priority "rc" of each path can be provided.
> 
>      delay_interval  delay_interval  delay_interval       delay_inter

How does this algorithm behave under load? Can we be sure that
priorities don't start to fluctuate wildly because busy paths will
usually have longer latencies than idle ones?


> val
>     |---------------|---------------|---------------|	 |----
> -----------|
>     |priority rank1 |priority rank2 |priority rank3 |... |priority
> rank4 |
>     |---------------|---------------|---------------|    |-----------
> ----|
>                        Priority Rank Partitioning
> ---
>  libmultipath/Makefile                   |   2 +-
>  libmultipath/checkers/Makefile          |   7 +-
>  libmultipath/checkers/emc_clariion.c    |   2 +-
>  libmultipath/checkers/libsg.c           |  94 ------------
>  libmultipath/checkers/libsg.h           |   9 --
>  libmultipath/checkers/readsector0.c     |   2 +-
>  libmultipath/libsg.c                    |  94 ++++++++++++
>  libmultipath/libsg.h                    |   9 ++
>  libmultipath/prioritizers/Makefile      |   6 +-
>  libmultipath/prioritizers/delayedpath.c | 246 

Why do you have to move libsg for this? It's already used by various
checkers, why can't your prioritizer do the same? If you really need to
do it, you should at least separate that part of the patch from the
added code.

> diff --git a/libmultipath/prioritizers/delayedpath.c
> b/libmultipath/prioritizers/delayedpath.c
> new file mode 100644
> index 0000000..4c1cfea
> --- /dev/null
> +++ b/libmultipath/prioritizers/delayedpath.c
> @@ -0,0 +1,246 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights
> Reserved.
> + *
> + * main.c
> + *
> + * Prioritizer for device mapper multipath, where the corresponding
> priority 
> + * values of specific paths are provided by a time-delay algorithm.
> And the
> + * time-delay algorithm is dependent on arguments.
> + * 
> + * The principle of the algorithm as follows: 
> + * 1. By sending a certain number "cons_num" of read IOs to the
> current path 
> + *    continuously, the IOs' average delay can be calculated. 
> + * 2. According to the average delay of each path and the weight
> value 
> + *    "delay_interval", the priority "rc" of each path can be
> provided. 
> + *
> + * Author(s): Yang Feng <philip.yang@huawei.com>
> + *            Zou Ming <zouming.zouming@huawei.com>
> + *
> + * This file is released under the GPL.
> + */
> +#include <stdio.h>
> +#include <ctype.h>
> +#include <sys/time.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "../libmultipath/libsg.h"
> +
> +#include "delayedpath.h"
> +
> +#define THRES_USEC_VALUE        300000000LL    /*USEC, 300SEC*/
> +#define DEFAULT_DELAY_INTERVAL  10             /*MSEC*/
> +#define DEFAULT_CONS_NUM        20    
> +
> +#define MAX_CHAR_SIZE           30
> +
> +#define CHAR_SEC                "SEC"
> +#define CHAR_MSEC               "MSEC"
> +#define CHAR_USEC               "USEC"

I suggest using "s", "ms", and "us" here instead.

If you instead create an array of "const char *", as you did for
conversion_ratio below, you could implement get_interval_type() more
elegantly using a loop over that array.

> +
> +enum interval_type {
> +    INTERVAL_SEC,
> +    INTERVAL_MSEC,
> +    INTERVAL_USEC,
> +    INTERVAL_INVALID
> +};
> +
> +static int conversion_ratio[] = {
> +	[INTERVAL_SEC]		= USEC_PER_SEC,
> +	[INTERVAL_MSEC]	    = USEC_PER_MSEC,
> +	[INTERVAL_USEC]		= USEC_PER_USEC,
> +	[INTERVAL_INVALID]	= 0,
> +};
> +
> +
> +static int do_readsector0(int fd, unsigned int timeout)
> +{
> +	unsigned char buf[4096];
> +	unsigned char sbuf[SENSE_BUFF_LEN];
> +	int ret;
> +
> +	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
> +		      SENSE_BUFF_LEN, timeout);
> +    
> +	return ret;
> +}
> +
> +static int get_interval_type(char *source, char *type)
> +{  
> +    /*is USEC*/
> +    if ((strstr(source, CHAR_USEC) != NULL)
> +        && (strstr(source, CHAR_USEC)[4] == '_'))

Please avoid these double strstr() invocations. The compiler may
optimize them away, but it just looks strange. The following would
look better to me, and I find it actually more readable:

        if (((p = strstr(source, CHAR_USEC)) != NULL) && p[4] == '_')

> +    {
> +        memcpy(type, CHAR_USEC, strlen(CHAR_USEC)+1);        
> +        return INTERVAL_USEC;
> +    }
> +
> +    /*is MSEC*/
> +    if ((strstr(source, CHAR_MSEC) != NULL) 
> +        && (strstr(source, CHAR_MSEC)[4] == '_'))
> +    {
> +        memcpy(type, CHAR_MSEC, strlen(CHAR_MSEC)+1);
> +        return INTERVAL_MSEC;
> +    }
> +
> +    /*is SEC*/
> +	if ((strstr(source, CHAR_SEC) != NULL)
> +        && (strstr(source, CHAR_SEC)[4] == '_'))
> +    {
> +        memcpy(type, CHAR_SEC, strlen(CHAR_SEC)+1);
> +        return INTERVAL_SEC;
> +    }
> +
> +    return INTERVAL_INVALID;
> +}
> +
> +static int get_string_from_under(char *args,
> +                                        char *beforestring,
> +                                        char *afterstring,
> +                                        int *type)

Maybe you could figure out a more descriptive name for this function?

A comment in the code showing what the string to be parsed typically
looks like would be helpful for the reader.

> +{
> +    char source[MAX_CHAR_SIZE];
> +    char char_type[MAX_CHAR_SIZE];
> +    char under[] = "_";
> +    char *token  = NULL;
> +    char *tmp = NULL;
> +    char *saveptr = NULL;
> +    unsigned int size = strlen(args);
> +
> +    if ((args == NULL) || (beforestring == NULL) 
> +        || (afterstring == NULL) || (type == NULL))
> +        return 0;
> +
> +    /* int type */
> +    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
> +        return 0;
> +    
> +    memcpy(source, args, size+1);
> +    if (strstr(source, under) == NULL)
> +        return 0;
> +
> +    *type = get_interval_type(source, char_type);
> +    if (*type == INTERVAL_INVALID)
> +        return 0;
> +
> +    token = strtok_r(source, under, &saveptr);
> +    token = strtok(token, char_type);

I'm pretty sure this is not what you intended to write. If char_type
is "USEC", this would split the string at the possible delimiters 'U',
'S', 'E', and 'C' (the 2nd argument of strtok(3) is not a sequence, but
a 'set' of bytes). It might accidentally work with the input strings
you are using (in particular because you only look at the first token),
but nevertheless it's wrong.

> +    if ((token == NULL) || (saveptr == NULL))
> +        return 0;
> +
> +    tmp = token;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +            return 0;
> +
> +    tmp = saveptr;
> +    while (*tmp != '\0')
> +        if (!isdigit(*tmp++))
> +            return 0;
> +
> +    strncpy(beforestring, token, strlen(token) + 1);
> +    strncpy(afterstring, saveptr, strlen(saveptr) + 1);
> +    return 1;
> +}

I don't think it's safe to use saveptr the way you do it. The strtok_r
man page says this parameter is for "internal use". While it makes
sense to assume that it points to the next token, I'm not sure if
that's guaranteed. You would be safe by calling 

    somevar = strtok_r(NULL, under, &saveptr)

and use "somevar".

In general, this whole parsing code is odd. IIUC this parses input
looking like ([0-9]+)(SEC|MSEC|USEC)_([0-9]+) and sets beforestring,
type, and afterstring to the regex matches \1, \2, and \3,
respectively.

Why don't you start parsing from the beginning of the input, e.g. with
strtoul(), and look at the rest later?

> +
> +int checkargvalid(int delay_interval, int cons_num, int type)
> +{
> +    if (type == INTERVAL_SEC)
> +    {
> +        if ((delay_interval < 1) || (delay_interval > 60))
> +            return 0;
> +    }
> +    else if (type != INTERVAL_INVALID)
> +    {
> +        if ((delay_interval < 1) || (delay_interval >= 1000))
> +            return 0;
> +    }

You could be more forgiving here. 15000MSEC could be a legal value.

> +    
> +    if ((cons_num < 3) || (cons_num > 1000))
> +        return 0;
> +
> +    return 1;
> +}
> +
> +int get_delay_pref_arg(char *args, int *delay_interval, int
> *cons_num, int *type)
> +{
> +    char delayintervalstr[MAX_CHAR_SIZE];
> +    char consnumstr[MAX_CHAR_SIZE];
> +
> +    if (get_string_from_under(args, delayintervalstr, consnumstr,
> type) == 0)
> +        return 0;

It might be good to write the parser so that the consnum part can be
left out by the user, and assume a reasonable default in that case.

> +
> +    *delay_interval = atoi(delayintervalstr);
> +    *cons_num = atoi(consnumstr);
> +
> +    if (checkargvalid(*delay_interval, *cons_num, *type) == 0)
> +        return 0;
> +    
> +    return 1;
> +}
> +
> +long long get_conversion_ratio(int type)
> +{
> +    return conversion_ratio[type];
> +}
> +    
> +int getprio (struct path *pp, char *args, unsigned int timeout)
> +{
> +    int rc, delay_interval, cons_num, type, temp;
> +    long long delay, avgdelay, ratio;
> +    long long min = THRES_USEC_VALUE;
> +    long long max = 0;
> +    long long toldelay = 0;
> +    long long before, after;
> +    struct timeval tv;
> +
> +    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type)
> == 0)
> +    {
> +        condlog(3, "%s: get delay arg fail", pp->dev);
> +        delay_interval = DEFAULT_DELAY_INTERVAL;
> +        cons_num = DEFAULT_CONS_NUM;
> +        type = INTERVAL_MSEC;
> +    }
> +
> +    temp = cons_num;
> +    while (temp-- > 0)
> +    {
> +        (void)gettimeofday(&tv, NULL);
> +        before = timeval_to_us(&tv);		
> +
> +        if (do_readsector0(pp->fd, timeout) == 2)
> +        {
> +            condlog(0, "%s: path down", pp->dev);
> +            return 1;
> +        }
> +        
> +        (void)gettimeofday(&tv, NULL);

It's better to use clock_gettime(CLOCK_MONOTONIC, ...) here. Then you
can throw away the delay < 0 check below.

> +        after = timeval_to_us(&tv);
> +
> +        delay = after - before;
> +        if (delay < 0)
> +        {
> +            condlog(0, "%s: delay calc error", pp->dev);
> +            return 1;
> +        }
> +    	
> +        min = (min <= delay) ? min : delay;
> +        max = (max >= delay) ? max : delay;
> +                
> +        toldelay += delay;
> +    }
> +
> +    toldelay -= min + max;
> +    avgdelay = toldelay/(long long)(cons_num - 2);
> +    if (avgdelay > THRES_USEC_VALUE) 
> +    {           
> +        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
> +        return 1;
> +    }
> +    
> +	ratio = get_conversion_ratio(type);
> +	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long
> long)delay_interval) * ratio)));
> +
> +    return rc;
> +}

Is it reasonable to do these interval calculations synchronously in
getprio()? cons_num is limited to 1000, so this routine could issue
1000 reads on the device before returning. In particular if the device
is under IO load and the delay is high, execution of this routine could
be really slow.

It would make more sense to me to have a separate thread that
calculates some sort of "running average" for the delay of the
different paths, and have getprio() just fetch the current value of
that variable.

Regards
Martin
diff mbox

Patch

diff --git a/libmultipath/Makefile b/libmultipath/Makefile
index 1f5ec25..a4d725a 100644
--- a/libmultipath/Makefile
+++ b/libmultipath/Makefile
@@ -41,7 +41,7 @@  OBJS = memory.o parser.o vector.o devmapper.o callout.o \
 	structs.o discovery.o propsel.o dict.o \
 	pgpolicies.o debug.o defaults.o uevent.o time-util.o \
 	switchgroup.o uxsock.o print.o alias.o log_pthread.o \
-	log.o configure.o structs_vec.o sysfs.o prio.o checkers.o \
+	log.o configure.o structs_vec.o sysfs.o libsg.o prio.o checkers.o \
 	lock.o waiter.o file.o wwids.o prioritizers/alua_rtpg.o
 
 all: $(LIBS)
diff --git a/libmultipath/checkers/Makefile b/libmultipath/checkers/Makefile
index 4970fc0..7e433ca 100644
--- a/libmultipath/checkers/Makefile
+++ b/libmultipath/checkers/Makefile
@@ -14,19 +14,16 @@  LIBS= \
 	libcheckemc_clariion.so \
 	libcheckhp_sw.so \
 	libcheckrdac.so
-ifneq ($(ENABLE_RADOS),0)
-LIBS += libcheckrbd.so
-endif
 
 all: $(LIBS)
 
 libcheckrbd.so: rbd.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -lrados -ludev
 
-libcheckdirectio.so: libsg.o directio.o
+libcheckdirectio.so: ../libsg.o directio.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^ -laio
 
-libcheck%.so: libsg.o %.o
+libcheck%.so: ../libsg.o %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
 
 install:
diff --git a/libmultipath/checkers/emc_clariion.c b/libmultipath/checkers/emc_clariion.c
index 9c1ffed..e4ba757 100644
--- a/libmultipath/checkers/emc_clariion.c
+++ b/libmultipath/checkers/emc_clariion.c
@@ -12,7 +12,7 @@ 
 #include <errno.h>
 
 #include "../libmultipath/sg_include.h"
-#include "libsg.h"
+#include "../libmultipath/libsg.h"
 #include "checkers.h"
 #include "debug.h"
 #include "memory.h"
diff --git a/libmultipath/checkers/libsg.c b/libmultipath/checkers/libsg.c
deleted file mode 100644
index 958ea92..0000000
--- a/libmultipath/checkers/libsg.c
+++ /dev/null
@@ -1,94 +0,0 @@ 
-/*
- * Copyright (c) 2004, 2005 Christophe Varoqui
- */
-#include <string.h>
-#include <sys/ioctl.h>
-#include <errno.h>
-#include <sys/stat.h>
-
-#include "checkers.h"
-#include "libsg.h"
-#include "../libmultipath/sg_include.h"
-
-int
-sg_read (int sg_fd, unsigned char * buff, int buff_len,
-	 unsigned char * sense, int sense_len, unsigned int timeout)
-{
-	/* defaults */
-	int blocks;
-	long long start_block = 0;
-	int bs = 512;
-	int cdbsz = 10;
-
-	unsigned char rdCmd[cdbsz];
-	unsigned char *sbb = sense;
-	struct sg_io_hdr io_hdr;
-	int res;
-	int rd_opcode[] = {0x8, 0x28, 0xa8, 0x88};
-	int sz_ind;
-	struct stat filestatus;
-	int retry_count = 3;
-
-	if (fstat(sg_fd, &filestatus) != 0)
-		return PATH_DOWN;
-	bs = (filestatus.st_blksize > 4096)? 4096: filestatus.st_blksize;
-	blocks = buff_len / bs;
-	memset(rdCmd, 0, cdbsz);
-	sz_ind = 1;
-	rdCmd[0] = rd_opcode[sz_ind];
-	rdCmd[2] = (unsigned char)((start_block >> 24) & 0xff);
-	rdCmd[3] = (unsigned char)((start_block >> 16) & 0xff);
-	rdCmd[4] = (unsigned char)((start_block >> 8) & 0xff);
-	rdCmd[5] = (unsigned char)(start_block & 0xff);
-	rdCmd[7] = (unsigned char)((blocks >> 8) & 0xff);
-	rdCmd[8] = (unsigned char)(blocks & 0xff);
-
-	memset(&io_hdr, 0, sizeof(struct sg_io_hdr));
-	io_hdr.interface_id = 'S';
-	io_hdr.cmd_len = cdbsz;
-	io_hdr.cmdp = rdCmd;
-	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
-	io_hdr.dxfer_len = bs * blocks;
-	io_hdr.dxferp = buff;
-	io_hdr.mx_sb_len = sense_len;
-	io_hdr.sbp = sense;
-	io_hdr.timeout = timeout * 1000;
-	io_hdr.pack_id = (int)start_block;
-
-retry:
-	memset(sense, 0, sense_len);
-	while (((res = ioctl(sg_fd, SG_IO, &io_hdr)) < 0) && (EINTR == errno));
-
-	if (res < 0) {
-		if (ENOMEM == errno) {
-			return PATH_UP;
-		}
-		return PATH_DOWN;
-	}
-
-	if ((0 == io_hdr.status) &&
-	    (0 == io_hdr.host_status) &&
-	    (0 == io_hdr.driver_status)) {
-		return PATH_UP;
-	} else {
-		int key = 0;
-
-		if (io_hdr.sb_len_wr > 3) {
-			if (sbb[0] == 0x72 || sbb[0] == 0x73)
-				key = sbb[1] & 0x0f;
-			else if (io_hdr.sb_len_wr > 13 &&
-				 ((sbb[0] & 0x7f) == 0x70 ||
-				  (sbb[0] & 0x7f) == 0x71))
-				key = sbb[2] & 0x0f;
-		}
-
-		/*
-		 * Retry if UNIT_ATTENTION check condition.
-		 */
-		if (key == 0x6) {
-			if (--retry_count)
-				goto retry;
-		}
-		return PATH_DOWN;
-	}
-}
diff --git a/libmultipath/checkers/libsg.h b/libmultipath/checkers/libsg.h
deleted file mode 100644
index 3994f45..0000000
--- a/libmultipath/checkers/libsg.h
+++ /dev/null
@@ -1,9 +0,0 @@ 
-#ifndef _LIBSG_H
-#define _LIBSG_H
-
-#define SENSE_BUFF_LEN 32
-
-int sg_read (int sg_fd, unsigned char * buff, int buff_len,
-	     unsigned char * sense, int sense_len, unsigned int timeout);
-
-#endif /* _LIBSG_H */
diff --git a/libmultipath/checkers/readsector0.c b/libmultipath/checkers/readsector0.c
index 8fccb46..d70c5c5 100644
--- a/libmultipath/checkers/readsector0.c
+++ b/libmultipath/checkers/readsector0.c
@@ -4,7 +4,7 @@ 
 #include <stdio.h>
 
 #include "checkers.h"
-#include "libsg.h"
+#include "../libmultipath/libsg.h"
 
 #define MSG_READSECTOR0_UP	"readsector0 checker reports path is up"
 #define MSG_READSECTOR0_DOWN	"readsector0 checker reports path is down"
diff --git a/libmultipath/libsg.c b/libmultipath/libsg.c
new file mode 100644
index 0000000..99c91a4
--- /dev/null
+++ b/libmultipath/libsg.c
@@ -0,0 +1,94 @@ 
+/*
+ * Copyright (c) 2004, 2005 Christophe Varoqui
+ */
+#include <string.h>
+#include <sys/ioctl.h>
+#include <errno.h>
+#include <sys/stat.h>
+
+#include "checkers.h"
+#include "libsg.h"
+#include "sg_include.h"
+
+int
+sg_read (int sg_fd, unsigned char * buff, int buff_len,
+	 unsigned char * sense, int sense_len, unsigned int timeout)
+{
+	/* defaults */
+	int blocks;
+	long long start_block = 0;
+	int bs = 512;
+	int cdbsz = 10;
+
+	unsigned char rdCmd[cdbsz];
+	unsigned char *sbb = sense;
+	struct sg_io_hdr io_hdr;
+	int res;
+	int rd_opcode[] = {0x8, 0x28, 0xa8, 0x88};
+	int sz_ind;
+	struct stat filestatus;
+	int retry_count = 3;
+
+	if (fstat(sg_fd, &filestatus) != 0)
+		return PATH_DOWN;
+	bs = (filestatus.st_blksize > 4096)? 4096: filestatus.st_blksize;
+	blocks = buff_len / bs;
+	memset(rdCmd, 0, cdbsz);
+	sz_ind = 1;
+	rdCmd[0] = rd_opcode[sz_ind];
+	rdCmd[2] = (unsigned char)((start_block >> 24) & 0xff);
+	rdCmd[3] = (unsigned char)((start_block >> 16) & 0xff);
+	rdCmd[4] = (unsigned char)((start_block >> 8) & 0xff);
+	rdCmd[5] = (unsigned char)(start_block & 0xff);
+	rdCmd[7] = (unsigned char)((blocks >> 8) & 0xff);
+	rdCmd[8] = (unsigned char)(blocks & 0xff);
+
+	memset(&io_hdr, 0, sizeof(struct sg_io_hdr));
+	io_hdr.interface_id = 'S';
+	io_hdr.cmd_len = cdbsz;
+	io_hdr.cmdp = rdCmd;
+	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	io_hdr.dxfer_len = bs * blocks;
+	io_hdr.dxferp = buff;
+	io_hdr.mx_sb_len = sense_len;
+	io_hdr.sbp = sense;
+	io_hdr.timeout = timeout * 1000;
+	io_hdr.pack_id = (int)start_block;
+
+retry:
+	memset(sense, 0, sense_len);
+	while (((res = ioctl(sg_fd, SG_IO, &io_hdr)) < 0) && (EINTR == errno));
+
+	if (res < 0) {
+		if (ENOMEM == errno) {
+			return PATH_UP;
+		}
+		return PATH_DOWN;
+	}
+
+	if ((0 == io_hdr.status) &&
+	    (0 == io_hdr.host_status) &&
+	    (0 == io_hdr.driver_status)) {
+		return PATH_UP;
+	} else {
+		int key = 0;
+
+		if (io_hdr.sb_len_wr > 3) {
+			if (sbb[0] == 0x72 || sbb[0] == 0x73)
+				key = sbb[1] & 0x0f;
+			else if (io_hdr.sb_len_wr > 13 &&
+				 ((sbb[0] & 0x7f) == 0x70 ||
+				  (sbb[0] & 0x7f) == 0x71))
+				key = sbb[2] & 0x0f;
+		}
+
+		/*
+		 * Retry if UNIT_ATTENTION check condition.
+		 */
+		if (key == 0x6) {
+			if (--retry_count)
+				goto retry;
+		}
+		return PATH_DOWN;
+	}
+}
diff --git a/libmultipath/libsg.h b/libmultipath/libsg.h
new file mode 100644
index 0000000..3994f45
--- /dev/null
+++ b/libmultipath/libsg.h
@@ -0,0 +1,9 @@ 
+#ifndef _LIBSG_H
+#define _LIBSG_H
+
+#define SENSE_BUFF_LEN 32
+
+int sg_read (int sg_fd, unsigned char * buff, int buff_len,
+	     unsigned char * sense, int sense_len, unsigned int timeout);
+
+#endif /* _LIBSG_H */
diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index 36b42e4..7e3da51 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -18,13 +18,17 @@  LIBS = \
 	libpriorandom.so \
 	libpriordac.so \
 	libprioweightedpath.so \
-	libpriosysfs.so
+	libpriodelayedpath.so \
+	libpriosysfs.so 
 
 all: $(LIBS)
 
 libprioalua.so: alua.o alua_rtpg.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
 
+libpriodelayedpath.so: delayedpath.o  ../libsg.o
+	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
+
 libprio%.so: %.o
 	$(CC) $(LDFLAGS) $(SHARED_FLAGS) -o $@ $^
 
diff --git a/libmultipath/prioritizers/delayedpath.c b/libmultipath/prioritizers/delayedpath.c
new file mode 100644
index 0000000..4c1cfea
--- /dev/null
+++ b/libmultipath/prioritizers/delayedpath.c
@@ -0,0 +1,246 @@ 
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017, 2021   All Rights Reserved.
+ *
+ * main.c
+ *
+ * Prioritizer for device mapper multipath, where the corresponding priority 
+ * values of specific paths are provided by a time-delay algorithm. And the
+ * time-delay algorithm is dependent on arguments.
+ * 
+ * The principle of the algorithm as follows: 
+ * 1. By sending a certain number "cons_num" of read IOs to the current path 
+ *    continuously, the IOs' average delay can be calculated. 
+ * 2. According to the average delay of each path and the weight value 
+ *    "delay_interval", the priority "rc" of each path can be provided. 
+ *
+ * Author(s): Yang Feng <philip.yang@huawei.com>
+ *            Zou Ming <zouming.zouming@huawei.com>
+ *
+ * This file is released under the GPL.
+ */
+#include <stdio.h>
+#include <ctype.h>
+#include <sys/time.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "../libmultipath/libsg.h"
+
+#include "delayedpath.h"
+
+#define THRES_USEC_VALUE        300000000LL    /*USEC, 300SEC*/
+#define DEFAULT_DELAY_INTERVAL  10             /*MSEC*/
+#define DEFAULT_CONS_NUM        20    
+
+#define MAX_CHAR_SIZE           30
+
+#define CHAR_SEC                "SEC"
+#define CHAR_MSEC               "MSEC"
+#define CHAR_USEC               "USEC"
+
+enum interval_type {
+    INTERVAL_SEC,
+    INTERVAL_MSEC,
+    INTERVAL_USEC,
+    INTERVAL_INVALID
+};
+
+static int conversion_ratio[] = {
+	[INTERVAL_SEC]		= USEC_PER_SEC,
+	[INTERVAL_MSEC]	    = USEC_PER_MSEC,
+	[INTERVAL_USEC]		= USEC_PER_USEC,
+	[INTERVAL_INVALID]	= 0,
+};
+
+
+static int do_readsector0(int fd, unsigned int timeout)
+{
+	unsigned char buf[4096];
+	unsigned char sbuf[SENSE_BUFF_LEN];
+	int ret;
+
+	ret = sg_read(fd, &buf[0], 4096, &sbuf[0],
+		      SENSE_BUFF_LEN, timeout);
+    
+	return ret;
+}
+
+static int get_interval_type(char *source, char *type)
+{  
+    /*is USEC*/
+    if ((strstr(source, CHAR_USEC) != NULL)
+        && (strstr(source, CHAR_USEC)[4] == '_'))
+    {
+        memcpy(type, CHAR_USEC, strlen(CHAR_USEC)+1);        
+        return INTERVAL_USEC;
+    }
+
+    /*is MSEC*/
+    if ((strstr(source, CHAR_MSEC) != NULL) 
+        && (strstr(source, CHAR_MSEC)[4] == '_'))
+    {
+        memcpy(type, CHAR_MSEC, strlen(CHAR_MSEC)+1);
+        return INTERVAL_MSEC;
+    }
+
+    /*is SEC*/
+	if ((strstr(source, CHAR_SEC) != NULL)
+        && (strstr(source, CHAR_SEC)[4] == '_'))
+    {
+        memcpy(type, CHAR_SEC, strlen(CHAR_SEC)+1);
+        return INTERVAL_SEC;
+    }
+
+    return INTERVAL_INVALID;
+}
+
+static int get_string_from_under(char *args,
+                                        char *beforestring,
+                                        char *afterstring,
+                                        int *type)
+{
+    char source[MAX_CHAR_SIZE];
+    char char_type[MAX_CHAR_SIZE];
+    char under[] = "_";
+    char *token  = NULL;
+    char *tmp = NULL;
+    char *saveptr = NULL;
+    unsigned int size = strlen(args);
+
+    if ((args == NULL) || (beforestring == NULL) 
+        || (afterstring == NULL) || (type == NULL))
+        return 0;
+
+    /* int type */
+    if ((size < 1) || (size > MAX_CHAR_SIZE-1))
+        return 0;
+    
+    memcpy(source, args, size+1);
+    if (strstr(source, under) == NULL)
+        return 0;
+
+    *type = get_interval_type(source, char_type);
+    if (*type == INTERVAL_INVALID)
+        return 0;
+
+    token = strtok_r(source, under, &saveptr);
+    token = strtok(token, char_type);
+    if ((token == NULL) || (saveptr == NULL))
+        return 0;
+
+    tmp = token;
+    while (*tmp != '\0')
+        if (!isdigit(*tmp++))
+            return 0;
+
+    tmp = saveptr;
+    while (*tmp != '\0')
+        if (!isdigit(*tmp++))
+            return 0;
+
+    strncpy(beforestring, token, strlen(token) + 1);
+    strncpy(afterstring, saveptr, strlen(saveptr) + 1);
+    return 1;
+}
+
+int checkargvalid(int delay_interval, int cons_num, int type)
+{
+    if (type == INTERVAL_SEC)
+    {
+        if ((delay_interval < 1) || (delay_interval > 60))
+            return 0;
+    }
+    else if (type != INTERVAL_INVALID)
+    {
+        if ((delay_interval < 1) || (delay_interval >= 1000))
+            return 0;
+    }
+    
+    if ((cons_num < 3) || (cons_num > 1000))
+        return 0;
+
+    return 1;
+}
+
+int get_delay_pref_arg(char *args, int *delay_interval, int *cons_num, int *type)
+{
+    char delayintervalstr[MAX_CHAR_SIZE];
+    char consnumstr[MAX_CHAR_SIZE];
+
+    if (get_string_from_under(args, delayintervalstr, consnumstr, type) == 0)
+        return 0;
+
+    *delay_interval = atoi(delayintervalstr);
+    *cons_num = atoi(consnumstr);
+
+    if (checkargvalid(*delay_interval, *cons_num, *type) == 0)
+        return 0;
+    
+    return 1;
+}
+
+long long get_conversion_ratio(int type)
+{
+    return conversion_ratio[type];
+}
+    
+int getprio (struct path *pp, char *args, unsigned int timeout)
+{
+    int rc, delay_interval, cons_num, type, temp;
+    long long delay, avgdelay, ratio;
+    long long min = THRES_USEC_VALUE;
+    long long max = 0;
+    long long toldelay = 0;
+    long long before, after;
+    struct timeval tv;
+
+    if (get_delay_pref_arg(args, &delay_interval, &cons_num, &type) == 0)
+    {
+        condlog(3, "%s: get delay arg fail", pp->dev);
+        delay_interval = DEFAULT_DELAY_INTERVAL;
+        cons_num = DEFAULT_CONS_NUM;
+        type = INTERVAL_MSEC;
+    }
+
+    temp = cons_num;
+    while (temp-- > 0)
+    {
+        (void)gettimeofday(&tv, NULL);
+        before = timeval_to_us(&tv);		
+
+        if (do_readsector0(pp->fd, timeout) == 2)
+        {
+            condlog(0, "%s: path down", pp->dev);
+            return 1;
+        }
+        
+        (void)gettimeofday(&tv, NULL);
+        after = timeval_to_us(&tv);
+
+        delay = after - before;
+        if (delay < 0)
+        {
+            condlog(0, "%s: delay calc error", pp->dev);
+            return 1;
+        }
+    	
+        min = (min <= delay) ? min : delay;
+        max = (max >= delay) ? max : delay;
+                
+        toldelay += delay;
+    }
+
+    toldelay -= min + max;
+    avgdelay = toldelay/(long long)(cons_num - 2);
+    if (avgdelay > THRES_USEC_VALUE) 
+    {           
+        condlog(0, "%s: avgdelay is more than thresold", pp->dev);
+        return 1;
+    }
+    
+	ratio = get_conversion_ratio(type);
+	rc = (int)(THRES_USEC_VALUE - (avgdelay/(((long long)delay_interval) * ratio)));
+
+    return rc;
+}
diff --git a/libmultipath/prioritizers/delayedpath.h b/libmultipath/prioritizers/delayedpath.h
new file mode 100644
index 0000000..ca89702
--- /dev/null
+++ b/libmultipath/prioritizers/delayedpath.h
@@ -0,0 +1,14 @@ 
+#ifndef _DELAYEDPATH_H
+#define _DELAYEDPATH_H
+
+#define PRIO_DELAYED_PATH "delayedpath"
+#define USEC_PER_SEC      1000000LL
+#define USEC_PER_MSEC     1000LL
+#define USEC_PER_USEC     1LL
+
+static inline long long timeval_to_us(const struct timeval *tv)
+{
+	return ((long long) tv->tv_sec * USEC_PER_SEC) + tv->tv_usec;
+}
+
+#endif