diff mbox

[2/4] dm: implement REQ_FLUSH/FUA support

Message ID 1282929060-23663-3-git-send-email-tj@kernel.org (mailing list archive)
State Not Applicable, archived
Headers show

Commit Message

Tejun Heo Aug. 27, 2010, 5:10 p.m. UTC
None
diff mbox

Patch

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 368e8e9..d5b0e4c 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1278,7 +1278,7 @@  static int crypt_map(struct dm_target *ti, struct bio *bio,
 	struct dm_crypt_io *io;
 	struct crypt_config *cc;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		cc = ti->private;
 		bio->bi_bdev = cc->dev->bdev;
 		return DM_MAPIO_REMAPPED;
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 0590c75..136d4f7 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -31,7 +31,6 @@  struct dm_io_client {
  */
 struct io {
 	unsigned long error_bits;
-	unsigned long eopnotsupp_bits;
 	atomic_t count;
 	struct task_struct *sleeper;
 	struct dm_io_client *client;
@@ -130,11 +129,8 @@  static void retrieve_io_and_region_from_bio(struct bio *bio, struct io **io,
  *---------------------------------------------------------------*/
 static void dec_count(struct io *io, unsigned int region, int error)
 {
-	if (error) {
+	if (error)
 		set_bit(region, &io->error_bits);
-		if (error == -EOPNOTSUPP)
-			set_bit(region, &io->eopnotsupp_bits);
-	}
 
 	if (atomic_dec_and_test(&io->count)) {
 		if (io->sleeper)
@@ -310,8 +306,8 @@  static void do_region(int rw, unsigned region, struct dm_io_region *where,
 	sector_t remaining = where->count;
 
 	/*
-	 * where->count may be zero if rw holds a write barrier and we
-	 * need to send a zero-sized barrier.
+	 * where->count may be zero if rw holds a flush and we need to
+	 * send a zero-sized flush.
 	 */
 	do {
 		/*
@@ -364,7 +360,7 @@  static void dispatch_io(int rw, unsigned int num_regions,
 	 */
 	for (i = 0; i < num_regions; i++) {
 		*dp = old_pages;
-		if (where[i].count || (rw & REQ_HARDBARRIER))
+		if (where[i].count || (rw & REQ_FLUSH))
 			do_region(rw, i, where + i, dp, io);
 	}
 
@@ -393,9 +389,7 @@  static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		return -EIO;
 	}
 
-retry:
 	io->error_bits = 0;
-	io->eopnotsupp_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->sleeper = current;
 	io->client = client;
@@ -412,11 +406,6 @@  retry:
 	}
 	set_current_state(TASK_RUNNING);
 
-	if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
-		rw &= ~REQ_HARDBARRIER;
-		goto retry;
-	}
-
 	if (error_bits)
 		*error_bits = io->error_bits;
 
@@ -437,7 +426,6 @@  static int async_io(struct dm_io_client *client, unsigned int num_regions,
 
 	io = mempool_alloc(client->pool, GFP_NOIO);
 	io->error_bits = 0;
-	io->eopnotsupp_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->sleeper = NULL;
 	io->client = client;
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 5a08be0..33420e6 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -300,7 +300,7 @@  static int flush_header(struct log_c *lc)
 		.count = 0,
 	};
 
-	lc->io_req.bi_rw = WRITE_BARRIER;
+	lc->io_req.bi_rw = WRITE_FLUSH;
 
 	return dm_io(&lc->io_req, 1, &null_location, NULL);
 }
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 7c081bc..19a59b0 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -259,7 +259,7 @@  static int mirror_flush(struct dm_target *ti)
 	struct dm_io_region io[ms->nr_mirrors];
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE_BARRIER,
+		.bi_rw = WRITE_FLUSH,
 		.mem.type = DM_IO_KMEM,
 		.mem.ptr.bvec = NULL,
 		.client = ms->io_client,
@@ -629,7 +629,7 @@  static void do_write(struct mirror_set *ms, struct bio *bio)
 	struct dm_io_region io[ms->nr_mirrors], *dest = io;
 	struct mirror *m;
 	struct dm_io_request io_req = {
-		.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
+		.bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
 		.mem.type = DM_IO_BVEC,
 		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
 		.notify.fn = write_callback,
@@ -670,7 +670,7 @@  static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 	bio_list_init(&requeue);
 
 	while ((bio = bio_list_pop(writes))) {
-		if (unlikely(bio_empty_barrier(bio))) {
+		if (bio->bi_rw & REQ_FLUSH) {
 			bio_list_add(&sync, bio);
 			continue;
 		}
@@ -1203,7 +1203,7 @@  static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 	 * We need to dec pending if this was a write.
 	 */
 	if (rw == WRITE) {
-		if (likely(!bio_empty_barrier(bio)))
+		if (!(bio->bi_rw & REQ_FLUSH))
 			dm_rh_dec(ms->rh, map_context->ll);
 		return error;
 	}
diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index bd5c58b..dad011a 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -81,9 +81,9 @@  struct dm_region_hash {
 	struct list_head failed_recovered_regions;
 
 	/*
-	 * If there was a barrier failure no regions can be marked clean.
+	 * If there was a flush failure no regions can be marked clean.
 	 */
-	int barrier_failure;
+	int flush_failure;
 
 	void *context;
 	sector_t target_begin;
@@ -217,7 +217,7 @@  struct dm_region_hash *dm_region_hash_create(
 	INIT_LIST_HEAD(&rh->quiesced_regions);
 	INIT_LIST_HEAD(&rh->recovered_regions);
 	INIT_LIST_HEAD(&rh->failed_recovered_regions);
-	rh->barrier_failure = 0;
+	rh->flush_failure = 0;
 
 	rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
 						      sizeof(struct dm_region));
@@ -399,8 +399,8 @@  void dm_rh_mark_nosync(struct dm_region_hash *rh, struct bio *bio)
 	region_t region = dm_rh_bio_to_region(rh, bio);
 	int recovering = 0;
 
-	if (bio_empty_barrier(bio)) {
-		rh->barrier_failure = 1;
+	if (bio->bi_rw & REQ_FLUSH) {
+		rh->flush_failure = 1;
 		return;
 	}
 
@@ -524,7 +524,7 @@  void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
 	struct bio *bio;
 
 	for (bio = bios->head; bio; bio = bio->bi_next) {
-		if (bio_empty_barrier(bio))
+		if (bio->bi_rw & REQ_FLUSH)
 			continue;
 		rh_inc(rh, dm_rh_bio_to_region(rh, bio));
 	}
@@ -555,9 +555,9 @@  void dm_rh_dec(struct dm_region_hash *rh, region_t region)
 		 */
 
 		/* do nothing for DM_RH_NOSYNC */
-		if (unlikely(rh->barrier_failure)) {
+		if (unlikely(rh->flush_failure)) {
 			/*
-			 * If a write barrier failed some time ago, we
+			 * If a write flush failed some time ago, we
 			 * don't know whether or not this write made it
 			 * to the disk, so we must resync the device.
 			 */
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index cc2bdb8..0b61792 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -687,7 +687,7 @@  static void persistent_commit_exception(struct dm_exception_store *store,
 	/*
 	 * Commit exceptions to disk.
 	 */
-	if (ps->valid && area_io(ps, WRITE_BARRIER))
+	if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
 		ps->valid = 0;
 
 	/*
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 5974d30..eed2101 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1587,7 +1587,7 @@  static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	chunk_t chunk;
 	struct dm_snap_pending_exception *pe = NULL;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		bio->bi_bdev = s->cow->bdev;
 		return DM_MAPIO_REMAPPED;
 	}
@@ -1691,7 +1691,7 @@  static int snapshot_merge_map(struct dm_target *ti, struct bio *bio,
 	int r = DM_MAPIO_REMAPPED;
 	chunk_t chunk;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		if (!map_context->target_request_nr)
 			bio->bi_bdev = s->origin->bdev;
 		else
@@ -2135,7 +2135,7 @@  static int origin_map(struct dm_target *ti, struct bio *bio,
 	struct dm_dev *dev = ti->private;
 	bio->bi_bdev = dev->bdev;
 
-	if (unlikely(bio_empty_barrier(bio)))
+	if (bio->bi_rw & REQ_FLUSH)
 		return DM_MAPIO_REMAPPED;
 
 	/* Only tell snapshots if this is a write */
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c297f6d..f0371b4 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -271,7 +271,7 @@  static int stripe_map(struct dm_target *ti, struct bio *bio,
 	uint32_t stripe;
 	unsigned target_request_nr;
 
-	if (unlikely(bio_empty_barrier(bio))) {
+	if (bio->bi_rw & REQ_FLUSH) {
 		target_request_nr = map_context->target_request_nr;
 		BUG_ON(target_request_nr >= sc->stripes);
 		bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b1d92be..52f4033 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -144,21 +144,21 @@  struct mapped_device {
 	spinlock_t deferred_lock;
 
 	/*
-	 * An error from the barrier request currently being processed.
+	 * An error from the flush request currently being processed.
 	 */
-	int barrier_error;
+	int flush_error;
 
 	/*
-	 * Protect barrier_error from concurrent endio processing
+	 * Protect flush_error from concurrent endio processing
 	 * in request-based dm.
 	 */
-	spinlock_t barrier_error_lock;
+	spinlock_t flush_error_lock;
 
 	/*
-	 * Processing queue (flush/barriers)
+	 * Processing queue (flush)
 	 */
 	struct workqueue_struct *wq;
-	struct work_struct barrier_work;
+	struct work_struct flush_work;
 
 	/* A pointer to the currently processing pre/post flush request */
 	struct request *flush_request;
@@ -200,8 +200,8 @@  struct mapped_device {
 	/* sysfs handle */
 	struct kobject kobj;
 
-	/* zero-length barrier that will be cloned and submitted to targets */
-	struct bio barrier_bio;
+	/* zero-length flush that will be cloned and submitted to targets */
+	struct bio flush_bio;
 };
 
 /*
@@ -512,7 +512,7 @@  static void end_io_acct(struct dm_io *io)
 
 	/*
 	 * After this is decremented the bio must not be touched if it is
-	 * a barrier.
+	 * a flush.
 	 */
 	dm_disk(md)->part0.in_flight[rw] = pending =
 		atomic_dec_return(&md->pending[rw]);
@@ -626,7 +626,7 @@  static void dec_pending(struct dm_io *io, int error)
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
 			if (__noflush_suspending(md)) {
-				if (!(io->bio->bi_rw & REQ_HARDBARRIER))
+				if (!(io->bio->bi_rw & REQ_FLUSH))
 					bio_list_add_head(&md->deferred,
 							  io->bio);
 			} else
@@ -638,20 +638,14 @@  static void dec_pending(struct dm_io *io, int error)
 		io_error = io->error;
 		bio = io->bio;
 
-		if (bio->bi_rw & REQ_HARDBARRIER) {
+		if (bio->bi_rw & REQ_FLUSH) {
 			/*
-			 * There can be just one barrier request so we use
+			 * There can be just one flush request so we use
 			 * a per-device variable for error reporting.
 			 * Note that you can't touch the bio after end_io_acct
-			 *
-			 * We ignore -EOPNOTSUPP for empty flush reported by
-			 * underlying devices. We assume that if the device
-			 * doesn't support empty barriers, it doesn't need
-			 * cache flushing commands.
 			 */
-			if (!md->barrier_error &&
-			    !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
-				md->barrier_error = io_error;
+			if (!md->flush_error)
+				md->flush_error = io_error;
 			end_io_acct(io);
 			free_io(md, io);
 		} else {
@@ -755,21 +749,23 @@  static void end_clone_bio(struct bio *clone, int error)
 	blk_update_request(tio->orig, 0, nr_bytes);
 }
 
-static void store_barrier_error(struct mapped_device *md, int error)
+static void store_flush_error(struct mapped_device *md, int error)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&md->barrier_error_lock, flags);
+	spin_lock_irqsave(&md->flush_error_lock, flags);
 	/*
-	 * Basically, the first error is taken, but:
-	 *   -EOPNOTSUPP supersedes any I/O error.
-	 *   Requeue request supersedes any I/O error but -EOPNOTSUPP.
+	 * Basically, the first error is taken, but requeue request
+	 * supersedes any I/O error.
+	 *
+	 * FIXME: Depending on device and error type, a FLUSH failure
+	 * might imply that data is already lost.  This needs to be
+	 * updated such that hard FLUSH IO failures trump requeue
+	 * requests, not the other way around.
 	 */
-	if (!md->barrier_error || error == -EOPNOTSUPP ||
-	    (md->barrier_error != -EOPNOTSUPP &&
-	     error == DM_ENDIO_REQUEUE))
-		md->barrier_error = error;
-	spin_unlock_irqrestore(&md->barrier_error_lock, flags);
+	if (!md->flush_error || error == DM_ENDIO_REQUEUE)
+		md->flush_error = error;
+	spin_unlock_irqrestore(&md->flush_error_lock, flags);
 }
 
 /*
@@ -810,12 +806,11 @@  static void dm_end_request(struct request *clone, int error)
 {
 	int rw = rq_data_dir(clone);
 	int run_queue = 1;
-	bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 	struct request *rq = tio->orig;
 
-	if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
+	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
 		rq->errors = clone->errors;
 		rq->resid_len = clone->resid_len;
 
@@ -830,9 +825,9 @@  static void dm_end_request(struct request *clone, int error)
 
 	free_rq_clone(clone);
 
-	if (unlikely(is_barrier)) {
+	if (clone->cmd_flags & REQ_FLUSH) {
 		if (unlikely(error))
-			store_barrier_error(md, error);
+			store_flush_error(md, error);
 		run_queue = 0;
 	} else
 		blk_end_request_all(rq, error);
@@ -862,9 +857,9 @@  void dm_requeue_unmapped_request(struct request *clone)
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
+	if (clone->cmd_flags & REQ_FLUSH) {
 		/*
-		 * Barrier clones share an original request.
+		 * Flush clones share an original request.
 		 * Leave it to dm_end_request(), which handles this special
 		 * case.
 		 */
@@ -961,14 +956,14 @@  static void dm_complete_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
+	if (clone->cmd_flags & REQ_FLUSH) {
 		/*
-		 * Barrier clones share an original request.  So can't use
+		 * Flush clones share an original request.  So can't use
 		 * softirq_done with the original.
 		 * Pass the clone to dm_done() directly in this special case.
 		 * It is safe (even if clone->q->queue_lock is held here)
 		 * because there is no I/O dispatching during the completion
-		 * of barrier clone.
+		 * of flush clone.
 		 */
 		dm_done(clone, error, true);
 		return;
@@ -990,9 +985,9 @@  void dm_kill_unmapped_request(struct request *clone, int error)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct request *rq = tio->orig;
 
-	if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
+	if (clone->cmd_flags & REQ_FLUSH) {
 		/*
-		 * Barrier clones share an original request.
+		 * Flush clones share an original request.
 		 * Leave it to dm_end_request(), which handles this special
 		 * case.
 		 */
@@ -1119,7 +1114,7 @@  static void dm_bio_destructor(struct bio *bio)
 }
 
 /*
- * Creates a little bio that is just does part of a bvec.
+ * Creates a little bio that just does part of a bvec.
  */
 static struct bio *split_bvec(struct bio *bio, sector_t sector,
 			      unsigned short idx, unsigned int offset,
@@ -1134,7 +1129,7 @@  static struct bio *split_bvec(struct bio *bio, sector_t sector,
 
 	clone->bi_sector = sector;
 	clone->bi_bdev = bio->bi_bdev;
-	clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
+	clone->bi_rw = bio->bi_rw;
 	clone->bi_vcnt = 1;
 	clone->bi_size = to_bytes(len);
 	clone->bi_io_vec->bv_offset = offset;
@@ -1161,7 +1156,6 @@  static struct bio *clone_bio(struct bio *bio, sector_t sector,
 
 	clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
 	__bio_clone(clone, bio);
-	clone->bi_rw &= ~REQ_HARDBARRIER;
 	clone->bi_destructor = dm_bio_destructor;
 	clone->bi_sector = sector;
 	clone->bi_idx = idx;
@@ -1225,7 +1219,7 @@  static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
 		__issue_target_request(ci, ti, request_nr, len);
 }
 
-static int __clone_and_map_empty_barrier(struct clone_info *ci)
+static int __clone_and_map_flush(struct clone_info *ci)
 {
 	unsigned target_nr = 0;
 	struct dm_target *ti;
@@ -1289,9 +1283,6 @@  static int __clone_and_map(struct clone_info *ci)
 	sector_t len = 0, max;
 	struct dm_target_io *tio;
 
-	if (unlikely(bio_empty_barrier(bio)))
-		return __clone_and_map_empty_barrier(ci);
-
 	if (unlikely(bio->bi_rw & REQ_DISCARD))
 		return __clone_and_map_discard(ci);
 
@@ -1383,11 +1374,11 @@  static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 
 	ci.map = dm_get_live_table(md);
 	if (unlikely(!ci.map)) {
-		if (!(bio->bi_rw & REQ_HARDBARRIER))
+		if (!(bio->bi_rw & REQ_FLUSH))
 			bio_io_error(bio);
 		else
-			if (!md->barrier_error)
-				md->barrier_error = -EIO;
+			if (!md->flush_error)
+				md->flush_error = -EIO;
 		return;
 	}
 
@@ -1400,14 +1391,22 @@  static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	ci.io->md = md;
 	spin_lock_init(&ci.io->endio_lock);
 	ci.sector = bio->bi_sector;
-	ci.sector_count = bio_sectors(bio);
-	if (unlikely(bio_empty_barrier(bio)))
+	if (!(bio->bi_rw & REQ_FLUSH))
+		ci.sector_count = bio_sectors(bio);
+	else {
+		/* all FLUSH bio's reaching here should be empty */
+		WARN_ON_ONCE(bio_has_data(bio));
 		ci.sector_count = 1;
+	}
 	ci.idx = bio->bi_idx;
 
 	start_io_acct(ci.io);
-	while (ci.sector_count && !error)
-		error = __clone_and_map(&ci);
+	while (ci.sector_count && !error) {
+		if (!(bio->bi_rw & REQ_FLUSH))
+			error = __clone_and_map(&ci);
+		else
+			error = __clone_and_map_flush(&ci);
+	}
 
 	/* drop the extra reference count */
 	dec_pending(ci.io, error);
@@ -1492,11 +1491,11 @@  static int _dm_request(struct request_queue *q, struct bio *bio)
 	part_stat_unlock();
 
 	/*
-	 * If we're suspended or the thread is processing barriers
+	 * If we're suspended or the thread is processing flushes
 	 * we have to queue this io for later.
 	 */
 	if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
-	    unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
+	    (bio->bi_rw & REQ_FLUSH)) {
 		up_read(&md->io_lock);
 
 		if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
@@ -1595,7 +1594,7 @@  static int setup_clone(struct request *clone, struct request *rq,
 	if (dm_rq_is_flush_request(rq)) {
 		blk_rq_init(NULL, clone);
 		clone->cmd_type = REQ_TYPE_FS;
-		clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
+		clone->cmd_flags |= WRITE_FLUSH;
 	} else {
 		r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
 				      dm_rq_bio_constructor, tio);
@@ -1735,15 +1734,18 @@  static void dm_request_fn(struct request_queue *q)
 	 * dm_suspend().
 	 */
 	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+		if (md->flush_request)
+			goto plug_and_out;
+
 		rq = blk_peek_request(q);
 		if (!rq)
 			goto plug_and_out;
 
-		if (unlikely(dm_rq_is_flush_request(rq))) {
+		if (dm_rq_is_flush_request(rq)) {
 			BUG_ON(md->flush_request);
 			md->flush_request = rq;
 			blk_start_request(rq);
-			queue_work(md->wq, &md->barrier_work);
+			queue_work(md->wq, &md->flush_work);
 			goto out;
 		}
 
@@ -1918,7 +1920,7 @@  out:
 static const struct block_device_operations dm_blk_dops;
 
 static void dm_wq_work(struct work_struct *work);
-static void dm_rq_barrier_work(struct work_struct *work);
+static void dm_rq_flush_work(struct work_struct *work);
 
 static void dm_init_md_queue(struct mapped_device *md)
 {
@@ -1940,6 +1942,7 @@  static void dm_init_md_queue(struct mapped_device *md)
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 }
 
 /*
@@ -1972,7 +1975,7 @@  static struct mapped_device *alloc_dev(int minor)
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
 	spin_lock_init(&md->deferred_lock);
-	spin_lock_init(&md->barrier_error_lock);
+	spin_lock_init(&md->flush_error_lock);
 	rwlock_init(&md->map_lock);
 	atomic_set(&md->holders, 1);
 	atomic_set(&md->open_count, 0);
@@ -1995,7 +1998,7 @@  static struct mapped_device *alloc_dev(int minor)
 	atomic_set(&md->pending[1], 0);
 	init_waitqueue_head(&md->wait);
 	INIT_WORK(&md->work, dm_wq_work);
-	INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
+	INIT_WORK(&md->flush_work, dm_rq_flush_work);
 	init_waitqueue_head(&md->eventq);
 
 	md->disk->major = _major;
@@ -2245,7 +2248,7 @@  static int dm_init_request_based_queue(struct mapped_device *md)
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 	blk_queue_lld_busy(md->queue, dm_lld_busy);
-	blk_queue_flush(md->queue, REQ_FLUSH);
+	blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
 
 	elv_register_queue(md->queue);
 
@@ -2406,41 +2409,35 @@  static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	return r;
 }
 
-static void dm_flush(struct mapped_device *md)
+static void process_flush(struct mapped_device *md, struct bio *bio)
 {
-	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-
-	bio_init(&md->barrier_bio);
-	md->barrier_bio.bi_bdev = md->bdev;
-	md->barrier_bio.bi_rw = WRITE_BARRIER;
-	__split_and_process_bio(md, &md->barrier_bio);
+	md->flush_error = 0;
 
+	/* handle REQ_FLUSH */
 	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-}
 
-static void process_barrier(struct mapped_device *md, struct bio *bio)
-{
-	md->barrier_error = 0;
+	bio_init(&md->flush_bio);
+	md->flush_bio.bi_bdev = md->bdev;
+	md->flush_bio.bi_rw = WRITE_FLUSH;
+	__split_and_process_bio(md, &md->flush_bio);
 
-	dm_flush(md);
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
 
-	if (!bio_empty_barrier(bio)) {
-		__split_and_process_bio(md, bio);
-		/*
-		 * If the request isn't supported, don't waste time with
-		 * the second flush.
-		 */
-		if (md->barrier_error != -EOPNOTSUPP)
-			dm_flush(md);
+	/* if it's an empty flush or the preflush failed, we're done */
+	if (!bio_has_data(bio) || md->flush_error) {
+		if (md->flush_error != DM_ENDIO_REQUEUE)
+			bio_endio(bio, md->flush_error);
+		else {
+			spin_lock_irq(&md->deferred_lock);
+			bio_list_add_head(&md->deferred, bio);
+			spin_unlock_irq(&md->deferred_lock);
+		}
+		return;
 	}
 
-	if (md->barrier_error != DM_ENDIO_REQUEUE)
-		bio_endio(bio, md->barrier_error);
-	else {
-		spin_lock_irq(&md->deferred_lock);
-		bio_list_add_head(&md->deferred, bio);
-		spin_unlock_irq(&md->deferred_lock);
-	}
+	/* issue data + REQ_FUA */
+	bio->bi_rw &= ~REQ_FLUSH;
+	__split_and_process_bio(md, bio);
 }
 
 /*
@@ -2469,8 +2466,8 @@  static void dm_wq_work(struct work_struct *work)
 		if (dm_request_based(md))
 			generic_make_request(c);
 		else {
-			if (c->bi_rw & REQ_HARDBARRIER)
-				process_barrier(md, c);
+			if (c->bi_rw & REQ_FLUSH)
+				process_flush(md, c);
 			else
 				__split_and_process_bio(md, c);
 		}
@@ -2495,8 +2492,8 @@  static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_
 	tio->info.target_request_nr = request_nr;
 }
 
-/* Issue barrier requests to targets and wait for their completion. */
-static int dm_rq_barrier(struct mapped_device *md)
+/* Issue flush requests to targets and wait for their completion. */
+static int dm_rq_flush(struct mapped_device *md)
 {
 	int i, j;
 	struct dm_table *map = dm_get_live_table(md);
@@ -2504,7 +2501,7 @@  static int dm_rq_barrier(struct mapped_device *md)
 	struct dm_target *ti;
 	struct request *clone;
 
-	md->barrier_error = 0;
+	md->flush_error = 0;
 
 	for (i = 0; i < num_targets; i++) {
 		ti = dm_table_get_target(map, i);
@@ -2519,26 +2516,26 @@  static int dm_rq_barrier(struct mapped_device *md)
 	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
 	dm_table_put(map);
 
-	return md->barrier_error;
+	return md->flush_error;
 }
 
-static void dm_rq_barrier_work(struct work_struct *work)
+static void dm_rq_flush_work(struct work_struct *work)
 {
 	int error;
 	struct mapped_device *md = container_of(work, struct mapped_device,
-						barrier_work);
+						flush_work);
 	struct request_queue *q = md->queue;
 	struct request *rq;
 	unsigned long flags;
 
 	/*
 	 * Hold the md reference here and leave it at the last part so that
-	 * the md can't be deleted by device opener when the barrier request
+	 * the md can't be deleted by device opener when the flush request
 	 * completes.
 	 */
 	dm_get(md);
 
-	error = dm_rq_barrier(md);
+	error = dm_rq_flush(md);
 
 	rq = md->flush_request;
 	md->flush_request = NULL;
@@ -2691,7 +2688,7 @@  int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	up_write(&md->io_lock);
 
 	/*
-	 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
+	 * Request-based dm uses md->wq for flush (dm_rq_flush_work) which
 	 * can be kicked until md->queue is stopped.  So stop md->queue before
 	 * flushing md->wq.
 	 */