diff mbox

Btrfs-progs: detect if the disk we are formatting is a ssd V2

Message ID 1342811743-8748-1-git-send-email-jbacik@fusionio.com (mailing list archive)
State New, archived
Headers show

Commit Message

Josef Bacik July 20, 2012, 7:15 p.m. UTC
SSD's do not gain anything by having metadata DUP turned on.  The underlying
file system that is a part of all SSD's could easily map duplicate metadat
blocks into the same erase block which effectively eliminates the benefit of
duplicating the metadata on disk.  So detect if we are formatting a single
SSD drive and if we are do not use DUP.  Thanks,

Signed-off-by: Josef Bacik <jbacik@fusionio.com>
---
V1->V2: use blkid to get the full disk in case we happen to be formatting a
partition.

 Makefile |    2 +-
 mkfs.c   |   54 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 2 files changed, 52 insertions(+), 4 deletions(-)

Comments

Goffredo Baroncelli July 20, 2012, 7:36 p.m. UTC | #1
On 07/20/2012 09:15 PM, Josef Bacik wrote:
> SSD's do not gain anything by having metadata DUP turned on.  The underlying
> file system that is a part of all SSD's could easily map duplicate metadat

If I understood correctly you are stating that because an SSD *might*
"eliminates the benefit of duplicating the metadata"  mkfs.btrfs *must*
remove _silently_ this behaviour on all SSD ?

To me it seems too strong; or almost it should be documented in the man
page and/or issuing a warning during the format process.

> blocks into the same erase block which effectively eliminates the benefit of
> duplicating the metadata on disk.  So detect if we are formatting a single
> SSD drive and if we are do not use DUP.  Thanks,
> 
> Signed-off-by: Josef Bacik <jbacik@fusionio.com>
> ---
> V1->V2: use blkid to get the full disk in case we happen to be formatting a
> partition.
> 
>  Makefile |    2 +-
>  mkfs.c   |   54 +++++++++++++++++++++++++++++++++++++++++++++++++++---
>  2 files changed, 52 insertions(+), 4 deletions(-)
> 
> diff --git a/Makefile b/Makefile
> index 9694444..d827216 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -65,7 +65,7 @@ btrfsck: $(objects) btrfsck.o
>  	$(CC) $(CFLAGS) -o btrfsck btrfsck.o $(objects) $(LDFLAGS) $(LIBS)
>  
>  mkfs.btrfs: $(objects) mkfs.o
> -	$(CC) $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o $(LDFLAGS) $(LIBS)
> +	$(CC) $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o $(LDFLAGS) $(LIBS) -lblkid
>  
>  btrfs-debug-tree: $(objects) debug-tree.o
>  	$(CC) $(CFLAGS) -o btrfs-debug-tree $(objects) debug-tree.o $(LDFLAGS) $(LIBS)
> diff --git a/mkfs.c b/mkfs.c
> index dff5eb8..fc2b6ed 100644
> --- a/mkfs.c
> +++ b/mkfs.c
> @@ -37,6 +37,7 @@
>  #include <linux/fs.h>
>  #include <ctype.h>
>  #include <attr/xattr.h>
> +#include <blkid/blkid.h>
>  #include "kerncompat.h"
>  #include "ctree.h"
>  #include "disk-io.h"
> @@ -234,7 +235,7 @@ static int create_one_raid_group(struct btrfs_trans_handle *trans,
>  static int create_raid_groups(struct btrfs_trans_handle *trans,
>  			      struct btrfs_root *root, u64 data_profile,
>  			      int data_profile_opt, u64 metadata_profile,
> -			      int metadata_profile_opt, int mixed)
> +			      int metadata_profile_opt, int mixed, int ssd)
>  {
>  	u64 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
>  	u64 allowed;
> @@ -246,7 +247,7 @@ static int create_raid_groups(struct btrfs_trans_handle *trans,
>  	 */
>  	if (!metadata_profile_opt && !mixed) {

Please put something like

+		if(ssd)
+		    printf("SSD detected: remove the metadata duplication;");


>  		metadata_profile = (num_devices > 1) ?
> -			BTRFS_BLOCK_GROUP_RAID1 : BTRFS_BLOCK_GROUP_DUP;
> +			BTRFS_BLOCK_GROUP_RAID1 : (ssd) ? 0: BTRFS_BLOCK_GROUP_DUP;
>  	}
>  	if (!data_profile_opt && !mixed) {
>  		data_profile = (num_devices > 1) ?
> @@ -1201,6 +1202,49 @@ static int zero_output_file(int out_fd, u64 size, u32 sectorsize)
>  	return ret;
>  }
>  
> +static int is_ssd(const char *file)
> +{
> +	char *devname;
> +	blkid_probe probe;
> +	char *dev;
> +	char path[PATH_MAX];
> +	dev_t disk;
> +	int fd;
> +	char rotational;
> +
> +	probe = blkid_new_probe_from_filename(file);
> +	if (!probe)
> +		return 0;
> +
> +	/*
> +	 * We want to use blkid_devno_to_wholedisk() but it's broken for some
> +	 * reason on F17 at least so we'll do this trickery
> +	 */
> +	disk = blkid_probe_get_wholedisk_devno(probe);
> +	devname = blkid_devno_to_devname(disk);
> +
> +	dev = strrchr(devname, '/');
> +	dev++;
> +
> +	snprintf(path, PATH_MAX, "/sys/block/%s/queue/rotational", dev);
> +
> +	free(devname);
> +	blkid_free_probe(probe);
> +
> +	fd = open(path, O_RDONLY);
> +	if (fd < 0) {
> +		return 0;
> +	}
> +
> +	if (read(fd, &rotational, sizeof(char)) < sizeof(char)) {
> +		close(fd);
> +		return 0;
> +	}
> +	close(fd);
> +
> +	return !atoi((const char *)&rotational);
> +}
> +
>  int main(int ac, char **av)
>  {
>  	char *file;
> @@ -1227,6 +1271,7 @@ int main(int ac, char **av)
>  	int data_profile_opt = 0;
>  	int metadata_profile_opt = 0;
>  	int nodiscard = 0;
> +	int ssd = 0;
>  
>  	char *source_dir = NULL;
>  	int source_dir_set = 0;
> @@ -1352,6 +1397,9 @@ int main(int ac, char **av)
>  			exit(1);
>  		}
>  	}
> +
> +	ssd = is_ssd(file);
> +


>  	if (mixed) {
>  		if (metadata_profile != data_profile) {
>  			fprintf(stderr, "With mixed block groups data and metadata "
> @@ -1438,7 +1486,7 @@ raid_groups:
>  	if (!source_dir_set) {
>  		ret = create_raid_groups(trans, root, data_profile,
>  				 data_profile_opt, metadata_profile,
> -				 metadata_profile_opt, mixed);
> +				 metadata_profile_opt, mixed, ssd);
>  		BUG_ON(ret);
>  	}
>  

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Wendy Cheng July 20, 2012, 10:38 p.m. UTC | #2
On Fri, Jul 20, 2012 at 12:36 PM, Goffredo Baroncelli
<kreijack@libero.it> wrote:
> On 07/20/2012 09:15 PM, Josef Bacik wrote:
>> SSD's do not gain anything by having metadata DUP turned on.  The underlying
>> file system that is a part of all SSD's could easily map duplicate metadat
>
> If I understood correctly you are stating that because an SSD *might*
> "eliminates the benefit of duplicating the metadata"  mkfs.btrfs *must*
> remove _silently_ this behaviour on all SSD ?
>
> To me it seems too strong; or almost it should be documented in the man
> page and/or issuing a warning during the format process.

I'll have to second this .. this is my first time looking into btrfs -
do feel free to correct me if my reading is not correct.

Based on https://btrfs.wiki.kernel.org/index.php/Glossary, I assume
the DUP is a flag to ask for meta-data duplication within the same
device entity. This implies whenever an FS (meta-data) block is
updated, the duplicated FS block needs to be modified as well (true
?). So within a conventional SSD firmware implementation, it is true
that both FS blocks could end up in the same SSD block that get erased
and re-allocated together. Similar thing could happen with disks that
have embedded de-duplication feature turned on.

However, this should have been a task for the admin (or whoever types
this mkfs command). It is not a filesystem's job to assume how the
firmware works and silently ignore the DUP request, *unless* there is
a standard specification clearly describes linux devices that claim to
be not "rotational" should behave this way.

-- Wendy
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik July 23, 2012, 12:46 p.m. UTC | #3
On Fri, Jul 20, 2012 at 04:38:59PM -0600, Wendy Cheng wrote:
> On Fri, Jul 20, 2012 at 12:36 PM, Goffredo Baroncelli
> <kreijack@libero.it> wrote:
> > On 07/20/2012 09:15 PM, Josef Bacik wrote:
> >> SSD's do not gain anything by having metadata DUP turned on.  The underlying
> >> file system that is a part of all SSD's could easily map duplicate metadat
> >
> > If I understood correctly you are stating that because an SSD *might*
> > "eliminates the benefit of duplicating the metadata"  mkfs.btrfs *must*
> > remove _silently_ this behaviour on all SSD ?
> >
> > To me it seems too strong; or almost it should be documented in the man
> > page and/or issuing a warning during the format process.
> 
> I'll have to second this .. this is my first time looking into btrfs -
> do feel free to correct me if my reading is not correct.
> 
> Based on https://btrfs.wiki.kernel.org/index.php/Glossary, I assume
> the DUP is a flag to ask for meta-data duplication within the same
> device entity. This implies whenever an FS (meta-data) block is
> updated, the duplicated FS block needs to be modified as well (true
> ?). So within a conventional SSD firmware implementation, it is true
> that both FS blocks could end up in the same SSD block that get erased
> and re-allocated together. Similar thing could happen with disks that
> have embedded de-duplication feature turned on.
> 
> However, this should have been a task for the admin (or whoever types
> this mkfs command). It is not a filesystem's job to assume how the
> firmware works and silently ignore the DUP request, *unless* there is
> a standard specification clearly describes linux devices that claim to
> be not "rotational" should behave this way.
> 

The admin can still use -m dup if he wants the added possiblity of protection,
this just makes the default not dup.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Goffredo Baroncelli July 23, 2012, 5:01 p.m. UTC | #4
On 07/23/2012 02:46 PM, Josef Bacik wrote:
> On Fri, Jul 20, 2012 at 04:38:59PM -0600, Wendy Cheng wrote:

>> However, this should have been a task for the admin (or whoever types
>> this mkfs command). It is not a filesystem's job to assume how the
>> firmware works and silently ignore the DUP request, *unless* there is
>> a standard specification clearly describes linux devices that claim to
>> be not "rotational" should behave this way.
>>
> 
> The admin can still use -m dup if he wants the added possiblity of protection,
> this just makes the default not dup.  

Josef,

this was clear to us with the the code at hand. However, what was
pointed out is that is change of a well established behaviour without
documenting it not informing it.

I don't arguing about the rationale, what I am telling is : ok to the
change but you have to inform the user.

Saying "but the user can revert the change passing '-m dup'" is not a
valid response, because the user *could* revert the change if he *would*
be informed about the change.

BR
G.Baroncelli



>Thanks,
> 
> Josef
> .
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Josef Bacik July 23, 2012, 5:06 p.m. UTC | #5
On Mon, Jul 23, 2012 at 11:01:17AM -0600, Goffredo Baroncelli wrote:
> On 07/23/2012 02:46 PM, Josef Bacik wrote:
> > On Fri, Jul 20, 2012 at 04:38:59PM -0600, Wendy Cheng wrote:
> 
> >> However, this should have been a task for the admin (or whoever types
> >> this mkfs command). It is not a filesystem's job to assume how the
> >> firmware works and silently ignore the DUP request, *unless* there is
> >> a standard specification clearly describes linux devices that claim to
> >> be not "rotational" should behave this way.
> >>
> > 
> > The admin can still use -m dup if he wants the added possiblity of protection,
> > this just makes the default not dup.  
> 
> Josef,
> 
> this was clear to us with the the code at hand. However, what was
> pointed out is that is change of a well established behaviour without
> documenting it not informing it.
> 
> I don't arguing about the rationale, what I am telling is : ok to the
> change but you have to inform the user.
> 
> Saying "but the user can revert the change passing '-m dup'" is not a
> valid response, because the user *could* revert the change if he *would*
> be informed about the change.
> 

Yeah sorry Goffredo I agree and I changed it locally, I just haven't sent it out
yet.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/Makefile b/Makefile
index 9694444..d827216 100644
--- a/Makefile
+++ b/Makefile
@@ -65,7 +65,7 @@  btrfsck: $(objects) btrfsck.o
 	$(CC) $(CFLAGS) -o btrfsck btrfsck.o $(objects) $(LDFLAGS) $(LIBS)
 
 mkfs.btrfs: $(objects) mkfs.o
-	$(CC) $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o $(LDFLAGS) $(LIBS)
+	$(CC) $(CFLAGS) -o mkfs.btrfs $(objects) mkfs.o $(LDFLAGS) $(LIBS) -lblkid
 
 btrfs-debug-tree: $(objects) debug-tree.o
 	$(CC) $(CFLAGS) -o btrfs-debug-tree $(objects) debug-tree.o $(LDFLAGS) $(LIBS)
diff --git a/mkfs.c b/mkfs.c
index dff5eb8..fc2b6ed 100644
--- a/mkfs.c
+++ b/mkfs.c
@@ -37,6 +37,7 @@ 
 #include <linux/fs.h>
 #include <ctype.h>
 #include <attr/xattr.h>
+#include <blkid/blkid.h>
 #include "kerncompat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -234,7 +235,7 @@  static int create_one_raid_group(struct btrfs_trans_handle *trans,
 static int create_raid_groups(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root, u64 data_profile,
 			      int data_profile_opt, u64 metadata_profile,
-			      int metadata_profile_opt, int mixed)
+			      int metadata_profile_opt, int mixed, int ssd)
 {
 	u64 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy);
 	u64 allowed;
@@ -246,7 +247,7 @@  static int create_raid_groups(struct btrfs_trans_handle *trans,
 	 */
 	if (!metadata_profile_opt && !mixed) {
 		metadata_profile = (num_devices > 1) ?
-			BTRFS_BLOCK_GROUP_RAID1 : BTRFS_BLOCK_GROUP_DUP;
+			BTRFS_BLOCK_GROUP_RAID1 : (ssd) ? 0: BTRFS_BLOCK_GROUP_DUP;
 	}
 	if (!data_profile_opt && !mixed) {
 		data_profile = (num_devices > 1) ?
@@ -1201,6 +1202,49 @@  static int zero_output_file(int out_fd, u64 size, u32 sectorsize)
 	return ret;
 }
 
+static int is_ssd(const char *file)
+{
+	char *devname;
+	blkid_probe probe;
+	char *dev;
+	char path[PATH_MAX];
+	dev_t disk;
+	int fd;
+	char rotational;
+
+	probe = blkid_new_probe_from_filename(file);
+	if (!probe)
+		return 0;
+
+	/*
+	 * We want to use blkid_devno_to_wholedisk() but it's broken for some
+	 * reason on F17 at least so we'll do this trickery
+	 */
+	disk = blkid_probe_get_wholedisk_devno(probe);
+	devname = blkid_devno_to_devname(disk);
+
+	dev = strrchr(devname, '/');
+	dev++;
+
+	snprintf(path, PATH_MAX, "/sys/block/%s/queue/rotational", dev);
+
+	free(devname);
+	blkid_free_probe(probe);
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0) {
+		return 0;
+	}
+
+	if (read(fd, &rotational, sizeof(char)) < sizeof(char)) {
+		close(fd);
+		return 0;
+	}
+	close(fd);
+
+	return !atoi((const char *)&rotational);
+}
+
 int main(int ac, char **av)
 {
 	char *file;
@@ -1227,6 +1271,7 @@  int main(int ac, char **av)
 	int data_profile_opt = 0;
 	int metadata_profile_opt = 0;
 	int nodiscard = 0;
+	int ssd = 0;
 
 	char *source_dir = NULL;
 	int source_dir_set = 0;
@@ -1352,6 +1397,9 @@  int main(int ac, char **av)
 			exit(1);
 		}
 	}
+
+	ssd = is_ssd(file);
+
 	if (mixed) {
 		if (metadata_profile != data_profile) {
 			fprintf(stderr, "With mixed block groups data and metadata "
@@ -1438,7 +1486,7 @@  raid_groups:
 	if (!source_dir_set) {
 		ret = create_raid_groups(trans, root, data_profile,
 				 data_profile_opt, metadata_profile,
-				 metadata_profile_opt, mixed);
+				 metadata_profile_opt, mixed, ssd);
 		BUG_ON(ret);
 	}