Message ID | 2291ba747c6c9701952fa75140684535cfe4ab3e.1646589622.git.kreijack@inwind.it (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | btrfs: allocation_hint | expand |
On Sun, Mar 06, 2022 at 07:14:42PM +0100, Goffredo Baroncelli wrote: > From: Goffredo Baroncelli <kreijack@inwind.it> > > The chunk allocation policy is modified as follow. > > Each disk may have one of the following tags: > - BTRFS_DEV_ALLOCATION_METADATA_PREFERRED > - BTRFS_DEV_ALLOCATION_METADATA_ONLY > - BTRFS_DEV_ALLOCATION_DATA_ONLY > - BTRFS_DEV_ALLOCATION_DATA_PREFERRED (default) > > During a *mixed data/metadata* chunk allocation, BTRFS works as > usual. > > During a *data* chunk allocation, the space are searched first in > BTRFS_DEV_ALLOCATION_DATA_ONLY. If the space found is not enough (eg. > in raid5, only two disks are available), then the disks tagged > BTRFS_DEV_ALLOCATION_DATA_PREFERRED are considered. If the space is not > enough again, the disks tagged BTRFS_DEV_ALLOCATION_METADATA_PREFERRED > are also considered. If even in this case the space is not > sufficient, -ENOSPC is raised. > A disk tagged with BTRFS_DEV_ALLOCATION_METADATA_ONLY is never considered > for a data BG allocation. > > During a *metadata* chunk allocation, the same algorithm applies swapping > _DATA_ and _METADATA_. > > By default the disks are tagged as BTRFS_DEV_ALLOCATION_DATA_PREFERRED, > so BTRFS behaves as usual. > > If the user prefers to store the metadata in the faster disks (e.g. SSD), > he can tag these with BTRFS_DEV_ALLOCATION_METADATA_PREFERRED: in this > case the metadata BG go in the BTRFS_DEV_ALLOCATION_METADATA_PREFERRED > disks and the data BG in the others ones. When a disks set is filled, the > other is considered. > > Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> > --- > fs/btrfs/volumes.c | 113 +++++++++++++++++++++++++++++++++++++++++++-- > fs/btrfs/volumes.h | 1 + > 2 files changed, 111 insertions(+), 3 deletions(-) > > diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c > index d4ac90f5c949..7b37db9bb887 100644 > --- a/fs/btrfs/volumes.c > +++ b/fs/btrfs/volumes.c > @@ -184,6 +184,27 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags > return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ > } > > +#define BTRFS_DEV_ALLOCATION_HINT_COUNT (1ULL << \ > + BTRFS_DEV_ALLOCATION_HINT_BIT_COUNT) > + > +/* > + * The order of BTRFS_DEV_ALLOCATION_HINT_* values are not > + * good, because BTRFS_DEV_ALLOCATION_HINT_DATA_PREFERRED is 0 > + * (for backward compatibility reason), and the other > + * values are greater (because the field is unsigned). So we > + * need a map that rearranges the order giving to _DATA_PREFERRED > + * an intermediate priority. > + * These values give to METADATA_ONLY the highest priority, and are > + * valid for metadata BG allocation. When a data > + * BG is allocated we negate these values to reverse the priority. > + */ > +static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_HINT_COUNT] = { > + [BTRFS_DEV_ALLOCATION_HINT_DATA_ONLY] = -1, > + [BTRFS_DEV_ALLOCATION_HINT_DATA_PREFERRED] = 0, > + [BTRFS_DEV_ALLOCATION_HINT_METADATA_PREFERRED] = 1, > + [BTRFS_DEV_ALLOCATION_HINT_METADATA_ONLY] = 2, > +}; This should be const int, not const char. Also the formatting for the comment is awkward, it's 1 space between the * and the word, so /* * The order of .... * * These values give METADATA_ONLY the highest priority... */ Also the -1 thing is weird and unclear. In fact I think it's problematic, I'll explain below. > + > const char *btrfs_bg_type_to_raid_name(u64 flags) > { > const int index = btrfs_bg_flags_to_raid_index(flags); > @@ -5030,13 +5051,18 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, > } > > /* > - * sort the devices in descending order by max_avail, total_avail > + * sort the devices in descending order by alloc_hint, > + * max_avail, total_avail > */ > static int btrfs_cmp_device_info(const void *a, const void *b) > { > const struct btrfs_device_info *di_a = a; > const struct btrfs_device_info *di_b = b; > > + if (di_a->alloc_hint > di_b->alloc_hint) > + return -1; > + if (di_a->alloc_hint < di_b->alloc_hint) > + return 1; This is making things awkward, instead I think we change this to sort_r, which uses cmp(a, b, priv). You pass in priv which is the type we want, DATA or METADATA or whatever. Then you can do if (priv == data) { /* do the sorting so DATA_ONLY is on top, then DATA_PREFERRED, etc. */ } else { /* do the METADATA thing instead. */ } > if (di_a->max_avail > di_b->max_avail) > return -1; > if (di_a->max_avail < di_b->max_avail) > @@ -5199,6 +5225,7 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, > int ndevs = 0; > u64 max_avail; > u64 dev_offset; > + int hint; > > /* > * in the first pass through the devices list, we gather information > @@ -5251,17 +5278,95 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, > devices_info[ndevs].max_avail = max_avail; > devices_info[ndevs].total_avail = total_avail; > devices_info[ndevs].dev = device; > + > + if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) && > + (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) { > + /* > + * if mixed bg set all the alloc_hint > + * fields to the same value, so the sorting > + * is not affected > + */ > + devices_info[ndevs].alloc_hint = 0; > + } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) { > + hint = device->type & BTRFS_DEV_ALLOCATION_HINT_MASK; > + > + /* > + * skip BTRFS_DEV_METADATA_ONLY disks > + */ > + if (hint == BTRFS_DEV_ALLOCATION_HINT_METADATA_ONLY) > + continue; > + /* > + * if a data chunk must be allocated, > + * sort also by hint (data disk > + * higher priority) > + */ > + devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; > + } else { /* BTRFS_BLOCK_GROUP_METADATA */ > + hint = device->type & BTRFS_DEV_ALLOCATION_HINT_MASK; > + > + /* > + * skip BTRFS_DEV_DATA_ONLY disks > + */ > + if (hint == BTRFS_DEV_ALLOCATION_HINT_DATA_ONLY) > + continue; > + /* > + * if a metadata chunk must be allocated, > + * sort also by hint (metadata hint > + * higher priority) > + */ > + devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; > + } > + Shouldn't we be doing this before adding the device to devices_info? That way for _ONLY we just don't even add the disk to the devices_info. > ++ndevs; > } > ctl->ndevs = ndevs; > > + return 0; > +} > + > +static void sort_and_reduce_device_info(struct alloc_chunk_ctl *ctl, > + struct btrfs_device_info *devices_info) > +{ > + int ndevs, hint, i; > + > + ndevs = ctl->ndevs; > /* > - * now sort the devices by hole size / available space > + * now sort the devices by hint / hole size / available space > */ > sort(devices_info, ndevs, sizeof(struct btrfs_device_info), > btrfs_cmp_device_info, NULL); > > - return 0; > + /* > + * select the minimum set of disks grouped by hint that > + * can host the chunk > + */ > + ndevs = 0; > + while (ndevs < ctl->ndevs) { > + hint = devices_info[ndevs++].alloc_hint; > + while (ndevs < ctl->ndevs) { > + if (devices_info[ndevs].alloc_hint != hint) > + break; > + ndevs++; > + } > + if (ndevs >= ctl->devs_min) > + break; > + } > + > + ctl->ndevs = ndevs; > + > + /* > + * the next layers require the devices_info ordered by > + * max_avail. If we are returning two (or more) different > + * group of alloc_hint, this is not always true. So sort > + * these again. > + */ > + > + for (i = 0 ; i < ndevs ; i++) > + devices_info[i].alloc_hint = 0; > + > + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), > + btrfs_cmp_device_info, NULL); With my sort_r suggestion I think we no longer need the second sort. It'll get you the devices you want in order of most preferred alloc hint, and with the max_avail. I'd double check with printk's, but you should be able to drop all this. Thanks, Josef
On 22/03/2022 20.47, Josef Bacik wrote: > On Sun, Mar 06, 2022 at 07:14:42PM +0100, Goffredo Baroncelli wrote: >> From: Goffredo Baroncelli <kreijack@inwind.it> >> >> The chunk allocation policy is modified as follow. >> >> Each disk may have one of the following tags: >> - BTRFS_DEV_ALLOCATION_METADATA_PREFERRED >> - BTRFS_DEV_ALLOCATION_METADATA_ONLY >> - BTRFS_DEV_ALLOCATION_DATA_ONLY >> - BTRFS_DEV_ALLOCATION_DATA_PREFERRED (default) >> >> During a *mixed data/metadata* chunk allocation, BTRFS works as >> usual. >> >> During a *data* chunk allocation, the space are searched first in >> BTRFS_DEV_ALLOCATION_DATA_ONLY. If the space found is not enough (eg. >> in raid5, only two disks are available), then the disks tagged >> BTRFS_DEV_ALLOCATION_DATA_PREFERRED are considered. If the space is not >> enough again, the disks tagged BTRFS_DEV_ALLOCATION_METADATA_PREFERRED >> are also considered. If even in this case the space is not >> sufficient, -ENOSPC is raised. >> A disk tagged with BTRFS_DEV_ALLOCATION_METADATA_ONLY is never considered >> for a data BG allocation. >> >> During a *metadata* chunk allocation, the same algorithm applies swapping >> _DATA_ and _METADATA_. >> >> By default the disks are tagged as BTRFS_DEV_ALLOCATION_DATA_PREFERRED, >> so BTRFS behaves as usual. >> >> If the user prefers to store the metadata in the faster disks (e.g. SSD), >> he can tag these with BTRFS_DEV_ALLOCATION_METADATA_PREFERRED: in this >> case the metadata BG go in the BTRFS_DEV_ALLOCATION_METADATA_PREFERRED >> disks and the data BG in the others ones. When a disks set is filled, the >> other is considered. >> >> Signed-off-by: Goffredo Baroncelli <kreijack@inwind.it> >> --- >> fs/btrfs/volumes.c | 113 +++++++++++++++++++++++++++++++++++++++++++-- >> fs/btrfs/volumes.h | 1 + >> 2 files changed, 111 insertions(+), 3 deletions(-) >> >> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c >> index d4ac90f5c949..7b37db9bb887 100644 >> --- a/fs/btrfs/volumes.c >> +++ b/fs/btrfs/volumes.c >> @@ -184,6 +184,27 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags >> return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ >> } >> >> +#define BTRFS_DEV_ALLOCATION_HINT_COUNT (1ULL << \ >> + BTRFS_DEV_ALLOCATION_HINT_BIT_COUNT) >> + >> +/* >> + * The order of BTRFS_DEV_ALLOCATION_HINT_* values are not >> + * good, because BTRFS_DEV_ALLOCATION_HINT_DATA_PREFERRED is 0 >> + * (for backward compatibility reason), and the other >> + * values are greater (because the field is unsigned). So we >> + * need a map that rearranges the order giving to _DATA_PREFERRED >> + * an intermediate priority. >> + * These values give to METADATA_ONLY the highest priority, and are >> + * valid for metadata BG allocation. When a data >> + * BG is allocated we negate these values to reverse the priority. >> + */ >> +static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_HINT_COUNT] = { >> + [BTRFS_DEV_ALLOCATION_HINT_DATA_ONLY] = -1, >> + [BTRFS_DEV_ALLOCATION_HINT_DATA_PREFERRED] = 0, >> + [BTRFS_DEV_ALLOCATION_HINT_METADATA_PREFERRED] = 1, >> + [BTRFS_DEV_ALLOCATION_HINT_METADATA_ONLY] = 2, >> +}; > > This should be const int, not const char. Also the formatting for the comment > is awkward, it's 1 space between the * and the word, so > Ok > /* > * The order of .... > * > * These values give METADATA_ONLY the highest priority... > */ > > Also the -1 thing is weird and unclear. In fact I think it's problematic, I'll > explain below. To me it was more natural to have BTRFS_DEV_ALLOCATION_HINT_DATA_PREFERRED -> 0. However we can rearrange this as we want, like below. +static const int alloc_hint_map[BTRFS_DEV_ALLOCATION_HINT_COUNT] = { + [BTRFS_DEV_ALLOCATION_HINT_DATA_ONLY] = 0, + [BTRFS_DEV_ALLOCATION_HINT_DATA_PREFERRED] = 1, + [BTRFS_DEV_ALLOCATION_HINT_METADATA_PREFERRED] = 2, + [BTRFS_DEV_ALLOCATION_HINT_METADATA_ONLY] = 3, +}; I don't see any problem > >> + >> const char *btrfs_bg_type_to_raid_name(u64 flags) >> { >> const int index = btrfs_bg_flags_to_raid_index(flags); >> @@ -5030,13 +5051,18 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, >> } >> >> /* >> - * sort the devices in descending order by max_avail, total_avail >> + * sort the devices in descending order by alloc_hint, >> + * max_avail, total_avail >> */ >> static int btrfs_cmp_device_info(const void *a, const void *b) >> { >> const struct btrfs_device_info *di_a = a; >> const struct btrfs_device_info *di_b = b; >> >> + if (di_a->alloc_hint > di_b->alloc_hint) >> + return -1; >> + if (di_a->alloc_hint < di_b->alloc_hint) >> + return 1; > > This is making things awkward, instead I think we change this to sort_r, which > uses cmp(a, b, priv). You pass in priv which is the type we want, DATA or > METADATA or whatever. Then you can do To me it seemed a natural extension of the previous code. What I dislike is the use (see below) of alloc_hint_map[]: hint = device->type & BTRFS_DEV_ALLOCATION_HINT_MASK; devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; /* for data */ and hint = device->type & BTRFS_DEV_ALLOCATION_HINT_MASK; devices_info[ndevs].alloc_hint = +alloc_hint_map[hint]; /* for metadata */ However this is necessary because the values of device->type & BTRFS_DEV_ALLOCATION_HINT_MASK are not in the same order of priority allocation. And this is due to the fact that device->type & BTRFS_DEV_ALLOCATION_HINT_MASK == 0 means DATA_PREFERRED Let me to summarize: device->type & alloc_hint_map[hint] BTRFS_DEV_ALLOCATION_HINT_MASK meaning (aka priority) ------------------------------- ------------------ -------------------- 0 DATA_PREFERRED 0 1 METADATA_PREFERRED 1 2 METADATA_ONLY 2 3 DATA_ONLY -1 If you sort the table above by priority, you have the hints in the correct order device->type & alloc_hint_map[hint] BTRFS_DEV_ALLOCATION_HINT_MASK meaning (aka priority) ------------------------------- ------------------ -------------------- 3 DATA_ONLY -1 0 DATA_PREFERRED 0 1 METADATA_PREFERRED 1 2 METADATA_ONLY 2 To understand the tables above, I have to point out: - device->type & BTRFS_DEV_ALLOCATION_HINT_MASK is zero by default - when all the disks are in a default state, we don't want that the allocation_hint policy creates mess - this means that the default state has to be DATA_PREFERRED (or METADATA_PREFERRED), otherwise we cannot allocate METADATA (or DATA) - for now we have only four values. But I am expect to have more in the future (e.g. different priorities because different disks speeds: hdd vs sdd vs nvme). So I preferred to avoid more complex scheme like: first bit=data|medata, second bit=only|preferred. Instead having an array that maps the values stored to disk to a priority give us the maximum of flexibility. > > if (priv == data) { > /* do the sorting so DATA_ONLY is on top, then DATA_PREFERRED, etc. */ > } else { > /* do the METADATA thing instead. */ > } > Yes, in the past I thought about something similar. However, pay attention that you need to ignore the DATA_ONLY when you need to allocate METADATA. I don't see a simple way to do that in btrfs_cmp_device_info(). So in any case you need some "if(s)" in gather_device_info() to skip the unwanted devices. And because of that, it is simpler to adjust the field "devices_info[ndevs].alloc_hint" to the sorting that we need (data, metadata o nothing) in these "if(s)" Moreover I liked the simplicity of btrfs_cmp_device_info(), so I preferred to don't complicate it. >> if (di_a->max_avail > di_b->max_avail) >> return -1; >> if (di_a->max_avail < di_b->max_avail) >> @@ -5199,6 +5225,7 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, >> int ndevs = 0; >> u64 max_avail; >> u64 dev_offset; >> + int hint; >> >> /* >> * in the first pass through the devices list, we gather information >> @@ -5251,17 +5278,95 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, >> devices_info[ndevs].max_avail = max_avail; >> devices_info[ndevs].total_avail = total_avail; >> devices_info[ndevs].dev = device; >> + >> + if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) && >> + (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) { >> + /* >> + * if mixed bg set all the alloc_hint >> + * fields to the same value, so the sorting >> + * is not affected >> + */ >> + devices_info[ndevs].alloc_hint = 0; >> + } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) { >> + hint = device->type & BTRFS_DEV_ALLOCATION_HINT_MASK; >> + >> + /* >> + * skip BTRFS_DEV_METADATA_ONLY disks >> + */ >> + if (hint == BTRFS_DEV_ALLOCATION_HINT_METADATA_ONLY) >> + continue; >> + /* >> + * if a data chunk must be allocated, >> + * sort also by hint (data disk >> + * higher priority) >> + */ >> + devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; >> + } else { /* BTRFS_BLOCK_GROUP_METADATA */ >> + hint = device->type & BTRFS_DEV_ALLOCATION_HINT_MASK; >> + >> + /* >> + * skip BTRFS_DEV_DATA_ONLY disks >> + */ >> + if (hint == BTRFS_DEV_ALLOCATION_HINT_DATA_ONLY) >> + continue; >> + /* >> + * if a metadata chunk must be allocated, >> + * sort also by hint (metadata hint >> + * higher priority) >> + */ >> + devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; >> + } >> + > > Shouldn't we be doing this before adding the device to devices_info? That way > for _ONLY we just don't even add the disk to the devices_info. Yes, it can be safely moved before > >> ++ndevs; >> } >> ctl->ndevs = ndevs; >> >> + return 0; >> +} >> + >> +static void sort_and_reduce_device_info(struct alloc_chunk_ctl *ctl, >> + struct btrfs_device_info *devices_info) >> +{ >> + int ndevs, hint, i; >> + >> + ndevs = ctl->ndevs; >> /* >> - * now sort the devices by hole size / available space >> + * now sort the devices by hint / hole size / available space >> */ >> sort(devices_info, ndevs, sizeof(struct btrfs_device_info), >> btrfs_cmp_device_info, NULL); >> >> - return 0; >> + /* >> + * select the minimum set of disks grouped by hint that >> + * can host the chunk >> + */ >> + ndevs = 0; >> + while (ndevs < ctl->ndevs) { >> + hint = devices_info[ndevs++].alloc_hint; >> + while (ndevs < ctl->ndevs) { >> + if (devices_info[ndevs].alloc_hint != hint) >> + break; >> + ndevs++; >> + } >> + if (ndevs >= ctl->devs_min) >> + break; >> + } >> + >> + ctl->ndevs = ndevs; >> + >> + /* >> + * the next layers require the devices_info ordered by >> + * max_avail. If we are returning two (or more) different >> + * group of alloc_hint, this is not always true. So sort >> + * these again. >> + */ >> + >> + for (i = 0 ; i < ndevs ; i++) >> + devices_info[i].alloc_hint = 0; >> + >> + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), >> + btrfs_cmp_device_info, NULL); > > With my sort_r suggestion I think we no longer need the second sort. It'll get > you the devices you want in order of most preferred alloc hint, and with the > max_avail. I'd double check with printk's, but you should be able to drop all > this. Thanks, It is not so simple. Assuming that you have the following situation (after the first sort): sda: DATA_PREFERRED free=1GB sdb: DATA_PREFERRED free=0GB sdc: METADATA_PREFERRED free=2GB sdd: METADATA_PREFERRED free=1GB And you need to allocate 1 DATA chunk in RAID1, you need to use the following combination: sda: DATA_PREFERRED free=1GB sdc: METADATA_PREFERRED free=2GB However, as the comment report, you need to pass the array sorted by free avail in the next layers; this is the reason why you need a 2nd sort. > > Josef
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d4ac90f5c949..7b37db9bb887 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -184,6 +184,27 @@ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } +#define BTRFS_DEV_ALLOCATION_HINT_COUNT (1ULL << \ + BTRFS_DEV_ALLOCATION_HINT_BIT_COUNT) + +/* + * The order of BTRFS_DEV_ALLOCATION_HINT_* values are not + * good, because BTRFS_DEV_ALLOCATION_HINT_DATA_PREFERRED is 0 + * (for backward compatibility reason), and the other + * values are greater (because the field is unsigned). So we + * need a map that rearranges the order giving to _DATA_PREFERRED + * an intermediate priority. + * These values give to METADATA_ONLY the highest priority, and are + * valid for metadata BG allocation. When a data + * BG is allocated we negate these values to reverse the priority. + */ +static const char alloc_hint_map[BTRFS_DEV_ALLOCATION_HINT_COUNT] = { + [BTRFS_DEV_ALLOCATION_HINT_DATA_ONLY] = -1, + [BTRFS_DEV_ALLOCATION_HINT_DATA_PREFERRED] = 0, + [BTRFS_DEV_ALLOCATION_HINT_METADATA_PREFERRED] = 1, + [BTRFS_DEV_ALLOCATION_HINT_METADATA_ONLY] = 2, +}; + const char *btrfs_bg_type_to_raid_name(u64 flags) { const int index = btrfs_bg_flags_to_raid_index(flags); @@ -5030,13 +5051,18 @@ static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info, } /* - * sort the devices in descending order by max_avail, total_avail + * sort the devices in descending order by alloc_hint, + * max_avail, total_avail */ static int btrfs_cmp_device_info(const void *a, const void *b) { const struct btrfs_device_info *di_a = a; const struct btrfs_device_info *di_b = b; + if (di_a->alloc_hint > di_b->alloc_hint) + return -1; + if (di_a->alloc_hint < di_b->alloc_hint) + return 1; if (di_a->max_avail > di_b->max_avail) return -1; if (di_a->max_avail < di_b->max_avail) @@ -5199,6 +5225,7 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, int ndevs = 0; u64 max_avail; u64 dev_offset; + int hint; /* * in the first pass through the devices list, we gather information @@ -5251,17 +5278,95 @@ static int gather_device_info(struct btrfs_fs_devices *fs_devices, devices_info[ndevs].max_avail = max_avail; devices_info[ndevs].total_avail = total_avail; devices_info[ndevs].dev = device; + + if ((ctl->type & BTRFS_BLOCK_GROUP_DATA) && + (ctl->type & BTRFS_BLOCK_GROUP_METADATA)) { + /* + * if mixed bg set all the alloc_hint + * fields to the same value, so the sorting + * is not affected + */ + devices_info[ndevs].alloc_hint = 0; + } else if (ctl->type & BTRFS_BLOCK_GROUP_DATA) { + hint = device->type & BTRFS_DEV_ALLOCATION_HINT_MASK; + + /* + * skip BTRFS_DEV_METADATA_ONLY disks + */ + if (hint == BTRFS_DEV_ALLOCATION_HINT_METADATA_ONLY) + continue; + /* + * if a data chunk must be allocated, + * sort also by hint (data disk + * higher priority) + */ + devices_info[ndevs].alloc_hint = -alloc_hint_map[hint]; + } else { /* BTRFS_BLOCK_GROUP_METADATA */ + hint = device->type & BTRFS_DEV_ALLOCATION_HINT_MASK; + + /* + * skip BTRFS_DEV_DATA_ONLY disks + */ + if (hint == BTRFS_DEV_ALLOCATION_HINT_DATA_ONLY) + continue; + /* + * if a metadata chunk must be allocated, + * sort also by hint (metadata hint + * higher priority) + */ + devices_info[ndevs].alloc_hint = alloc_hint_map[hint]; + } + ++ndevs; } ctl->ndevs = ndevs; + return 0; +} + +static void sort_and_reduce_device_info(struct alloc_chunk_ctl *ctl, + struct btrfs_device_info *devices_info) +{ + int ndevs, hint, i; + + ndevs = ctl->ndevs; /* - * now sort the devices by hole size / available space + * now sort the devices by hint / hole size / available space */ sort(devices_info, ndevs, sizeof(struct btrfs_device_info), btrfs_cmp_device_info, NULL); - return 0; + /* + * select the minimum set of disks grouped by hint that + * can host the chunk + */ + ndevs = 0; + while (ndevs < ctl->ndevs) { + hint = devices_info[ndevs++].alloc_hint; + while (ndevs < ctl->ndevs) { + if (devices_info[ndevs].alloc_hint != hint) + break; + ndevs++; + } + if (ndevs >= ctl->devs_min) + break; + } + + ctl->ndevs = ndevs; + + /* + * the next layers require the devices_info ordered by + * max_avail. If we are returning two (or more) different + * group of alloc_hint, this is not always true. So sort + * these again. + */ + + for (i = 0 ; i < ndevs ; i++) + devices_info[i].alloc_hint = 0; + + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + } static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl, @@ -5513,6 +5618,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, goto out; } + sort_and_reduce_device_info(&ctl, devices_info); + ret = decide_stripe_size(fs_devices, &ctl, devices_info); if (ret < 0) { block_group = ERR_PTR(ret); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 93ac27d8097c..b066f9af216a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -404,6 +404,7 @@ struct btrfs_device_info { u64 dev_offset; u64 max_avail; u64 total_avail; + int alloc_hint; }; struct btrfs_raid_attr {