Skip to content

Commit 246e588

Browse files
pcd1193182 (Paul Dagnelie)
and
Paul Dagnelie
authored
Implement allocation size ranges and use for gang leaves (#17111)
When forced to resort to ganging, ZFS currently allocates three child blocks, each one third of the size of the original. This is true regardless of whether larger allocations could be made, which would allow us to have fewer gang leaves. Allowing larger allocations improves performance when fragmentation is high enough to require ganging, but not so high that all the free ranges are only just big enough to hold a third of the recordsize. It is also useful for improving the behavior of a future change to allow larger gang headers.

We add the ability for the allocation codepath to allocate a range of sizes instead of a single fixed size. We then use this to pre-allocate the DVAs for the gang children. If those allocations fail, we fall back to the normal write path, which will likely re-gang.

Signed-off-by: Paul Dagnelie <[email protected]>
Co-authored-by: Paul Dagnelie <[email protected]>
Reviewed-by: Alexander Motin <[email protected]>
Reviewed-by: Tony Hutter <[email protected]>
1 parent a7de203 commit 246e588

24 files changed

+391
-106
lines changed

include/sys/metaslab.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ extern "C" {
4141

4242
typedef struct metaslab_ops {
4343
const char *msop_name;
44-
uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
44+
uint64_t (*msop_alloc)(metaslab_t *, uint64_t, uint64_t, uint64_t *);
4545
} metaslab_ops_t;
4646

4747

@@ -82,6 +82,9 @@ uint64_t metaslab_largest_allocatable(metaslab_t *);
8282

8383
int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, blkptr_t *, int,
8484
uint64_t, blkptr_t *, int, zio_alloc_list_t *, int, const void *);
85+
int metaslab_alloc_range(spa_t *, metaslab_class_t *, uint64_t, uint64_t,
86+
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *,
87+
int, const void *, uint64_t *);
8588
int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
8689
dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
8790
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
@@ -95,6 +98,7 @@ void metaslab_check_free(spa_t *, const blkptr_t *);
9598

9699
void metaslab_stat_init(void);
97100
void metaslab_stat_fini(void);
101+
void metaslab_trace_move(zio_alloc_list_t *, zio_alloc_list_t *);
98102
void metaslab_trace_init(zio_alloc_list_t *);
99103
void metaslab_trace_fini(zio_alloc_list_t *);
100104

@@ -127,6 +131,8 @@ uint64_t metaslab_group_get_space(metaslab_group_t *);
127131
void metaslab_group_histogram_verify(metaslab_group_t *);
128132
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
129133
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
134+
void metaslab_group_alloc_increment_all(spa_t *, blkptr_t *, int, int,
135+
uint64_t, const void *);
130136
void metaslab_group_alloc_decrement(spa_t *, uint64_t, int, int, uint64_t,
131137
const void *);
132138
void metaslab_recalculate_weight_and_sort(metaslab_t *);

include/sys/vdev.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,8 @@ extern void vdev_space_update(vdev_t *vd,
134134

135135
extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
136136

137+
extern uint64_t vdev_asize_to_psize_txg(vdev_t *vd, uint64_t asize,
138+
uint64_t txg);
137139
extern uint64_t vdev_psize_to_asize_txg(vdev_t *vd, uint64_t psize,
138140
uint64_t txg);
139141
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);

include/sys/vdev_draid.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ extern int vdev_draid_generate_perms(const draid_map_t *, uint8_t **);
9595
*/
9696
extern boolean_t vdev_draid_readable(vdev_t *, uint64_t);
9797
extern boolean_t vdev_draid_missing(vdev_t *, uint64_t, uint64_t, uint64_t);
98-
extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t);
98+
extern uint64_t vdev_draid_asize_to_psize(vdev_t *, uint64_t, uint64_t);
9999
extern void vdev_draid_map_alloc_empty(zio_t *, struct raidz_row *);
100100
extern int vdev_draid_map_verify_empty(zio_t *, struct raidz_row *);
101101
extern nvlist_t *vdev_draid_read_config_spare(vdev_t *);

include/sys/vdev_impl.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ typedef const struct vdev_ops {
103103
vdev_fini_func_t *vdev_op_fini;
104104
vdev_open_func_t *vdev_op_open;
105105
vdev_close_func_t *vdev_op_close;
106-
vdev_asize_func_t *vdev_op_asize;
106+
vdev_asize_func_t *vdev_op_psize_to_asize;
107+
vdev_asize_func_t *vdev_op_asize_to_psize;
107108
vdev_min_asize_func_t *vdev_op_min_asize;
108109
vdev_min_alloc_func_t *vdev_op_min_alloc;
109110
vdev_io_start_func_t *vdev_op_io_start;
@@ -615,6 +616,7 @@ extern vdev_ops_t vdev_indirect_ops;
615616
*/
616617
extern void vdev_default_xlate(vdev_t *vd, const zfs_range_seg64_t *logical_rs,
617618
zfs_range_seg64_t *physical_rs, zfs_range_seg64_t *remain_rs);
619+
extern uint64_t vdev_default_psize(vdev_t *vd, uint64_t asize, uint64_t txg);
618620
extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize, uint64_t txg);
619621
extern uint64_t vdev_default_min_asize(vdev_t *vd);
620622
extern uint64_t vdev_get_min_asize(vdev_t *vd);

include/sys/zio.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,7 @@ typedef uint64_t zio_flag_t;
227227
#define ZIO_FLAG_REEXECUTED (1ULL << 30)
228228
#define ZIO_FLAG_DELEGATED (1ULL << 31)
229229
#define ZIO_FLAG_DIO_CHKSUM_ERR (1ULL << 32)
230+
#define ZIO_FLAG_PREALLOCATED (1ULL << 33)
230231

231232
#define ZIO_ALLOCATOR_NONE (-1)
232233
#define ZIO_HAS_ALLOCATOR(zio) ((zio)->io_allocator != ZIO_ALLOCATOR_NONE)

lib/libzfs/libzfs_dataset.c

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5436,12 +5436,12 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
54365436
* +-------+-------+-------+-------+-------+
54375437
*
54385438
* Above, notice that the 4k block required one sector for parity and another
5439-
* for data. vdev_raidz_asize() will return 8k and as such the pool's allocated
5440-
* and free properties will be adjusted by 8k. The dataset will not be charged
5441-
* 8k. Rather, it will be charged a value that is scaled according to the
5442-
* overhead of the 128k block on the same vdev. This 8k allocation will be
5443-
* charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as
5444-
* calculated in the 128k block example above.
5439+
* for data. vdev_raidz_psize_to_asize() will return 8k and as such the pool's
5440+
* allocated and free properties will be adjusted by 8k. The dataset will not
5441+
* be charged 8k. Rather, it will be charged a value that is scaled according
5442+
* to the overhead of the 128k block on the same vdev. This 8k allocation will
5443+
* be charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is
5444+
* as calculated in the 128k block example above.
54455445
*
54465446
* Every raidz allocation is sized to be a multiple of nparity+1 sectors. That
54475447
* is, every raidz1 allocation will be a multiple of 2 sectors, raidz2
@@ -5488,7 +5488,7 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
54885488
* not necessarily equal to "blksize", due to RAIDZ deflation.
54895489
*/
54905490
static uint64_t
5491-
vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
5491+
vdev_raidz_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
54925492
uint64_t blksize)
54935493
{
54945494
uint64_t asize, ndata;
@@ -5508,7 +5508,7 @@ vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
55085508
* size.
55095509
*/
55105510
static uint64_t
5511-
vdev_draid_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
5511+
vdev_draid_psize_to_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift,
55125512
uint64_t blksize)
55135513
{
55145514
ASSERT3U(ndisks, >, nparity);
@@ -5568,12 +5568,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
55685568
continue;
55695569

55705570
/* allocation size for the "typical" 128k block */
5571-
tsize = vdev_raidz_asize(ndisks, nparity, ashift,
5572-
SPA_OLD_MAXBLOCKSIZE);
5571+
tsize = vdev_raidz_psize_to_asize(ndisks, nparity,
5572+
ashift, SPA_OLD_MAXBLOCKSIZE);
55735573

55745574
/* allocation size for the blksize block */
5575-
asize = vdev_raidz_asize(ndisks, nparity, ashift,
5576-
blksize);
5575+
asize = vdev_raidz_psize_to_asize(ndisks, nparity,
5576+
ashift, blksize);
55775577
} else {
55785578
uint64_t ndata;
55795579

@@ -5582,12 +5582,12 @@ volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize)
55825582
continue;
55835583

55845584
/* allocation size for the "typical" 128k block */
5585-
tsize = vdev_draid_asize(ndata + nparity, nparity,
5586-
ashift, SPA_OLD_MAXBLOCKSIZE);
5585+
tsize = vdev_draid_psize_to_asize(ndata + nparity,
5586+
nparity, ashift, SPA_OLD_MAXBLOCKSIZE);
55875587

55885588
/* allocation size for the blksize block */
5589-
asize = vdev_draid_asize(ndata + nparity, nparity,
5590-
ashift, blksize);
5589+
asize = vdev_draid_psize_to_asize(ndata + nparity,
5590+
nparity, ashift, blksize);
55915591
}
55925592

55935593
/*

module/os/freebsd/zfs/vdev_geom.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1276,7 +1276,8 @@ vdev_ops_t vdev_disk_ops = {
12761276
.vdev_op_fini = NULL,
12771277
.vdev_op_open = vdev_geom_open,
12781278
.vdev_op_close = vdev_geom_close,
1279-
.vdev_op_asize = vdev_default_asize,
1279+
.vdev_op_psize_to_asize = vdev_default_asize,
1280+
.vdev_op_asize_to_psize = vdev_default_psize,
12801281
.vdev_op_min_asize = vdev_default_min_asize,
12811282
.vdev_op_min_alloc = NULL,
12821283
.vdev_op_io_start = vdev_geom_io_start,

module/os/linux/zfs/vdev_disk.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1554,7 +1554,8 @@ vdev_ops_t vdev_disk_ops = {
15541554
.vdev_op_fini = NULL,
15551555
.vdev_op_open = vdev_disk_open,
15561556
.vdev_op_close = vdev_disk_close,
1557-
.vdev_op_asize = vdev_default_asize,
1557+
.vdev_op_asize_to_psize = vdev_default_psize,
1558+
.vdev_op_psize_to_asize = vdev_default_asize,
15581559
.vdev_op_min_asize = vdev_default_min_asize,
15591560
.vdev_op_min_alloc = NULL,
15601561
.vdev_op_io_start = vdev_disk_io_start,

0 commit comments

Comments
 (0)