Skip to content

Commit 1f058cf

Browse files
committed
Add vdev property to toggle adding io to vdev queue
Added vdev property to toggle queueing io to vdev queue when reading / writing from / to vdev. The intention behind this property is to improve IOPS performance when using o_direct. Signed-off-by: MigeljanImeri <[email protected]>
1 parent 5c30b24 commit 1f058cf

File tree

14 files changed

+211
-2
lines changed

14 files changed

+211
-2
lines changed

include/sys/fs/zfs.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,9 +385,23 @@ typedef enum {
385385
VDEV_PROP_TRIM_SUPPORT,
386386
VDEV_PROP_TRIM_ERRORS,
387387
VDEV_PROP_SLOW_IOS,
388+
VDEV_PROP_QUEUE_IO,
388389
VDEV_NUM_PROPS
389390
} vdev_prop_t;
390391

392+
/*
393+
* Different queuing behaviors for vdev prop queue_io.
394+
* VDEV_QUEUE_AUTO = Don't queue if vdev is nonrot and backed by blkdev,
395+
* queue otherwise.
396+
* VDEV_QUEUE_CLASSIC = Always queue.
397+
* VDEV_QUEUE_NONE = Never Queue.
398+
*/
399+
typedef enum {
400+
VDEV_QUEUE_AUTO,
401+
VDEV_QUEUE_CLASSIC,
402+
VDEV_QUEUE_NONE
403+
} vdev_queue_type_t;
404+
391405
/*
392406
* Dataset property functions shared between libzfs and kernel.
393407
*/

include/sys/vdev_impl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -423,6 +423,7 @@ struct vdev {
423423
boolean_t vdev_resilver_deferred; /* resilver deferred */
424424
boolean_t vdev_kobj_flag; /* kobj event record */
425425
boolean_t vdev_attaching; /* vdev attach ashift handling */
426+
boolean_t vdev_is_blkdev; /* vdev is backed by block device */
426427
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
427428
spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
428429
zio_t *vdev_probe_zio; /* root of current probe */
@@ -466,6 +467,7 @@ struct vdev {
466467
uint64_t vdev_io_t;
467468
uint64_t vdev_slow_io_n;
468469
uint64_t vdev_slow_io_t;
470+
uint64_t vdev_queue_io;
469471
};
470472

471473
#define VDEV_PAD_SIZE (8 << 10)

lib/libzfs/libzfs.abi

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5899,7 +5899,8 @@
58995899
<enumerator name='VDEV_PROP_TRIM_SUPPORT' value='49'/>
59005900
<enumerator name='VDEV_PROP_TRIM_ERRORS' value='50'/>
59015901
<enumerator name='VDEV_PROP_SLOW_IOS' value='51'/>
5902-
<enumerator name='VDEV_NUM_PROPS' value='52'/>
5902+
<enumerator name='VDEV_PROP_QUEUE_IO' value='52'/>
5903+
<enumerator name='VDEV_NUM_PROPS' value='53'/>
59035904
</enum-decl>
59045905
<typedef-decl name='vdev_prop_t' type-id='1573bec8' id='5aa5c90c'/>
59055906
<class-decl name='zpool_load_policy' size-in-bits='256' is-struct='yes' visibility='default' id='2f65b36f'>

man/man7/vdevprops.7

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,22 @@ If this device should perform new allocations, used to disable a device
157157
when it is scheduled for later removal.
158158
See
159159
.Xr zpool-remove 8 .
160+
.It Sy queue_io Ns = Ns Sy auto Ns | Ns Sy classic Ns | Ns Sy none
161+
Controls how I/O requests are added to the vdev queue when reading or
162+
writing to this vdev.
163+
.It Sy auto
164+
Does not add I/O requests to the vdev queue if the vdev is backed by a
165+
non-rotational block device.
166+
This can sometimes improve performance for direct IOs.
167+
I/O requests will be added to the vdev queue if the vdev is backed by a
168+
rotational block device or file.
169+
This is the default behavior.
170+
.It Sy classic
171+
Always adds I/O requests to the vdev queue.
172+
.It Sy none
173+
Never adds I/O requests to the vdev queue.
174+
This is not recommended for vdevs backed by spinning disks as it could
175+
result in starvation.
160176
.El
161177
.Ss User Properties
162178
In addition to the standard native properties, ZFS supports arbitrary user

module/os/freebsd/zfs/vdev_geom.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -968,6 +968,9 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
968968
else
969969
vd->vdev_nonrot = B_FALSE;
970970

971+
/* Is backed by a block device. */
972+
vd->vdev_is_blkdev = B_TRUE;
973+
971974
/* Set when device reports it supports TRIM. */
972975
error = g_getattr("GEOM::candelete", cp, &has_trim);
973976
vd->vdev_has_trim = (error == 0 && has_trim);

module/os/linux/zfs/vdev_disk.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize,
447447
/* Inform the ZIO pipeline that we are non-rotational */
448448
v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev));
449449

450+
/* Is backed by a block device. */
451+
v->vdev_is_blkdev = B_TRUE;
452+
450453
/* Physical volume size in bytes for the partition */
451454
*psize = bdev_capacity(bdev);
452455

module/zcommon/zpool_prop.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,13 @@ vdev_prop_init(void)
326326
{ NULL }
327327
};
328328

329+
static const zprop_index_t vdevqueuetype_table[] = {
330+
{ "auto", VDEV_QUEUE_AUTO },
331+
{ "classic", VDEV_QUEUE_CLASSIC },
332+
{ "none", VDEV_QUEUE_NONE },
333+
{ NULL }
334+
};
335+
329336
struct zfs_mod_supported_features *sfeatures =
330337
zfs_mod_list_supported(ZFS_SYSFS_VDEV_PROPERTIES);
331338

@@ -470,6 +477,9 @@ vdev_prop_init(void)
470477
zprop_register_index(VDEV_PROP_TRIM_SUPPORT, "trim_support", 0,
471478
PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "TRIMSUP",
472479
boolean_table, sfeatures);
480+
zprop_register_index(VDEV_PROP_QUEUE_IO, "queue_io", VDEV_QUEUE_AUTO,
481+
PROP_DEFAULT, ZFS_TYPE_VDEV, "auto | classic | none", "QUEUE_IO",
482+
vdevqueuetype_table, sfeatures);
473483

474484
/* default index properties */
475485
zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE,

module/zfs/vdev.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,8 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
718718
vd->vdev_slow_io_n = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N);
719719
vd->vdev_slow_io_t = vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T);
720720

721+
vd->vdev_queue_io = vdev_prop_default_numeric(VDEV_PROP_QUEUE_IO);
722+
721723
list_link_init(&vd->vdev_config_dirty_node);
722724
list_link_init(&vd->vdev_state_dirty_node);
723725
list_link_init(&vd->vdev_initialize_node);
@@ -3860,6 +3862,12 @@ vdev_load(vdev_t *vd)
38603862
if (error && error != ENOENT)
38613863
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
38623864
"failed [error=%d]", (u_longlong_t)zapobj, error);
3865+
3866+
error = vdev_prop_get_int(vd, VDEV_PROP_QUEUE_IO,
3867+
&vd->vdev_queue_io);
3868+
if (error && error != ENOENT)
3869+
vdev_dbgmsg(vd, "vdev_load: zap_lookup(zap=%llu) "
3870+
"failed [error=%d]", (u_longlong_t)zapobj, error);
38633871
}
38643872

38653873
/*
@@ -6095,6 +6103,15 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
60956103
}
60966104
vd->vdev_slow_io_t = intval;
60976105
break;
6106+
case VDEV_PROP_QUEUE_IO:
6107+
if (nvpair_value_uint64(elem, &intval) != 0) {
6108+
error = EINVAL;
6109+
break;
6110+
}
6111+
if (vd->vdev_ops->vdev_op_leaf) {
6112+
vd->vdev_queue_io = intval;
6113+
}
6114+
break;
60986115
default:
60996116
/* Most processing is done in vdev_props_set_sync */
61006117
break;
@@ -6458,6 +6475,7 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl)
64586475
case VDEV_PROP_IO_T:
64596476
case VDEV_PROP_SLOW_IO_N:
64606477
case VDEV_PROP_SLOW_IO_T:
6478+
case VDEV_PROP_QUEUE_IO:
64616479
err = vdev_prop_get_int(vd, prop, &intval);
64626480
if (err && err != ENOENT)
64636481
break;

module/zfs/vdev_file.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
109109
*/
110110
vd->vdev_nonrot = B_TRUE;
111111

112+
/* Is not backed by a block device. */
113+
vd->vdev_is_blkdev = B_FALSE;
114+
112115
/*
113116
* Allow TRIM on file based vdevs. This may not always be supported,
114117
* since it depends on your kernel version and underlying filesystem

module/zfs/vdev_queue.c

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -879,6 +879,38 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
879879
return (zio);
880880
}
881881

882+
static boolean_t
883+
vdev_should_queue_zio(zio_t *zio)
884+
{
885+
vdev_t *vd = zio->io_vd;
886+
boolean_t should_queue = B_TRUE;
887+
888+
/*
889+
* Add zio with ZIO_FLAG_NODATA to queue as bypass code
890+
* currently does not handle certain cases (gang abd, raidz
891+
* write aggregation).
892+
*/
893+
if (zio->io_flags & ZIO_FLAG_NODATA)
894+
return (B_TRUE);
895+
896+
switch (vd->vdev_queue_io) {
897+
case VDEV_QUEUE_AUTO:
898+
if (vd->vdev_nonrot && vd->vdev_is_blkdev)
899+
should_queue = B_FALSE;
900+
break;
901+
case VDEV_QUEUE_CLASSIC:
902+
should_queue = B_TRUE;
903+
break;
904+
case VDEV_QUEUE_NONE:
905+
should_queue = B_FALSE;
906+
break;
907+
default:
908+
should_queue = B_TRUE;
909+
break;
910+
}
911+
return (should_queue);
912+
}
913+
882914
zio_t *
883915
vdev_queue_io(zio_t *zio)
884916
{
@@ -922,6 +954,11 @@ vdev_queue_io(zio_t *zio)
922954
zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
923955
zio->io_timestamp = gethrtime();
924956

957+
if (!vdev_should_queue_zio(zio)) {
958+
zio->io_queue_state = ZIO_QS_NONE;
959+
return (zio);
960+
}
961+
925962
mutex_enter(&vq->vq_lock);
926963
vdev_queue_io_add(vq, zio);
927964
nio = vdev_queue_io_to_issue(vq);
@@ -954,6 +991,10 @@ vdev_queue_io_done(zio_t *zio)
954991
vq->vq_io_complete_ts = now;
955992
vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
956993

994+
if (zio->io_queue_state == ZIO_QS_NONE) {
995+
return;
996+
}
997+
957998
mutex_enter(&vq->vq_lock);
958999
vdev_queue_pending_remove(vq, zio);
9591000

tests/runfiles/common.run

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -549,7 +549,7 @@ tags = ['functional', 'cli_root', 'zpool_scrub']
549549
tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg',
550550
'zpool_set_ashift', 'zpool_set_features', 'vdev_set_001_pos',
551551
'user_property_001_pos', 'user_property_002_neg',
552-
'zpool_set_clear_userprop']
552+
'zpool_set_clear_userprop','vdev_set_queue_io']
553553
tags = ['functional', 'cli_root', 'zpool_set']
554554

555555
[tests/functional/cli_root/zpool_split]

tests/zfs-tests/tests/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
12451245
functional/cli_root/zpool_set/setup.ksh \
12461246
functional/cli_root/zpool/setup.ksh \
12471247
functional/cli_root/zpool_set/vdev_set_001_pos.ksh \
1248+
functional/cli_root/zpool_set/vdev_set_queue_io.ksh \
12481249
functional/cli_root/zpool_set/zpool_set_common.kshlib \
12491250
functional/cli_root/zpool_set/zpool_set_001_pos.ksh \
12501251
functional/cli_root/zpool_set/zpool_set_002_neg.ksh \

tests/zfs-tests/tests/functional/cli_root/zpool_get/vdev_get.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,4 +76,5 @@ typeset -a properties=(
7676
trim_support
7777
trim_errors
7878
slow_ios
79+
queue_io
7980
)
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#!/bin/ksh -p
2+
# SPDX-License-Identifier: CDDL-1.0
3+
#
4+
# CDDL HEADER START
5+
#
6+
# The contents of this file are subject to the terms of the
7+
# Common Development and Distribution License (the "License").
8+
# You may not use this file except in compliance with the License.
9+
#
10+
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
11+
# or https://opensource.org/licenses/CDDL-1.0.
12+
# See the License for the specific language governing permissions
13+
# and limitations under the License.
14+
#
15+
# When distributing Covered Code, include this CDDL HEADER in each
16+
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
17+
# If applicable, add the following below this CDDL HEADER, with the
18+
# fields enclosed by brackets "[]" replaced with your own identifying
19+
# information: Portions Copyright [yyyy] [name of copyright owner]
20+
#
21+
# CDDL HEADER END
22+
#
23+
24+
#
25+
# Copyright (c) 2024 by Triad National Security, LLC.
26+
#
27+
28+
. $STF_SUITE/include/libtest.shlib
29+
30+
#
31+
# DESCRIPTION:
32+
# Toggling vdev queue_io property while reading from vdev should not cause panic.
33+
#
34+
# STRATEGY:
35+
# 1. Create a zpool
36+
# 2. Write a file to the pool.
37+
# 3. Start reading from file, while also toggling the queue_io property on / off.
38+
#
39+
40+
verify_runnable "global"
41+
42+
command -v fio > /dev/null || log_unsupported "fio missing"
43+
log_must save_tunable DIO_ENABLED
44+
log_must set_tunable32 DIO_ENABLED 1
45+
46+
function toggle_queue_io
47+
{
48+
zpool set queue_io=off $TESTPOOL1 $FILEDEV
49+
sleep 0.1
50+
zpool set queue_io=on $TESTPOOL1 $FILEDEV
51+
sleep 0.1
52+
}
53+
54+
function cleanup
55+
{
56+
destroy_pool $TESTPOOL1
57+
log_must rm -f $FILEDEV
58+
log_must restore_tunable DIO_ENABLED
59+
}
60+
61+
log_assert "Toggling vdev queue_io property while reading from vdev should not cause panic"
62+
log_onexit cleanup
63+
64+
# 1. Create a pool
65+
66+
FILEDEV="$TEST_BASE_DIR/filedev.$$"
67+
log_must truncate -s $(($MINVDEVSIZE * 2)) $FILEDEV
68+
create_pool $TESTPOOL1 $FILEDEV
69+
70+
mntpnt=$(get_prop mountpoint $TESTPOOL1)
71+
72+
# 2. Write a file to the pool, while also toggling the queue_io property on / off.
73+
74+
log_must eval "fio --filename=$mntpnt/foobar --name=write-file \
75+
--rw=write --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
76+
--ioengine=sync --time_based --runtime=10 &"
77+
78+
ITERATIONS=30
79+
80+
for i in $(seq $ITERATIONS); do
81+
log_must toggle_queue_io
82+
done;
83+
wait
84+
85+
# 3. Starting reading from file, while also toggling the queue_io property on / off.
86+
87+
log_must eval "fio --filename=$mntpnt/foobar --name=read-file \
88+
--rw=read --size=$MINVDEVSIZE --bs=128k --numjobs=1 --direct=1 \
89+
--ioengine=sync --time_based --runtime=10 &"
90+
91+
for i in $(seq $ITERATIONS); do
92+
log_must toggle_queue_io
93+
done;
94+
wait
95+
96+
log_pass "Toggling vdev queue_io property while reading from vdev does not cause panic"

0 commit comments

Comments
 (0)