
Commit 7ea400c

Allow bypassing the vdev queue on SSDs
Allow bypassing the vdev queue on SSDs if the vdev queue is less than zfs_vdev_queue_bypass_pct percent full. This can lead to a more than 2x IOPS speed-up on some benchmarks. The intention behind this tunable is to improve performance when using O_DIRECT.

Signed-off-by: MigeljanImeri <[email protected]>
1 parent 3420571 commit 7ea400c
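In outline, the patch lets a "normal" read or write skip the vdev queue entirely when the device is non-rotational and the queue is nearly empty. A minimal sketch of the test, using names from the diff below; vdev_queue_can_bypass() itself is a hypothetical helper, since the real check is inline in vdev_queue_io():

/*
 * Hypothetical helper summarizing the bypass test added in
 * vdev_queue_io() below; not code from this commit.
 */
static boolean_t
vdev_queue_can_bypass(vdev_queue_t *vq, zio_t *zio)
{
	/* Only SSDs, and only "normal" reads/writes, may skip the queue. */
	if (!zio->io_vd->vdev_nonrot || !ZIO_IS_NORMAL(zio))
		return (B_FALSE);

	/* Bypass while the queue is under zfs_vdev_queue_bypass_pct full. */
	return (vq->vq_queued_active <
	    (zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100);
}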

5 files changed: 93 additions, 3 deletions


include/sys/vdev_impl.h

Lines changed: 12 additions & 1 deletion
@@ -147,10 +147,21 @@ struct vdev_queue {
 	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
 	uint32_t	vq_cqueued;	/* Classes with queued I/Os. */
 	uint32_t	vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
-	uint32_t	vq_active;	/* Number of active I/Os. */
+	/*
+	 * Number of active I/Os. This includes I/Os that were previously
+	 * queued and are now active, plus all the 'bypass' I/Os that bypassed
+	 * the queue.
+	 */
+	uint32_t	vq_active;
+	/*
+	 * Number of active I/Os that were previously queued. This is a subset
+	 * of vq_active.
+	 */
+	uint32_t	vq_queued_active;
 	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
 	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
 	list_t		vq_active_list;	/* List of active I/Os. */
+	kmutex_t	vq_active_list_lock;
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search;	/* used as local for stack reduction */
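Because bypass I/Os increment only vq_active while queued I/Os increment both counters, the pair obeys a simple invariant; a hypothetical debug check (not added by this commit) would be:

/* Hypothetical invariant; bypass I/Os account for the difference. */
ASSERT3U(vq->vq_queued_active, <=, vq->vq_active);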

include/sys/zio.h

Lines changed: 19 additions & 0 deletions
@@ -249,6 +249,24 @@ typedef uint64_t zio_flag_t;
 #define	ZIO_CHILD_BIT(x)		(1U << (x))
 #define	ZIO_CHILD_BIT_IS_SET(val, x)	((val) & (1U << (x)))
 
+
+/*
+ * ZIOs that are ZIO_FLAG_IMPORTANT are always queued so that they never get
+ * starved out. This allows us to bypass the queue for "normal" reads and
+ * writes when the queues are low for better IOPS. If the queues get too high
+ * then we go back to queuing the "normal" reads/writes so as not to starve
+ * out more important IOs like scrub/resilver/retry. See
+ * zfs_vdev_queue_bypass_pct for details.
+ */
+
+#define	ZIO_FLAG_IMPORTANT \
+	ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL | \
+	ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB | \
+	ZIO_FLAG_IO_RETRY | ZIO_FLAG_NODATA
+
+#define	ZIO_IS_NORMAL(zio) \
+	!((zio)->io_flags & (ZIO_FLAG_IMPORTANT))
+
 enum zio_child {
 	ZIO_CHILD_VDEV = 0,
 	ZIO_CHILD_GANG,

@@ -449,6 +467,7 @@ enum zio_qstate {
 	ZIO_QS_NONE = 0,
 	ZIO_QS_QUEUED,
 	ZIO_QS_ACTIVE,
+	ZIO_QS_BYPASS,
 };
 
 struct zio {
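To illustrate the classification, a scrub or resilver zio carries one of the ZIO_FLAG_IMPORTANT bits and is always queued, while a plain application read or write is "normal" and eligible for bypass. A hypothetical example (not part of the commit):

/* Hypothetical illustration of ZIO_IS_NORMAL(); not in this commit. */
static void
zio_classify_example(zio_t *zio)
{
	zio->io_flags = ZIO_FLAG_SCRUB;
	ASSERT(!ZIO_IS_NORMAL(zio));	/* important: always queued */

	zio->io_flags = 0;
	ASSERT(ZIO_IS_NORMAL(zio));	/* normal: may bypass the queue */
}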

man/man4/zfs.4

Lines changed: 7 additions & 0 deletions
@@ -1528,6 +1528,13 @@ Default queue depth for each vdev IO allocator.
 Higher values allow for better coalescing of sequential writes before sending
 them to the disk, but can increase transaction commit times.
 .
+.It Sy zfs_vdev_queue_bypass_pct Ns = Ns Sy 10 Pq uint
+Allow bypassing the vdev's queue if the vdev queue is less than
+zfs_vdev_queue_bypass_pct percent full.
+This only applies to SSDs (non-rotational drives).
+Only "normal" (read/write) zios can bypass the queue.
+You can use 0 to always queue IOs and 100 to never queue IOs.
+.
 .It Sy zfs_vdev_failfast_mask Ns = Ns Sy 1 Pq uint
 Defines if the driver should retire on a given error type.
 The following options may be bitwise-ored together:
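For scale: assuming the stock zfs_vdev_max_active default of 1000, the default bypass percentage of 10 means normal I/Os bypass the queue while fewer than 100 queued I/Os are active:

/* Assumes the upstream default zfs_vdev_max_active = 1000. */
uint_t bypass_threshold = (1000 * 10) / 100;	/* = 100 queued-active I/Os */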

module/zfs/vdev.c

Lines changed: 2 additions & 0 deletions
@@ -5634,7 +5634,9 @@ vdev_deadman(vdev_t *vd, const char *tag)
 		 * if any I/O has been outstanding for longer than
 		 * the spa_deadman_synctime invoke the deadman logic.
 		 */
+		mutex_enter(&vq->vq_active_list_lock);
 		fio = list_head(&vq->vq_active_list);
+		mutex_exit(&vq->vq_active_list_lock);
 		delta = gethrtime() - fio->io_timestamp;
 		if (delta > spa_deadman_synctime(spa))
 			zio_deadman(fio, tag);

module/zfs/vdev_queue.c

Lines changed: 53 additions & 2 deletions
@@ -228,6 +228,12 @@ uint_t zfs_vdev_queue_depth_pct = 300;
  */
 uint_t zfs_vdev_def_queue_depth = 32;
 
+/*
+ * Allow io to bypass the queue depending on how full the queue is.
+ * 0 = never bypass, 100 = always bypass.
+ */
+uint_t zfs_vdev_queue_bypass_pct = 10;
+
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
@@ -502,6 +508,7 @@ vdev_queue_init(vdev_t *vd)
 	list_create(&vq->vq_active_list, sizeof (struct zio),
 	    offsetof(struct zio, io_queue_node.l));
 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vq->vq_active_list_lock, NULL, MUTEX_DEFAULT, NULL);
 }
 
 void
@@ -520,6 +527,7 @@ vdev_queue_fini(vdev_t *vd)
 
 	list_destroy(&vq->vq_active_list);
 	mutex_destroy(&vq->vq_lock);
+	mutex_destroy(&vq->vq_active_list_lock);
 }
 
 static void
@@ -564,15 +572,18 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_cactive[zio->io_priority]++;
-	vq->vq_active++;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (++vq->vq_ia_active == 1)
 			vq->vq_nia_credit = 1;
 	} else if (vq->vq_ia_active > 0) {
 		vq->vq_nia_credit--;
 	}
 	zio->io_queue_state = ZIO_QS_ACTIVE;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_insert_tail(&vq->vq_active_list, zio);
+	vq->vq_active++;
+	vq->vq_queued_active++;
+	mutex_exit(&vq->vq_active_list_lock);
 }
 
 static void
@@ -581,15 +592,18 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_cactive[zio->io_priority]--;
-	vq->vq_active--;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (--vq->vq_ia_active == 0)
 			vq->vq_nia_credit = 0;
 		else
 			vq->vq_nia_credit = zfs_vdev_nia_credit;
 	} else if (vq->vq_ia_active == 0)
 		vq->vq_nia_credit++;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_remove(&vq->vq_active_list, zio);
+	vq->vq_active--;
+	vq->vq_queued_active--;
+	mutex_exit(&vq->vq_active_list_lock);
 	zio->io_queue_state = ZIO_QS_NONE;
 }
 
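Note that the vq_active and vq_queued_active updates moved under the new vq_active_list_lock because the bypass path below touches the active list without ever taking vq_lock. A sketch of the resulting locking rule (summary only, derived from this diff; not code from the commit):

/*
 * vq_lock              - still guards the class queues and the
 *                        vq_cactive / vq_ia_active accounting.
 * vq_active_list_lock  - guards vq_active_list, vq_active and
 *                        vq_queued_active; taken nested inside vq_lock
 *                        on the queued path, and alone on the bypass
 *                        and deadman paths.
 */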
@@ -946,6 +960,31 @@ vdev_queue_io(zio_t *zio)
 	zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
 	zio->io_timestamp = gethrtime();
 
+	/*
+	 * Bypass queue if certain conditions are met. Queue bypassing requires
+	 * a non-rotational device. Reads / writes will attempt to bypass queue,
+	 * depending on how full the queue is. Other operations will always
+	 * queue. Bypassing the queue can lead to a 2x IOPS speed-up on some
+	 * benchmarks. If the queue is too full (due to a scrub or resilver)
+	 * then go back to queuing normal reads/writes so as not to starve out
+	 * the more important IOs.
+	 */
+	if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {
+
+		boolean_t is_bypass = vq->vq_queued_active <
+		    (zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100
+		    ? 1 : 0;
+
+		if (is_bypass) {
+			zio->io_queue_state = ZIO_QS_BYPASS;
+			mutex_enter(&vq->vq_active_list_lock);
+			list_insert_tail(&vq->vq_active_list, zio);
+			vq->vq_active++;
+			mutex_exit(&vq->vq_active_list_lock);
+			return (zio);
+		}
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_io_add(vq, zio);
 	nio = vdev_queue_io_to_issue(vq);
@@ -978,6 +1017,15 @@ vdev_queue_io_done(zio_t *zio)
 	vq->vq_io_complete_ts = now;
 	vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;
 
+	if (zio->io_queue_state == ZIO_QS_BYPASS) {
+		mutex_enter(&vq->vq_active_list_lock);
+		list_remove(&vq->vq_active_list, zio);
+		vq->vq_active--;
+		mutex_exit(&vq->vq_active_list_lock);
+		zio->io_queue_state = ZIO_QS_NONE;
+		return;
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_pending_remove(vq, zio);
 
@@ -1163,3 +1211,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
 	"Default queue depth for each allocator");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_bypass_pct, UINT, ZMOD_RW,
+	"Queue bypass percentage per vdev");
