@@ -228,6 +228,12 @@ uint_t zfs_vdev_queue_depth_pct = 300;
  */
 uint_t zfs_vdev_def_queue_depth = 32;

+/*
+ * Allow io to bypass the queue depending on how full the queue is.
+ * 0 = never bypass, 100 = always bypass.
+ */
+uint_t zfs_vdev_queue_bypass_pct = 10;
+
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
@@ -502,6 +508,7 @@ vdev_queue_init(vdev_t *vd)
 	list_create(&vq->vq_active_list, sizeof (struct zio),
 	    offsetof(struct zio, io_queue_node.l));
 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&vq->vq_active_list_lock, NULL, MUTEX_DEFAULT, NULL);
 }

 void
@@ -520,6 +527,7 @@ vdev_queue_fini(vdev_t *vd)

 	list_destroy(&vq->vq_active_list);
 	mutex_destroy(&vq->vq_lock);
+	mutex_destroy(&vq->vq_active_list_lock);
 }

 static void
@@ -564,15 +572,18 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_cactive[zio->io_priority]++;
-	vq->vq_active++;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (++vq->vq_ia_active == 1)
 			vq->vq_nia_credit = 1;
 	} else if (vq->vq_ia_active > 0) {
 		vq->vq_nia_credit--;
 	}
 	zio->io_queue_state = ZIO_QS_ACTIVE;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_insert_tail(&vq->vq_active_list, zio);
+	vq->vq_active++;
+	vq->vq_queued_active++;
+	mutex_exit(&vq->vq_active_list_lock);
 }

 static void
@@ -581,15 +592,18 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_cactive[zio->io_priority]--;
-	vq->vq_active--;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (--vq->vq_ia_active == 0)
 			vq->vq_nia_credit = 0;
 		else
 			vq->vq_nia_credit = zfs_vdev_nia_credit;
 	} else if (vq->vq_ia_active == 0)
 		vq->vq_nia_credit++;
+	mutex_enter(&vq->vq_active_list_lock);
 	list_remove(&vq->vq_active_list, zio);
+	vq->vq_active--;
+	vq->vq_queued_active--;
+	mutex_exit(&vq->vq_active_list_lock);
 	zio->io_queue_state = ZIO_QS_NONE;
 }

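A note on the locking change in the two hunks above (inferred from this diff, not stated in it): vq_active, the new vq_queued_active counter, and vq_active_list are now updated under the dedicated vq_active_list_lock rather than relying on vq_lock alone. The bypass path added further down in vdev_queue_io() and vdev_queue_io_done() manipulates the active list and vq_active without ever taking vq_lock, so the new lock is what keeps those updates consistent; inside vdev_queue_pending_add()/vdev_queue_pending_remove() it nests under vq_lock.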
@@ -946,6 +960,31 @@ vdev_queue_io(zio_t *zio)
 	zio->io_flags |= ZIO_FLAG_DONT_QUEUE;
 	zio->io_timestamp = gethrtime();

+	/*
+	 * Bypass the queue if certain conditions are met. Queue bypassing
+	 * requires a non-rotational device. Reads/writes will attempt to
+	 * bypass the queue depending on how full the queue is; other
+	 * operations will always queue. Bypassing the queue can lead to a
+	 * 2x IOPS speed-up on some benchmarks. If the queue is too full
+	 * (due to a scrub or resilver), go back to queuing normal
+	 * reads/writes so as not to starve out the more important I/Os.
+	 */
+	if (zio->io_vd->vdev_nonrot && ZIO_IS_NORMAL(zio)) {
+
+		boolean_t is_bypass = vq->vq_queued_active <
+		    (zfs_vdev_max_active * zfs_vdev_queue_bypass_pct) / 100
+		    ? 1 : 0;
+
+		if (is_bypass) {
+			zio->io_queue_state = ZIO_QS_BYPASS;
+			mutex_enter(&vq->vq_active_list_lock);
+			list_insert_tail(&vq->vq_active_list, zio);
+			vq->vq_active++;
+			mutex_exit(&vq->vq_active_list_lock);
+			return (zio);
+		}
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_io_add(vq, zio);
 	nio = vdev_queue_io_to_issue(vq);
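To make the cutoff above concrete, here is a minimal standalone sketch of the same arithmetic. It is illustrative only and not part of the patch; the example values assume the stock default zfs_vdev_max_active = 1000 together with the new default zfs_vdev_queue_bypass_pct = 10.

#include <stdbool.h>
#include <stdint.h>

/*
 * Mirrors the bypass cutoff in vdev_queue_io(): bypass is allowed while
 * fewer than (max_active * bypass_pct) / 100 queue-path I/Os are active.
 * With max_active = 1000 and bypass_pct = 10 the cutoff is 100, so reads
 * and writes bypass the queue only while fewer than 100 queued I/Os are
 * active on the vdev.
 */
static bool
queue_bypass_allowed(uint32_t queued_active, uint32_t max_active,
    uint32_t bypass_pct)
{
	return (queued_active < (max_active * bypass_pct) / 100);
}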
@@ -978,6 +1017,15 @@ vdev_queue_io_done(zio_t *zio)
 	vq->vq_io_complete_ts = now;
 	vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp;

+	if (zio->io_queue_state == ZIO_QS_BYPASS) {
+		mutex_enter(&vq->vq_active_list_lock);
+		list_remove(&vq->vq_active_list, zio);
+		vq->vq_active--;
+		mutex_exit(&vq->vq_active_list_lock);
+		zio->io_queue_state = ZIO_QS_NONE;
+		return;
+	}
+
 	mutex_enter(&vq->vq_lock);
 	vdev_queue_pending_remove(vq, zio);

@@ -1163,3 +1211,6 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, UINT, ZMOD_RW,

 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, def_queue_depth, UINT, ZMOD_RW,
 	"Default queue depth for each allocator");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_bypass_pct, UINT, ZMOD_RW,
+	"Queue bypass percentage per vdev");