Skip to content

Commit f163a44

Browse files
committed
Refactor Log Size Limit.
The original Log Size Limit implementation blocked all writes once the limit was reached, until the TXG was committed and the log was freed. This caused huge delays followed by speed spikes in application writes. This implementation instead throttles writes smoothly, using exactly the same mechanism as is used for dirty data. Signed-off-by: Alexander Motin <[email protected]> Sponsored-By: iXsystems, Inc.
1 parent c0cf6ed commit f163a44

File tree

5 files changed

+53
-33
lines changed

5 files changed

+53
-33
lines changed

include/sys/dmu_tx.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,8 @@ typedef struct dmu_tx_stats {
124124
kstat_named_t dmu_tx_dirty_throttle;
125125
kstat_named_t dmu_tx_dirty_delay;
126126
kstat_named_t dmu_tx_dirty_over_max;
127-
kstat_named_t dmu_tx_wrlog_over_max;
128127
kstat_named_t dmu_tx_dirty_frees_delay;
128+
kstat_named_t dmu_tx_wrlog_delay;
129129
kstat_named_t dmu_tx_quota;
130130
} dmu_tx_stats_t;
131131

include/sys/dsl_pool.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp,
164164
zfs_space_check_t slop_policy);
165165
uint64_t dsl_pool_deferred_space(dsl_pool_t *dp);
166166
void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg);
167-
boolean_t dsl_pool_wrlog_over_max(dsl_pool_t *dp);
167+
boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp);
168168
void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
169169
void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
170170
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);

man/man4/zfs.4

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,10 +1098,9 @@ This should be less than
10981098
.
10991099
.It Sy zfs_wrlog_data_max Ns = Pq int
11001100
The upper limit of write-transaction zil log data size in bytes.
1101-
Once it is reached, write operation is blocked, until log data is cleared out
1102-
after transaction group sync.
1103-
Because of some overhead, it should be set
1104-
at least 2 times the size of
1101+
Write operations are throttled when approaching the limit until log data is
1102+
cleared out after transaction group sync.
1103+
Because of some overhead, it should be set at least 2 times the size of
11051104
.Sy zfs_dirty_data_max
11061105
.No to prevent harming normal write throughput.
11071106
It also should be smaller than the size of the slog device if slog is present.

module/zfs/dmu_tx.c

Lines changed: 37 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ dmu_tx_stats_t dmu_tx_stats = {
5353
{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
5454
{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
5555
{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
56-
{ "dmu_tx_wrlog_over_max", KSTAT_DATA_UINT64 },
5756
{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
57+
{ "dmu_tx_wrlog_delay", KSTAT_DATA_UINT64 },
5858
{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
5959
};
6060

@@ -779,34 +779,49 @@ static void
779779
dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
780780
{
781781
dsl_pool_t *dp = tx->tx_pool;
782-
uint64_t delay_min_bytes =
782+
uint64_t delay_min_bytes, wrlog;
783+
hrtime_t wakeup, tx_time = 0, now;
784+
785+
/* Calculate minimum transaction time for the dirty data amount. */
786+
delay_min_bytes =
783787
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
784-
hrtime_t wakeup, min_tx_time, now;
788+
if (dirty > delay_min_bytes) {
789+
/*
790+
* The caller has already waited until we are under the max.
791+
* We make them pass us the amount of dirty data so we don't
792+
* have to handle the case of it being >= the max, which
793+
* could cause a divide-by-zero if it's == the max.
794+
*/
795+
ASSERT3U(dirty, <, zfs_dirty_data_max);
785796

786-
if (dirty <= delay_min_bytes)
787-
return;
797+
tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
798+
(zfs_dirty_data_max - dirty);
799+
}
788800

789-
/*
790-
* The caller has already waited until we are under the max.
791-
* We make them pass us the amount of dirty data so we don't
792-
* have to handle the case of it being >= the max, which could
793-
* cause a divide-by-zero if it's == the max.
794-
*/
795-
ASSERT3U(dirty, <, zfs_dirty_data_max);
801+
/* Calculate minimum transaction time for the TX_WRITE log size. */
802+
wrlog = aggsum_upper_bound(&dp->dp_wrlog_total);
803+
delay_min_bytes =
804+
zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
805+
if (wrlog >= zfs_wrlog_data_max) {
806+
tx_time = zfs_delay_max_ns;
807+
} else if (wrlog > delay_min_bytes) {
808+
tx_time = MAX(zfs_delay_scale * (wrlog - delay_min_bytes) /
809+
(zfs_wrlog_data_max - wrlog), tx_time);
810+
}
796811

812+
if (tx_time == 0)
813+
return;
814+
815+
tx_time = MIN(tx_time, zfs_delay_max_ns);
797816
now = gethrtime();
798-
min_tx_time = zfs_delay_scale *
799-
(dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
800-
min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
801-
if (now > tx->tx_start + min_tx_time)
817+
if (now > tx->tx_start + tx_time)
802818
return;
803819

804820
DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
805-
uint64_t, min_tx_time);
821+
uint64_t, tx_time);
806822

807823
mutex_enter(&dp->dp_lock);
808-
wakeup = MAX(tx->tx_start + min_tx_time,
809-
dp->dp_last_wakeup + min_tx_time);
824+
wakeup = MAX(tx->tx_start + tx_time, dp->dp_last_wakeup + tx_time);
810825
dp->dp_last_wakeup = wakeup;
811826
mutex_exit(&dp->dp_lock);
812827

@@ -884,8 +899,9 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
884899
}
885900

886901
if (!tx->tx_dirty_delayed &&
887-
dsl_pool_wrlog_over_max(tx->tx_pool)) {
888-
DMU_TX_STAT_BUMP(dmu_tx_wrlog_over_max);
902+
dsl_pool_need_wrlog_delay(tx->tx_pool)) {
903+
tx->tx_wait_dirty = B_TRUE;
904+
DMU_TX_STAT_BUMP(dmu_tx_wrlog_delay);
889905
return (SET_ERROR(ERESTART));
890906
}
891907

module/zfs/dsl_pool.c

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,8 @@ int zfs_dirty_data_max_percent = 10;
105105
int zfs_dirty_data_max_max_percent = 25;
106106

107107
/*
108-
* zfs_wrlog_data_max, the upper limit of TX_WRITE log data.
109-
* Once it is reached, write operation is blocked,
110-
* until log data is cleared out after txg sync.
108+
* The upper limit of TX_WRITE log data. Write operations are throttled
109+
* when approaching the limit until log data is cleared out after txg sync.
111110
* It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY.
112111
*/
113112
unsigned long zfs_wrlog_data_max = 0;
@@ -623,15 +622,18 @@ dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg)
623622

624623
/* Choose a value slightly bigger than min dirty sync bytes */
625624
uint64_t sync_min =
626-
zfs_dirty_data_max * (zfs_dirty_data_sync_percent + 10) / 100;
625+
zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200;
627626
if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0)
628627
txg_kick(dp, txg);
629628
}
630629

631630
boolean_t
632-
dsl_pool_wrlog_over_max(dsl_pool_t *dp)
631+
dsl_pool_need_wrlog_delay(dsl_pool_t *dp)
633632
{
634-
return (aggsum_compare(&dp->dp_wrlog_total, zfs_wrlog_data_max) > 0);
633+
uint64_t delay_min_bytes =
634+
zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100;
635+
636+
return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0);
635637
}
636638

637639
static void
@@ -641,6 +643,9 @@ dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg)
641643
delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
642644
aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta);
643645
aggsum_add(&dp->dp_wrlog_total, delta);
646+
/* Compact per-CPU sums after the big change. */
647+
(void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]);
648+
(void) aggsum_value(&dp->dp_wrlog_total);
644649
}
645650

646651
#ifdef ZFS_DEBUG

0 commit comments

Comments
 (0)