@@ -22,6 +22,14 @@ import (
22
22
opentracing "github.com/opentracing/opentracing-go"
23
23
)
24
24
25
+ type txnHeartbeaterStatus byte
26
+
27
+ const (
28
+ txnHeartbeaterStatusReady txnHeartbeaterStatus = iota
29
+ txnHeartbeaterStatusRunning
30
+ txnHeartbeaterStatusRejecting
31
+ )
32
+
25
33
// txnHeartbeater is a txnInterceptor in charge of a transaction's heartbeat
26
34
// loop. Transaction coordinators heartbeat their transaction record
27
35
// periodically to indicate the liveness of their transaction. Other actors like
@@ -84,9 +92,9 @@ type txnHeartbeater struct {
84
92
mu struct {
85
93
sync.Locker
86
94
87
- // loopStarted indicates whether the heartbeat loop has been launched
95
+ // status indicates whether the heartbeat loop has been launched
88
96
// for the transaction or not. It remains true once the loop terminates.
89
- loopStarted bool
97
+ status txnHeartbeaterStatus
90
98
91
99
// txnEnd is closed when the transaction is aborted or committed, terminating
92
100
// the heartbeat loop. Nil if the heartbeat loop is not running.
@@ -122,20 +130,21 @@ func (h *txnHeartbeater) init(
122
130
h .metrics = metrics
123
131
h .mu .Locker = mu
124
132
h .mu .txn = txn
133
+ h .mu .status = txnHeartbeaterStatusReady
125
134
h .gatekeeper = gatekeeper
126
135
h .asyncAbortCallbackLocked = asyncAbortCallbackLocked
127
136
}
128
137
129
- // SendLocked is part of the txnInteceptor interface.
138
+ // SendLocked is part of the txnInterceptor interface.
130
139
func (h * txnHeartbeater ) SendLocked (
131
140
ctx context.Context , ba roachpb.BatchRequest ,
132
141
) (* roachpb.BatchResponse , * roachpb.Error ) {
133
142
// If finalErr is set, we reject everything but rollbacks.
134
- if h .mu .finalErr != nil {
143
+ if h .mu .status == txnHeartbeaterStatusRejecting {
135
144
singleRollback := ba .IsSingleEndTransactionRequest () &&
136
145
! ba .Requests [0 ].GetInner ().(* roachpb.EndTransactionRequest ).Commit
137
146
if ! singleRollback {
138
- return nil , h . mu . finalErr
147
+ return nil , roachpb . NewErrorf ( "programming error: txnHeartbeater is finalized" )
139
148
}
140
149
}
141
150
@@ -161,12 +170,9 @@ func (h *txnHeartbeater) SendLocked(
161
170
// Note that we don't do it for 1PC txns: they only leave intents around on
162
171
// retriable errors if the batch has been split between ranges. We consider
163
172
// that unlikely enough so we prefer to not pay for a goroutine.
164
- if ! h .mu .loopStarted {
173
+ if h .mu .status == txnHeartbeaterStatusReady {
165
174
if _ , haveEndTxn := ba .GetArg (roachpb .EndTransaction ); ! haveEndTxn {
166
- if err := h .startHeartbeatLoopLocked (ctx ); err != nil {
167
- h .mu .finalErr = roachpb .NewError (err )
168
- return nil , h .mu .finalErr
169
- }
175
+ h .startHeartbeatLoopLocked (ctx )
170
176
}
171
177
}
172
178
}
@@ -200,13 +206,13 @@ func (h *txnHeartbeater) closeLocked() {
200
206
}
201
207
202
208
// startHeartbeatLoopLocked starts a heartbeat loop in a different goroutine.
203
- func (h * txnHeartbeater ) startHeartbeatLoopLocked (ctx context.Context ) error {
204
- if h .mu .loopStarted || h .mu .txnEnd != nil {
209
+ func (h * txnHeartbeater ) startHeartbeatLoopLocked (ctx context.Context ) {
210
+ if h .mu .status != txnHeartbeaterStatusReady || h .mu .txnEnd != nil {
205
211
log .Fatal (ctx , "attempting to start a second heartbeat loop " )
206
212
}
207
213
208
214
log .VEventf (ctx , 2 , "coordinator spawns heartbeat loop" )
209
- h .mu .loopStarted = true
215
+ h .mu .status = txnHeartbeaterStatusRunning
210
216
h .mu .txnEnd = make (chan struct {})
211
217
212
218
// Create a new context so that the heartbeat loop doesn't inherit the
@@ -217,10 +223,12 @@ func (h *txnHeartbeater) startHeartbeatLoopLocked(ctx context.Context) error {
217
223
hbCtx := h .AnnotateCtx (context .Background ())
218
224
hbCtx = opentracing .ContextWithSpan (hbCtx , opentracing .SpanFromContext (ctx ))
219
225
220
- return h .stopper .RunAsyncTask (
226
+ if err := h .stopper .RunAsyncTask (
221
227
hbCtx , "kv.TxnCoordSender: heartbeat loop" , func (ctx context.Context ) {
222
228
h .heartbeatLoop (ctx )
223
- })
229
+ }); err != nil {
230
+ h .mu .status = txnHeartbeaterStatusRejecting
231
+ }
224
232
}
225
233
226
234
// heartbeatLoop periodically sends a HeartbeatTxn request to the transaction
@@ -234,13 +242,10 @@ func (h *txnHeartbeater) heartbeatLoop(ctx context.Context) {
234
242
defer ticker .Stop ()
235
243
}
236
244
237
- var finalErr * roachpb.Error
238
245
defer func () {
239
246
h .mu .Lock ()
240
- // Prevent future SendLocked() calls.
241
- if finalErr != nil {
242
- h .mu .finalErr = finalErr
243
- }
247
+ // Prevent future SendLocked() calls (except rollbacks).
248
+ h .mu .status = txnHeartbeaterStatusRejecting
244
249
if h .mu .txnEnd != nil {
245
250
h .mu .txnEnd = nil
246
251
}
@@ -264,15 +269,12 @@ func (h *txnHeartbeater) heartbeatLoop(ctx context.Context) {
264
269
// This error we're generating here should not be seen by clients. Since
265
270
// the transaction is aborted, they should be rejected before they reach
266
271
// this interceptor.
267
- finalErr = roachpb .NewErrorf ("heartbeat failed fatally" )
268
272
return
269
273
}
270
274
case <- closer :
271
275
// Transaction finished normally.
272
- finalErr = roachpb .NewErrorf ("txnHeartbeater already closed" )
273
276
return
274
277
case <- h .stopper .ShouldQuiesce ():
275
- finalErr = roachpb .NewErrorf ("node already quiescing" )
276
278
return
277
279
}
278
280
}
@@ -337,8 +339,16 @@ func (h *txnHeartbeater) heartbeat(ctx context.Context) bool {
337
339
// TODO(nvanbenschoten): Make this the only case where we get back an
338
340
// Aborted txn.
339
341
if _ , ok := pErr .GetDetail ().(* roachpb.TransactionAbortedError ); ok {
340
- h .mu .txn .Status = roachpb .ABORTED
342
+ // Note that it's possible that the txn actually committed but its
343
+ // record got GC'ed. In that case, aborting won't hurt anyone though,
344
+ // since all intents have already been resolved.
345
+ // The only thing we must ascertain is that we don't tell the client
346
+ // about this error - it will get either a definitive result of
347
+ // its commit or an ambiguous one and we have nothing to offer that
348
+ // provides more clarity. We do however prevent it from running more
349
+ // requests in case it isn't aware that the transaction is over.
341
350
log .VEventf (ctx , 1 , "Heartbeat detected aborted txn. Cleaning up." )
351
+ h .mu .status = txnHeartbeaterStatusRejecting
342
352
h .abortTxnAsyncLocked (ctx )
343
353
return false
344
354
}
@@ -382,9 +392,6 @@ func (h *txnHeartbeater) heartbeat(ctx context.Context) bool {
382
392
// abortTxnAsyncLocked send an EndTransaction(commmit=false) asynchronously.
383
393
// The asyncAbortCallbackLocked callback is also called.
384
394
func (h * txnHeartbeater ) abortTxnAsyncLocked (ctx context.Context ) {
385
- if h .mu .txn .Status != roachpb .ABORTED {
386
- log .Fatalf (ctx , "abortTxnAsyncLocked called for non-aborted txn: %s" , h .mu .txn )
387
- }
388
395
h .asyncAbortCallbackLocked (ctx )
389
396
txn := h .mu .txn .Clone ()
390
397
0 commit comments