Skip to content

Commit 24feca7

Browse files
craig[bot]tbg
craig[bot]
andcommitted
Merge #37205
37205: roachtest: reduce hangs in acceptance-chaos tests r=andreimatei a=tbg These tests are pretty janky, and can end up failing with a timeout and a deadlocked test, which is not something roachtest can really ever handle gracefully. Sprinkle more contexts around and set a statement timeout for the central query that is most likely to get stuck under the crucial lock that we think "causes" most of the deadlocks. Of course there is likely a real problem with CRDB, which this PR does nothing about. All that is (hopefully) achieved here is a clean failure mode. The failure prompting this PR is fixed by #37204, unfortunately it also turns out that the statement timeout added in this PR did not prevent the statement from hanging. It is probably still worth merging this. Release note: None Co-authored-by: Tobias Schottdorf <[email protected]>
2 parents 468b93b + d17a695 commit 24feca7

File tree

1 file changed

+23
-15
lines changed

1 file changed

+23
-15
lines changed

pkg/cmd/roachtest/bank.go

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"github.com/cockroachdb/cockroach/pkg/util/retry"
3434
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
3535
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
36+
"github.com/pkg/errors"
3637
)
3738

3839
const (
@@ -46,28 +47,35 @@ type bankClient struct {
4647
count uint64
4748
}
4849

49-
func (client *bankClient) transferMoney(numAccounts, maxTransfer int) error {
50+
func (client *bankClient) transferMoney(ctx context.Context, numAccounts, maxTransfer int) error {
5051
from := rand.Intn(numAccounts)
5152
to := rand.Intn(numAccounts - 1)
5253
if from == to {
5354
to = numAccounts - 1
5455
}
5556
amount := rand.Intn(maxTransfer)
5657

57-
const update = `
58+
tBegin := timeutil.Now()
59+
60+
// If this statement gets stuck, the test harness will get stuck. Run with a
61+
// statement timeout, which unfortunately precludes the use of prepared
62+
// statements.
63+
q := fmt.Sprintf(`
64+
SET statement_timeout = '31s';
5865
UPDATE bank.accounts
59-
SET balance = CASE id WHEN $1 THEN balance-$3 WHEN $2 THEN balance+$3 END
60-
WHERE id IN ($1, $2) AND (SELECT balance >= $3 FROM bank.accounts WHERE id = $1)
61-
`
66+
SET balance = CASE id WHEN %[1]d THEN balance-%[3]d WHEN %[2]d THEN balance+%[3]d END
67+
WHERE id IN (%[1]d, %[2]d) AND (SELECT balance >= %[3]d FROM bank.accounts WHERE id = %[1]d);
68+
`, from, to, amount)
69+
6270
client.RLock()
6371
defer client.RUnlock()
64-
_, err := client.db.Exec(update, from, to, amount)
72+
_, err := client.db.ExecContext(ctx, q)
6573
if err == nil {
6674
// Do all increments under the read lock so that grabbing a write lock in
6775
// startChaosMonkey below guarantees no more increments could be incoming.
6876
atomic.AddUint64(&client.count, 1)
6977
}
70-
return err
78+
return errors.Wrapf(err, "after %.1fs", timeutil.Since(tBegin).Seconds())
7179
}
7280

7381
type bankState struct {
@@ -111,12 +119,12 @@ func (s *bankState) initBank(ctx context.Context, t *test, c *cluster) {
111119
db := c.Conn(ctx, 1)
112120
defer db.Close()
113121

114-
if _, err := db.Exec(`CREATE DATABASE IF NOT EXISTS bank`); err != nil {
122+
if _, err := db.ExecContext(ctx, `CREATE DATABASE IF NOT EXISTS bank`); err != nil {
115123
t.Fatal(err)
116124
}
117125

118126
// Delete table created by a prior instance of a test.
119-
if _, err := db.Exec(`DROP TABLE IF EXISTS bank.accounts`); err != nil {
127+
if _, err := db.ExecContext(ctx, `DROP TABLE IF EXISTS bank.accounts`); err != nil {
120128
t.Fatal(err)
121129
}
122130

@@ -125,7 +133,7 @@ CREATE TABLE bank.accounts (
125133
id INT PRIMARY KEY,
126134
balance INT NOT NULL
127135
)`
128-
if _, err := db.Exec(schema); err != nil {
136+
if _, err := db.ExecContext(ctx, schema); err != nil {
129137
t.Fatal(err)
130138
}
131139

@@ -139,7 +147,7 @@ CREATE TABLE bank.accounts (
139147
values = append(values, i)
140148
}
141149
stmt := `INSERT INTO bank.accounts (id, balance) VALUES ` + placeholders.String()
142-
if _, err := db.Exec(stmt, values...); err != nil {
150+
if _, err := db.ExecContext(ctx, stmt, values...); err != nil {
143151
t.Fatal(err)
144152
}
145153
}
@@ -151,7 +159,7 @@ func (s *bankState) transferMoney(
151159
defer c.l.Printf("client %d shutting down\n", idx)
152160
client := &s.clients[idx-1]
153161
for !s.done(ctx) {
154-
if err := client.transferMoney(numAccounts, maxTransfer); err != nil {
162+
if err := client.transferMoney(ctx, numAccounts, maxTransfer); err != nil {
155163
// Ignore some errors.
156164
if !pgerror.IsSQLRetryableError(err) {
157165
// Report the err and terminate.
@@ -180,7 +188,7 @@ func (s *bankState) verifyAccounts(ctx context.Context, t *test) {
180188
// chaos monkey.
181189
client.RLock()
182190
defer client.RUnlock()
183-
err := client.db.QueryRow("SELECT count(*), sum(balance) FROM bank.accounts").Scan(&accounts, &sum)
191+
err := client.db.QueryRowContext(ctx, "SELECT count(*), sum(balance) FROM bank.accounts").Scan(&accounts, &sum)
184192
if err != nil && !pgerror.IsSQLRetryableError(err) {
185193
t.Fatal(err)
186194
}
@@ -311,7 +319,7 @@ func (s *bankState) startSplitMonkey(ctx context.Context, d time.Duration, c *cl
311319
zipF := accountDistribution(r)
312320
key := zipF.Uint64()
313321
c.l.Printf("round %d: splitting key %v\n", curRound, key)
314-
_, err := client.db.Exec(fmt.Sprintf(
322+
_, err := client.db.ExecContext(ctx, fmt.Sprintf(
315323
`SET experimental_force_split_at = true; ALTER TABLE bank.accounts SPLIT AT VALUES (%d)`, key))
316324
if err != nil && !(pgerror.IsSQLRetryableError(err) || isExpectedRelocateError(err)) {
317325
s.errChan <- err
@@ -333,7 +341,7 @@ func (s *bankState) startSplitMonkey(ctx context.Context, d time.Duration, c *cl
333341
c.l.Printf("round %d: relocating key %d to nodes %s\n",
334342
curRound, key, nodes[1:])
335343

336-
_, err := client.db.Exec(relocateQuery)
344+
_, err := client.db.ExecContext(ctx, relocateQuery)
337345
if err != nil && !(pgerror.IsSQLRetryableError(err) || isExpectedRelocateError(err)) {
338346
s.errChan <- err
339347
}

0 commit comments

Comments
 (0)