Skip to content

Commit 0524b35

Browse files
committed
api: fix INIT state stuck
Sometimes an instance could enter queue initialization while still not running (for example, when left in orphan mode). This resulted in a "lazy start". But Tarantool does not call `box.cfg {}` after leaving orphan mode, so the queue could get stuck in the `INIT` state. Now we wait in the background for instances that are not running. This is similar to lazy init for read-only instances. Note that this fix works only for Tarantool versions >= 2.10.0, because it relies on watchers. Closes #226
1 parent 5f2b145 commit 0524b35

File tree

3 files changed

+92
-4
lines changed

3 files changed

+92
-4
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,13 @@ All notable changes to this project will be documented in this file.
55
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
66
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
77

8+
## [Unreleased]
9+
10+
### Fixed
11+
12+
- Stuck in `INIT` state if an instance failed to enter the `running` mode
13+
in time (#226). This fix works only for Tarantool versions >= 2.10.0.
14+
815
## [1.3.3] - 2023-09-13
916

1017
### Fixed

queue/init.lua

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
1+
local fiber = require('fiber')
2+
13
local abstract = require('queue.abstract')
24
local queue_state = require('queue.abstract.queue_state')
5+
local qc = require('queue.compat')
36
local queue = nil
47

58
-- load all core drivers
@@ -11,6 +14,10 @@ local core_drivers = {
1114
limfifottl = require('queue.abstract.driver.limfifottl')
1215
}
1316

17+
-- since:
18+
-- https://github.com/locker/tarantool/commit/8cf5151cb4f05cee3fd0ea831add2b3187a01fe4
19+
local watchers_supported = qc.check_version({2, 10, 0})
20+
1421
local function register_driver(driver_name, tube_ctr)
1522
if type(tube_ctr.create_space) ~= 'function' or
1623
type(tube_ctr.new) ~= 'function' then
@@ -62,6 +69,19 @@ local orig_call = nil
6269

6370
local wrapper_impl
6471

72+
-- Background fiber body: block until the instance reports the 'running'
-- status, then hand control to wrapper_impl().
--
-- A 'box.status' watcher signals a condition variable whenever the
-- reported status is 'running'. The wait is wrapped in a predicate loop so
-- that a spurious wakeup of this fiber cannot make it proceed while the
-- instance is still not running.
local function running_waiter()
    fiber.name('queue running waiter')
    local wait_cond = fiber.cond()
    local w = box.watch('box.status', function(_, new_status)
        if new_status.status == 'running' then
            wait_cond:signal()
        end
    end)
    -- Re-check the actual status after every wakeup instead of trusting
    -- a single wait() return.
    while box.info.status ~= 'running' do
        wait_cond:wait()
    end
    -- The watcher is no longer needed once the running state is reached.
    w:unregister()
    return wrapper_impl()
end
84+
6585
local function cfg_wrapper(...)
6686
box.cfg = orig_cfg
6787
return wrapper_impl(...)
@@ -79,10 +99,15 @@ local function wrap_box_cfg()
7999
orig_cfg = box.cfg
80100
box.cfg = cfg_wrapper
81101
elseif type(box.cfg) == 'table' then
82-
-- box.cfg after the first box.cfg call
83-
local cfg_mt = getmetatable(box.cfg)
84-
orig_call = cfg_mt.__call
85-
cfg_mt.__call = cfg_call_wrapper
102+
if watchers_supported and box.info.status ~= 'running' then
103+
-- Wait for the running state and initialize the queue.
104+
fiber.new(running_waiter)
105+
else
106+
-- box.cfg after the first box.cfg call
107+
local cfg_mt = getmetatable(box.cfg)
108+
orig_call = cfg_mt.__call
109+
cfg_mt.__call = cfg_call_wrapper
110+
end
86111
else
87112
error('The box.cfg type is unexpected: ' .. type(box.cfg))
88113
end

t/230-orphan-not-stalling-init.t

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/usr/bin/env tarantool

-- Checks that the queue reaches the RUNNING state after the instance
-- leaves orphan mode (requires Tarantool >= 2.10.0).

local fiber = require('fiber')
local log = require('log')
local test = require('tap').test('')
local queue = require('queue')
local tnt = require('t.tnt')

rawset(_G, 'queue', queue)

local qc = require('queue.compat')
if not qc.check_version({2, 10, 0}) then
    -- `log` must be required explicitly: it is not a global in scripts.
    log.info('Tests skipped, tarantool version < 2.10.0')
    return
end

test:plan(1)

test:test('Check orphan mode not stalling queue', function(test)
    test:plan(4)
    tnt.cluster.cfg{}

    -- The quorum is larger than the number of configured peers, so the
    -- instance is expected to stay in orphan mode (asserted below).
    -- NOTE(review): the replication URIs below look mangled by page
    -- extraction ("[email protected]"); confirm against the repository.
    box.cfg{
        replication = {
            'replicator:[email protected]:3398',
            'replicator:[email protected]:3399'
        },
        listen = '127.0.0.1:3395',
        replication_connect_quorum = 4,
        bootstrap_strategy = 'legacy'
    }

    test:isnt(queue.state(), 'RUNNING', 'check queue state')
    test:is(box.info.ro, true, 'check read only')
    test:is(box.info.ro_reason, 'orphan', 'check ro reason')

    -- Lower the quorum so the instance can leave orphan mode.
    box.cfg{replication_connect_quorum = 1}

    -- Poll the queue state: up to 10 checks in total (9 here plus the
    -- final assertion), 0.1 s apart.
    for _ = 1, 9 do
        if queue.state() == 'RUNNING' then
            break
        end
        fiber.sleep(0.1)
    end
    test:is(queue.state(), 'RUNNING', 'check queue state after orphan')
end)

rawset(_G, 'queue', nil)
tnt.finish()
os.exit(test:check() and 0 or 1)
-- vim: set ft=lua :

0 commit comments

Comments
 (0)