Skip to content

Commit ad6f6cb

Browse files
committed
auto merge of #10817 : alexcrichton/rust/sched-fix, r=brson
Right now, as pointed out in #8132, it is very easy to introduce a subtle race in the runtime. I believe that this is the cause of the current flakiness on the bots. I have taken the last idea mentioned in that issue which is to use a lock around descheduling and context switching in order to solve this race. Closes #8132
2 parents 3cc86d8 + be0580b commit ad6f6cb

File tree

2 files changed

+49
-4
lines changed

2 files changed

+49
-4
lines changed

src/libstd/rt/sched.rs

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@ use borrow::{to_uint};
2727
use cell::Cell;
2828
use rand::{XorShiftRng, Rng, Rand};
2929
use iter::range;
30+
use unstable::mutex::Mutex;
3031
use vec::{OwnedVector};
3132

33+
3234
/// A scheduler is responsible for coordinating the execution of Tasks
3335
/// on a single thread. The scheduler runs inside a slightly modified
3436
/// Rust Task. When not running this task is stored in the scheduler
@@ -618,6 +620,12 @@ impl Scheduler {
618620
unsafe {
619621
let task: *mut Task = Local::unsafe_borrow();
620622
(*task).sched.get_mut_ref().run_cleanup_job();
623+
624+
// See the comments in switch_running_tasks_and_then for why a lock
625+
// is acquired here. This is the resumption point and the "bounce"
626+
// that it is referring to.
627+
(*task).nasty_deschedule_lock.lock();
628+
(*task).nasty_deschedule_lock.unlock();
621629
}
622630
}
623631

@@ -671,6 +679,15 @@ impl Scheduler {
671679
/// This passes a Scheduler pointer to the fn after the context switch
672680
/// in order to prevent that fn from performing further scheduling operations.
673681
/// Doing further scheduling could easily result in infinite recursion.
682+
///
683+
/// Note that if the closure provided relinquishes ownership of the
684+
/// BlockedTask, then it is possible for the task to resume execution before
685+
/// the closure has finished executing. This would naturally introduce a
686+
/// race if the closure and task shared portions of the environment.
687+
///
688+
/// This situation is currently prevented, or in other words it is
689+
/// guaranteed that this function will not return before the given closure
690+
/// has returned.
674691
pub fn deschedule_running_task_and_then(mut ~self,
675692
f: |&mut Scheduler, BlockedTask|) {
676693
// Trickier - we need to get the scheduler task out of self
@@ -682,10 +699,29 @@ impl Scheduler {
682699

683700
pub fn switch_running_tasks_and_then(~self, next_task: ~Task,
684701
f: |&mut Scheduler, BlockedTask|) {
685-
// This is where we convert the BlockedTask-taking closure into one
686-
// that takes just a Task
687-
self.change_task_context(next_task, |sched, task| {
688-
f(sched, BlockedTask::block(task))
702+
// And here comes one of the sad moments in which a lock is used in a
703+
// core portion of the rust runtime. As always, this is highly
704+
// undesirable, so there's a good reason behind it.
705+
//
706+
// There is an excellent outline of the problem in issue #8132, and it is
707+
// summarized there: `f` is executed on a sched task, but its
708+
// environment is on the previous task. If `f` relinquishes ownership of
709+
// the BlockedTask, then it may introduce a race where `f` is using the
710+
// environment as well as the code after the 'deschedule' block.
711+
//
712+
// The solution we have chosen to adopt for now is to acquire a
713+
// task-local lock around this block. The resumption of the task in
714+
// context switching will bounce on the lock, thereby waiting for this
715+
// block to finish, eliminating the race mentioned above.
716+
//
717+
// To actually maintain a handle to the lock, we use an unsafe pointer
718+
// to it, but we're guaranteed that the task won't exit until we've
719+
// unlocked the lock so there's no worry of this memory going away.
720+
self.change_task_context(next_task, |sched, mut task| {
721+
let lock: *mut Mutex = &mut task.nasty_deschedule_lock;
722+
unsafe { (*lock).lock() }
723+
f(sched, BlockedTask::block(task));
724+
unsafe { (*lock).unlock() }
689725
})
690726
}
691727

src/libstd/rt/task.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ use rt::sched::{Scheduler, SchedHandle};
3737
use rt::stack::{StackSegment, StackPool};
3838
use send_str::SendStr;
3939
use unstable::finally::Finally;
40+
use unstable::mutex::Mutex;
4041

4142
// The Task struct represents all state associated with a rust
4243
// task. There are at this point two primary "subtypes" of task,
@@ -59,6 +60,9 @@ pub struct Task {
5960
// Dynamic borrowck debugging info
6061
borrow_list: Option<~[BorrowRecord]>,
6162
stdout_handle: Option<~Writer>,
63+
64+
// See the comments in the scheduler about why this is necessary
65+
nasty_deschedule_lock: Mutex,
6266
}
6367

6468
pub enum TaskType {
@@ -193,6 +197,7 @@ impl Task {
193197
task_type: SchedTask,
194198
borrow_list: None,
195199
stdout_handle: None,
200+
nasty_deschedule_lock: unsafe { Mutex::new() },
196201
}
197202
}
198203

@@ -227,6 +232,7 @@ impl Task {
227232
task_type: GreenTask(Some(home)),
228233
borrow_list: None,
229234
stdout_handle: None,
235+
nasty_deschedule_lock: unsafe { Mutex::new() },
230236
}
231237
}
232238

@@ -249,6 +255,7 @@ impl Task {
249255
task_type: GreenTask(Some(home)),
250256
borrow_list: None,
251257
stdout_handle: None,
258+
nasty_deschedule_lock: unsafe { Mutex::new() },
252259
}
253260
}
254261

@@ -391,6 +398,8 @@ impl Drop for Task {
391398
fn drop(&mut self) {
392399
rtdebug!("called drop for a task: {}", borrow::to_uint(self));
393400
rtassert!(self.destroyed);
401+
402+
unsafe { self.nasty_deschedule_lock.destroy(); }
394403
}
395404
}
396405

0 commit comments

Comments
 (0)