
Comments (5)

mratsim commented on May 20, 2024

See: https://www.usenix.org/legacy/publications/compsystems/1990/sum_ruane.pdf

Any operating system kernel has some form of process synchronization, allowing a process to wait for a particular condition. The traditional choice for UNIX systems, the event-wait mechanism, leads to race conditions on multiprocessors.
This problem was initially solved in Amdahl's UTS multiprocessing kernel by replacing the event-wait mechanism with Dijkstra semaphores. The kernel, however, became noticeably more complicated and less reliable when based on semaphores.
This has led us to develop a race-free multiprocessor event-wait mechanism with some novel properties. A few common synchronization techniques have emerged, the most complex of which was verified correct with the supertrace protocol validation system spin. A new scheduling approach with per-CPU run queues reduces the number of unnecessary context switches due to awakening all waiting processes. The overall approach is claimed to be simple, efficient, and reliable.


mratsim commented on May 20, 2024

After modeling the runtime's Backoff mechanism in a formal language and verifying it by model checking, I believe I've narrowed down the deadlock.

In notify:

func notify*(en: var EventNotifier) {.inline.} =
  ## Signal a thread that it can be unparked
  # No thread waiting, return
  let consumerState = en.consumerState.load(moRelaxed)
  if consumerState in {Busy, ShouldWakeup}:
    fence(moAcquire)
    return

The parent worker loads its child worker's state into a register and exits if it is Busy or ShouldWakeup.
But while this happens, the child can send its intent to sleep:
func intendToSleep*(en: var EventNotifier) {.inline.} =
  ## The consumer intends to sleep soon.
  ## This must be called before the formal notification
  ## via a channel.
  assert en.consumerState.load(moRelaxed) == Busy
  fence(moRelease)
  en.consumerState.store(IntendToSleep, moRelaxed)

and then follow up by sleeping:
func wait*(en: var EventNotifier) {.inline.} =
  ## Wait until we are signaled of an event
  ## Thread is parked and does not consume CPU resources
  var expected = IntendToSleep
  if compareExchange(en.consumerState, expected, Parked, moAcquireRelease):
    while en.consumerState.load(moRelaxed) == Parked:
      # We only used the lock for the condition variable, we protect via atomics otherwise
      fence(moAcquire)
      en.cond.wait(en.lock)

The parent thread then sends its shutdown signal, but it is never received.
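
To make the suspected interleaving concrete, here is a minimal single-threaded replay of it. The types and names are simplified stand-ins, not Weave's actual EventNotifier; the sketch only shows why the parent's early return loses the wakeup.

import std/atomics

type ConsumerState = enum
  Busy, IntendToSleep, Parked, ShouldWakeup

var consumerState: Atomic[ConsumerState]
consumerState.store(Busy, moRelaxed)

# Step 1 - parent, start of notify(): it samples the state and sees Busy.
let parentView = consumerState.load(moRelaxed)
doAssert parentView == Busy  # the parent will take the early-return branch

# Step 2 - child, intendToSleep() then wait(): this runs before the parent
# acts on its (now stale) view; the consumer ends up Parked.
consumerState.store(IntendToSleep, moRelaxed)
var expected = IntendToSleep
doAssert consumerState.compareExchange(expected, Parked, moAcquireRelease)

# Step 3 - parent, rest of notify(): with parentView == Busy it returns without
# storing ShouldWakeup and without signaling the condition variable, so the
# Parked consumer would block in en.cond.wait() forever: the wakeup is lost.
doAssert consumerState.load(moRelaxed) == Parked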

Formal specification in TLA+

------------------- MODULE event_notifiers -----------------------
(*
Formal specification of the event_notifiers data structure.
It allows a single consumer to be put to sleep and woken up
by multiple producers so that the consumer is able to consume the incoming messages (steal requests).
It combines a prepare phase "intendToSleep" with commit phase "wait".
In between the consumer sends a message to its parent that it is going to sleep.
The commit phase is aborted when any producers signal an incoming message.
There should be no deadlock, i.e. no incoming message being signaled while the consumer stays asleep,
as in the runtime the main thread may be the only one awake and wouldn't be able to awaken its children
in that case.
*)
EXTENDS Integers, TLC, Sequences, FiniteSets

CONSTANTS NumThreads, ConsumerTID
ASSUME NumThreads > 1
ASSUME ConsumerTID > 0
ASSUME ConsumerTID < NumThreads

MaxID == NumThreads-1
ParentTID == ConsumerTID \div 2
producers == (0..MaxID) \ {ConsumerTID, ParentTID}

(* PlusCal options (-termination) *)
(* --algorithm event_notifier

variables
    consumerState = "Busy";
    signaled = FALSE;       \* Simulate a condition variable
    msgToParent = "None";   \* Simulate a message to parent. I.e. an opportunity for thread interleaving

macro intendToSleep() begin
    consumerState := "IntendToSleep"
end macro

\* Everything in a macro happens atomically
macro atomicCompareExchange(result, current, expected, newVal) begin
    if current = expected then
        current := newVal;
        result := TRUE;
    else
        result := FALSE;
    end if;
end macro;

\* Consumer wait until it is signaled to wakeup
procedure wait()
    variables casSuccess = FALSE;
    begin

    WCAS1: atomicCompareExchange(casSuccess, consumerState, "IntendToSleep", "Parked");
           if casSuccess then
    W2:       while consumerState = "Parked" do
              \* The while loop protects against spurious wakeups
    WCV3:         await signaled;
                  signaled := FALSE;
              end while;
           end if;
    W4:    assert consumerState \in {"Parked", "ShouldWakeup"};
    W5:    consumerState := "Busy";
    W8:    return;
end procedure;

\* Notify the potentially waiting consumer that it should wake up
procedure notify()
    variables localConsumerState = "N/A"; \* local view of the consumer state
    begin

    N1:    localConsumerState := consumerState;
    N2:    if localConsumerState \in {"Busy", "ShouldWakeup"} then
    N3:        return;
           end if;

    N4:    consumerState := "ShouldWakeup";
    N5:    while TRUE do
    NSIG6:     signaled := TRUE;
    N7:        if consumerState /= "Busy" then
    N8:          skip;
               else
    N9:          return;
               end if;
           end while;
end procedure;

procedure mayRequestWork()
    begin MaySteal:
        either
            \* Sometimes you have enough work
            NoSteal: skip;
        or
            \* Sometimes you don't and you steal
            Steal: call notify();
        end either;
        ReqRET: return;
end procedure

procedure mayShareWork()
    \* A parent can also share work with a
    \* a child that sent it "Waiting"
    begin MayShare:
        either
            \* sometimes the parent doesn't have work
            NoWork: skip;
        or
            \* Sometimes it has some
            Share0: if msgToParent = "Waiting" then
            Share1:    call notify();         \* wakeup the child
            Share2:    msgToParent := "None"; \* dequeue the child steal request
                    end if;
        end either;
        ShareRET: return;
end procedure;

\* Not fair because they might never steal
process producer \in producers
    begin Coworkers:
        call mayRequestWork();
end process;

\* a parent will always run at least the termination
fair process parent = ParentTID
    \* The order of work sharing and work stealing is arbitrary
    begin ParentWork:
        either
            PMayRW0: call mayRequestWork();
            PMaySW0: call mayShareWork();
        or
            PMaySW1: call mayShareWork();
            PMayRW1: call mayRequestWork();
        end either;
        \* But it will for sure tell the consumer to terminate at one point
        Terminate: call notify();
end process;

process consumer = ConsumerTID
    begin ConsumerWork:
        either
            \* if we have work we work on it
            FoundWork: skip;
        or
            \* we signal our intent to sleep, tell our parent and then sleep
            Sleeping0: intendToSleep();
            Sleeping1: msgToParent := "Waiting";
            Sleeping2: call wait();
        end either;
end process;

end algorithm; *)

Error trace (keep an eye on consumerState and localConsumerState):

[error trace screenshots]


mratsim commented on May 20, 2024

And that was not it, or it was hiding another issue.
Unfortunately the bug disappears when full debugging is enabled, but with just debugTermination, across 8 occurrences the stack trace is always the same:

Worker 1 parks
Worker 0 enters the barrier
Worker 0 receives the message
Deadlock
[stack trace screenshot]

Note that, rarely, this does not lead to a deadlock. This, combined with the formal verification in #54, suggests that the deadlock is at the system level, probably an ordering issue between sending a steal request and receiving the waiting message.

For example, between trySteal and declineAll (declineAll processes the children's wait status):

weave/weave/runtime.nim

Lines 121 to 129 in a064692

trySteal(isOutOfTasks = true)
ascertain: myThefts().outstanding > 0

var task: Task
profile(idle):
  while not recv(task, isOutOfTasks = true):
    ascertain: myWorker().deque.isEmpty()
    ascertain: myThefts().outstanding > 0
    declineAll()


mratsim commented on May 20, 2024

Okay, so first of all: it's not a deadlock, it's a livelock.

To begin the investigation, here is the stack trace sampling after a minute of livelock:

[VTune stack trace sampling screenshot]

Note: these are assumptions to get started with and to test various hypotheses:

  1. Before the main thread enters the barrier and dispatches tasks, all threads are starved for tasks and so immediately send a "WAITING" request to their parent.

  2. The main thread enters the barrier and tries to pop tasks.

    weave/weave/runtime.nim

    Lines 89 to 113 in a064692

    proc sync*(_: type Weave) {.gcsafe.} =
      ## Global barrier for the Picasso runtime
      ## This is only valid in the root task
      Worker: return

      debugTermination:
        log(">>> Worker %2d enters barrier <<<\n", myID())

      preCondition: myTask().isRootTask()

      block EmptyLocalQueue:
        ## Empty all the tasks and before leaving the barrier
        while true:
          debug: log("Worker %2d: globalsync 1 - task from local deque\n", myID())
          while (let task = nextTask(childTask = false); not task.isNil):
            # TODO: duplicate schedulingLoop
            profile(run_task):
              runTask(task)
            profile(enq_deq_task):
              # The memory is reused but not zero-ed
              localCtx.taskCache.add(task)

          if workforce() == 1:
            localCtx.runtimeIsQuiescent = true
            break EmptyLocalQueue

  3. nextTask() is not a simple proc, however; it also contains a while loop:

    weave/weave/scheduler.nim

    Lines 133 to 156 in a064692

    proc nextTask*(childTask: bool): Task {.inline.} =
      profile(enq_deq_task):
        if childTask:
          result = myWorker().deque.popFirstIfChild(myTask())
        else:
          result = myWorker().deque.popFirst()

      # TODO: steal early

      shareWork()

      # Check if someone requested to steal from us
      var req: StealRequest
      while recv(req):
        # If we just popped a loop task, we may split it here
        # It makes dispatching tasks simpler
        if myWorker().deque.isEmpty() and result.isSplittable():
          if req.thiefID != myID():
            splitAndSend(result, req)
          else:
            forget(req)
        else:
          dispatchTasks(req)
    dispatchTasks() is called repeatedly because there are no splittable tasks in nqueens, so we can't conclude at this point whether the deque is empty or not.
    Another important point is that shareWork, which normally wakes up and calls the child workers, should be triggered there.

    weave/weave/victims.nim

    Lines 333 to 355 in a064692

    proc shareWork*() {.inline.} =
      ## Distribute work to all the idle children workers
      ## if we can
      while not myWorker().workSharingRequests.isEmpty():
        # Only dequeue if we find work
        let req = myWorker().workSharingRequests.peek()
        ascertain: req.thiefID == myWorker().left or req.thiefID == myWorker.right
        if distributeWork(req): # Shouldn't this need a copy?
          if req.thiefID == myWorker().left:
            ascertain: myWorker().leftIsWaiting
            myWorker().leftIsWaiting = false
          else:
            ascertain: myWorker().rightIsWaiting
            myWorker().rightIsWaiting = false
          Backoff:
            wakeup(req.thiefID)

          # Now we can dequeue as we found work
          # We cannot access the steal request anymore or
          # we would have a race with the child worker recycling it.
          discard myWorker().workSharingRequests.dequeue()
        else:
          break

    We need to know more about that part. Since shareWork doesn't appear at all in the stack traces, either it has been inlined, or the call is cheap because the workSharingRequests queue is empty.
    This is the only place in that loop that can wake up the child worker. However, that still doesn't explain why the main thread doesn't work on tasks or detect termination.

  4. It tries to dispatchTasks to the thief

    weave/weave/victims.nim

    Lines 224 to 246 in a064692

    proc dispatchTasks*(req: sink StealRequest) {.gcsafe.}=
      ## Send tasks in return of a steal request
      ## or decline and relay the steal request to another thread
      if req.thiefID == myID():
        receivedOwn(req)
        return

      profile(enq_deq_task):
        let (task, loot) = req.takeTasks()

      if not task.isNil:
        profile(send_recv_task):
          task.batch = loot
          # TODO LastVictim
          LazyFV:
            batchConvertLazyFlowvar(task)
          debug: log("Worker %2d: preparing %d task(s) for worker %2d with function address 0x%.08x\n",
            myID(), loot, req.thiefID, task.fn)
          req.send(task, loot)
      else:
        ascertain: myWorker().deque.isEmpty()
        decline(req)

    We know from the last branch that takeTasks failed to find a task. How come?

    runTask was run and no tasks are left. [screenshot]

  5. Since it ran out of tasks, dispatchTasks calls decline and declineOwn

    weave/weave/victims.nim

    Lines 166 to 179 in a064692

    proc decline*(req: sink StealRequest) =
      ## Pass steal request to another worker
      ## or the manager if it's our own that came back
      preCondition: req.retry <= WV_MaxRetriesPerSteal

      req.retry += 1
      incCounter(stealDeclined)
      profile(send_recv_req):
        if req.thiefID == myID():
          req.declineOwn()
        else: # Not our own request
          req.victims.excl(myID())
          req.findVictimAndRelaySteal()

    weave/weave/victims.nim

    Lines 125 to 164 in a064692

    proc declineOwn(req: sink StealRequest) =
      ## Decline our own steal request
      # The assumption that no one had jobs to steal
      # does not hold when we process our child requests
      # we might have taken one we sent to our children
      # TODO: how to prevent cascading sleep
      # preCondition: req.victims.isEmpty()
      # debug:
      #   log("Worker %2d: received own request (req.state: %s, left (%d): %s, right (%d): %s)\n",
      #     myID(), $req.state,
      #     myWorker().left,
      #     if myWorker().leftIsWaiting: "waiting" else: "not waiting",
      #     myWorker().right,
      #     if myWorker().rightIsWaiting: "waiting" else: "not waiting")
      if req.state == Stealing and myWorker().leftIsWaiting and myWorker().rightIsWaiting:
        when WV_MaxConcurrentStealPerWorker == 1:
          # When there is only one concurrent steal request allowed, it's always the last.
          lastStealAttemptFailure(req)
        else:
          # Is this the last theft attempt allowed per steal request?
          # - if so: lastStealAttemptFailure special case (termination if lead thread, sleep if worker)
          # - if not: drop it and wait until we receive work or all out steal requests failed.
          if myThefts().outstanding == WV_MaxConcurrentStealPerWorker and
              myTodoBoxes().len == WV_MaxConcurrentStealPerWorker - 1:
            # "WV_MaxConcurrentStealPerWorker - 1" steal requests have been dropped
            # as evidenced by the corresponding channel "address boxes" being recycled
            ascertain: myThefts().dropped == WV_MaxConcurrentStealPerWorker - 1
            lastStealAttemptFailure(req)
          else:
            drop(req)
      else:
        # Our own request but we still have work, so we reset it and recirculate.
        ascertain: req.victims.capacity.int32 == workforce()
        req.retry = 0
        req.victims.refill()
        req.victims.excl(myID())
        req.findVictimAndRelaySteal()
    From the if condition and the fact that termination is not detected, we know that either the request is not in the Stealing state, or the master thread hasn't flagged the worker as waiting yet. We are in the first case. This is shown by the call to recvProxy, which receives steal requests on behalf of the idling worker:
    [screenshots of the recvProxy call stack]

  6. Then we find a victim and relay the steal request. Looking into the underlying targeting, we know that this steal request will always target worker 1, as we take the myID() branch (a small sketch of why appears right after this list):

    weave/weave/targets.nim

    Lines 44 to 69 in a064692

    proc findVictim*(req: var StealRequest): WorkerID =
      preCondition:
        myID() notin req.victims

      result = Not_a_worker

      if req.thiefID == myID():
        # Steal request initiated by the current worker.
        # Send it to a random one
        ascertain: req.retry == 0
        result = myThefts().rng.uniform(workforce())
        while result == myID():
          result = myThefts().rng.uniform(workforce())
      elif req.retry == WV_MaxRetriesPerSteal:
        # Return steal request to thief
        # logVictims(req.victims, req.thiefID)
        result = req.thiefID
      else:
        # Forward steal request to a different worker if possible
        # Also pass along information on the workers we manage
        if myWorker().leftIsWaiting and myWorker().rightIsWaiting:
          markIdle(req.victims, myID())
        elif myWorker().leftIsWaiting:
          markIdle(req.victims, myWorker().left)
        elif myWorker().rightIsWaiting:
          markIdle(req.victims, myWorker().right)
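
As a side note on the targeting claim in point 6: here is a tiny sketch (a hypothetical helper, not Weave's findVictim) of why the lead thread's own recirculated request always lands on worker 1 when the workforce is 2, as in this run. The thiefID == myID() branch rerolls a uniform pick until it differs from myID(), and worker 1 is the only other ID.

import std/random

proc pickVictimSketch(myID, workforce: int, rng: var Rand): int =
  ## Mirrors only the `req.thiefID == myID()` branch of findVictim.
  result = rng.rand(workforce - 1)    # uniform pick in 0 ..< workforce
  while result == myID:
    result = rng.rand(workforce - 1)  # reroll until it is not ourselves

var rng = initRand(2024)
for _ in 0 ..< 10:
  doAssert pickVictimSketch(myID = 0, workforce = 2, rng = rng) == 1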

Now we need to know when recvProxy is called within this sync/barrier loop, and also where the main thread has the opportunity to set runtimeIsQuiescent or change the request to the Stealing status. And it's in nextTask(), in the while recv(req) loop:

weave/weave/scheduler.nim

Lines 133 to 156 in a064692

proc nextTask*(childTask: bool): Task {.inline.} =
  profile(enq_deq_task):
    if childTask:
      result = myWorker().deque.popFirstIfChild(myTask())
    else:
      result = myWorker().deque.popFirst()

  # TODO: steal early

  shareWork()

  # Check if someone requested to steal from us
  var req: StealRequest
  while recv(req):
    # If we just popped a loop task, we may split it here
    # It makes dispatching tasks simpler
    if myWorker().deque.isEmpty() and result.isSplittable():
      if req.thiefID != myID():
        splitAndSend(result, req)
      else:
        forget(req)
    else:
      dispatchTasks(req)

In summary, here is the livelock as I understand it:

  1. Worker 0 sends a steal request with state Working. How? Why? That can only happen in forceFuture:

    weave/weave/scheduler.nim

    Lines 301 to 317 in a064692

    while not isFutReady():
      trySteal(isOutOfTasks = false)
      var task: Task
      profile(idle):
        while not recv(task, isOutOfTasks = false):
          # We might inadvertently remove our own steal request in
          # dispatchTasks so resteal
          profile_stop(idle)
          trySteal(isOutOfTasks = false)
          # If someone wants our non-child tasks, let's oblige
          var req: StealRequest
          while recv(req):
            dispatchTasks(req)
          profile_start(idle)
          if isFutReady():
            profile_stop(idle)
            return
  2. In the meantime, Worker 1 resigned and asked to sleep.
  3. Worker 0 finishes all its tasks and proceeds to the sync(Weave) barrier.
  4. It tries to pop the first task via nextTask and will be livelocked happily ever after:

    weave/weave/runtime.nim

    Lines 89 to 103 in a064692

    proc sync*(_: type Weave) {.gcsafe.} =
      ## Global barrier for the Picasso runtime
      ## This is only valid in the root task
      Worker: return

      debugTermination:
        log(">>> Worker %2d enters barrier <<<\n", myID())

      preCondition: myTask().isRootTask()

      block EmptyLocalQueue:
        ## Empty all the tasks and before leaving the barrier
        while true:
          debug: log("Worker %2d: globalsync 1 - task from local deque\n", myID())
          while (let task = nextTask(childTask = false); not task.isNil):
  5. In nextTask:

    weave/weave/scheduler.nim

    Lines 133 to 156 in a064692

    proc nextTask*(childTask: bool): Task {.inline.} =
      profile(enq_deq_task):
        if childTask:
          result = myWorker().deque.popFirstIfChild(myTask())
        else:
          result = myWorker().deque.popFirst()

      # TODO: steal early

      shareWork()

      # Check if someone requested to steal from us
      var req: StealRequest
      while recv(req):
        # If we just popped a loop task, we may split it here
        # It makes dispatching tasks simpler
        if myWorker().deque.isEmpty() and result.isSplittable():
          if req.thiefID != myID():
            splitAndSend(result, req)
          else:
            forget(req)
        else:
          dispatchTasks(req)

    The first StealRequest received should be the "Waiting" request of the child, which is consistent with the interleaving: Worker 1 sends the WAITING request -> Worker 0 enters the barrier -> Worker 0 receives the WAITING request. The master thread flags its child as waiting.
  6. The master thread now rechecks steal requests, but it can now also check steal requests in its child's stead since it knows the child is sleeping, so it retrieves its own request from its child's channel.
  7. Remember, the request was sent in forceFuture with the state flagged as Working, so in declineOwn it's recirculated in the same state:

    weave/weave/victims.nim

    Lines 125 to 164 in a064692

    proc declineOwn(req: sink StealRequest) =
      ## Decline our own steal request
      # The assumption that no one had jobs to steal
      # does not hold when we process our child requests
      # we might have taken one we sent to our children
      # TODO: how to prevent cascading sleep
      # preCondition: req.victims.isEmpty()
      # debug:
      #   log("Worker %2d: received own request (req.state: %s, left (%d): %s, right (%d): %s)\n",
      #     myID(), $req.state,
      #     myWorker().left,
      #     if myWorker().leftIsWaiting: "waiting" else: "not waiting",
      #     myWorker().right,
      #     if myWorker().rightIsWaiting: "waiting" else: "not waiting")
      if req.state == Stealing and myWorker().leftIsWaiting and myWorker().rightIsWaiting:
        when WV_MaxConcurrentStealPerWorker == 1:
          # When there is only one concurrent steal request allowed, it's always the last.
          lastStealAttemptFailure(req)
        else:
          # Is this the last theft attempt allowed per steal request?
          # - if so: lastStealAttemptFailure special case (termination if lead thread, sleep if worker)
          # - if not: drop it and wait until we receive work or all out steal requests failed.
          if myThefts().outstanding == WV_MaxConcurrentStealPerWorker and
              myTodoBoxes().len == WV_MaxConcurrentStealPerWorker - 1:
            # "WV_MaxConcurrentStealPerWorker - 1" steal requests have been dropped
            # as evidenced by the corresponding channel "address boxes" being recycled
            ascertain: myThefts().dropped == WV_MaxConcurrentStealPerWorker - 1
            lastStealAttemptFailure(req)
          else:
            drop(req)
      else:
        # Our own request but we still have work, so we reset it and recirculate.
        ascertain: req.victims.capacity.int32 == workforce()
        req.retry = 0
        req.victims.refill()
        req.victims.excl(myID())
        req.findVictimAndRelaySteal()
  8. A job well done, so now let's see if we still have other steal requests to handle. Let's look at our child then: oh, there is one. Oh, it's our own? And still in the Working state, not Stealing? Let's recirculate it then, over and over ...
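
To make the cycle in points 6 to 8 concrete, here is a minimal sketch of the recirculation loop. The types and the declineOwnSketch helper are hypothetical simplifications, not Weave's API; the point is only that nothing on this code path ever flips the request from Working to Stealing, so the termination branch is never taken.

type
  StealState = enum Working, Stealing
  StealRequest = object
    thiefID: int
    state: StealState

proc declineOwnSketch(req: StealRequest, leftIsWaiting, rightIsWaiting: bool): bool =
  ## Returns true when the termination branch (lastStealAttemptFailure) would
  ## be taken, false when the request would merely be reset and recirculated.
  result = req.state == Stealing and leftIsWaiting and rightIsWaiting

var req = StealRequest(thiefID: 0, state: Working)  # sent from forceFuture
var rounds = 0
while not declineOwnSketch(req, leftIsWaiting = true, rightIsWaiting = true):
  # Nothing here ever sets req.state = Stealing (that only happens in
  # declineAll, which this part of the barrier never reaches), so the request
  # keeps bouncing through the sleeping child's channel: a livelock.
  inc rounds
  if rounds == 5:
    echo "still recirculating after ", rounds, " rounds; this would never end"
    break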

Now questions:

  • where does a worker change the state of its own requests from Working to Stealing?
    -> In declineAll

    weave/weave/scheduler.nim

    Lines 158 to 166 in a064692

    proc declineAll*() =
      var req: StealRequest

      profile_stop(idle)

      if recv(req):
        if req.thiefID == myID() and req.state == Working:
          req.state = Stealing
        decline(req)
    which is called in a later part of the sync barrier.
  • when does the runtime become quiescent / when does the lead thread send a termination signal?
    -> only when the lead thread receives its own steal request in the Stealing state:

    weave/weave/victims.nim

    Lines 125 to 145 in a064692

    proc declineOwn(req: sink StealRequest) =
      ## Decline our own steal request
      # The assumption that no one had jobs to steal
      # does not hold when we process our child requests
      # we might have taken one we sent to our children
      # TODO: how to prevent cascading sleep
      # preCondition: req.victims.isEmpty()
      # debug:
      #   log("Worker %2d: received own request (req.state: %s, left (%d): %s, right (%d): %s)\n",
      #     myID(), $req.state,
      #     myWorker().left,
      #     if myWorker().leftIsWaiting: "waiting" else: "not waiting",
      #     myWorker().right,
      #     if myWorker().rightIsWaiting: "waiting" else: "not waiting")
      if req.state == Stealing and myWorker().leftIsWaiting and myWorker().rightIsWaiting:
        when WV_MaxConcurrentStealPerWorker == 1:
          # When there is only one concurrent steal request allowed, it's always the last.
          lastStealAttemptFailure(req)
  • are we doomed?
    --> yes

Disclaimer: this analysis is not backed by anything but my VTune stack traces, the code, and the unreliable screenshot from the previous post (unreliable because we can't rely on stdout prints from interleaved thread execution). I may have missed something. In short, I have no logs or timestamped call sequences.


mratsim commented on May 20, 2024

Fixed in #55. Now there is the glibc issue from #56 left ...

