7. Nested “try” blocks 3. Awaiter Pattern struct FooAsync_StateMachine : IAsyncStateMachine { //(1,2,3) private int _state; public AsyncTaskMethodBuilder _builder; private void MoveNext() { try { switch (_state) { TRANSFORMED_BODY } catch (Exception ex) { _builder.SetException(ex); return; } _builder.SetResult(); } private void SetStateMachine(IAsyncStateMachine sm) { _builder.SetStateMachine(sm); } switch (_state) { case 0: goto AFTERAWAIT0; case -1: // fallthrough } Console.WriteLine("a"); TaskAwaiter tmp = Task.Delay(100).GetAwaiter(); //(4) if (!tmp.IsCompleted) { //(5) _state = 0; _awaiter = tmp; _builder.AwaitOnCompleted(ref tmp, ref this); //(8) return; AFTERAWAIT0: //(9) tmp = (TaskAwaiter)_awaiter; _awaiter = default(TaskAwaiter); } tmp.GetResult(); //(6) tmp = default(TaskAwaiter); //(7) Console.WriteLine("b"); class Task { TaskAwaiter GetAwaiter() { return new TaskAwaiter (this); } struct TaskAwaiter : ICriticalNotifyCompletion { private var m_delegates = new ConcurrentQueue (); public bool IsCompleted { get; } public T GetResults() { return...; } public void OnCompleted(Action cont) { //(10) var ec = ExecutionContext.Capture(); var sc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { ExecutionContext.Run(ec, delegate { sc.Post(_ => cont(), null); }, null); }); } [SecurityCritical] //(11) public void UnsafeOnCompleted(Action cont) { var sc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { sc.Post(_ => cont(), null); }); } struct TaskAwaiter : INotifyCompletion { private var m_delegates = new ConcurrentQueue (); public bool IsCompleted { get; } public T GetResults() { return...; } public void OnCompleted(Action cont) { //(12) var sc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { sc.Post(_ => cont(), null); }); } 1. 
State Machine public struct System.Runtime.CompilerServices.AsyncTaskMethodBuilder { internal IAsyncStateMachine m_sm; public void Start (ref TSM sm) where TSM : IAsyncStateMachine { //(2) Thread t = Thread.CurrentThread; ExecutionContextSwitcher ecs = default(ExecutionContextSwitcher); RuntimeHelpers.PrepareConstrainedRegions(); try { ExecutionContext.EstablishCopyOnWriteScope(t, false, ref ecs); sm.MoveNext(); } finally { ecs.Undo(currentThread); } internal struct VoidTaskResult { } class AsyncMethodBuilder : public void AwaitOnCompleted (ref TA a, ref TSM sm) where TA : INotifyCompletion where TSM : IAsyncStateMachine { if (m_sm == null) { var ignored = this.Task; // allocate a reference m_sm = (IAsyncStateMachine)sm; // box on first await m_sm.SetStateMachine(m_sm); // tie up lose ends } Action cont; var ctx = ExecutionContext.FastCapture(); if (ctx == ExecutionContext.PreAllocatedDefault) { cont = m_defaultContextAction; if (cont == null) cont = new Action(new MoveNextRunner(ctx, this.m_sm).Run); } else { cont = new MoveNextRunner(ctx, m_sm).Run; } a.OnCompleted(cont); } public void AwaitUnsafeOnCompleted (ref TA a, ref TSM sm) //(11) where TA : ICriticalNotifyCompletion where TSM : IAsyncStateMachine {... 
exactly as above a.UnsafeOnCompleted(cont); } static public void MoveNext() { bool _fin = true; switch (_state) { case 0: goto AFTERAWAIT0; case 1,2: goto STAGEPOST; case 3: goto AFTERAWAIT3; case -1: /*fallthrough*/ } if (!t0.IsCompleted) {_state=0; _fin=false; return; AFTERAWAIT0: _state=-1;} STAGEPOST: try { switch (_state) { case 1: goto AFTERAWAIT1; case 2: goto AFTERAWAIT2; case -1: /*fallthrough*/ } if (!t1.IsCompleted) {_state=1; _fin=false; return; AFTERAWAIT1: _state=-1;} if (!t2.IsCompleted) {_state=2; _fin=false; return; AFTERAWAIT2; _state=-1;} } finally { if (_fin) { Console.Write("f"); } } if (!t3.IsCompleted) {_state=3; _fin=false; return; AFTERAWAIT3; _state=-1;} } class FooAsync_StateMachine: private int _x, _z; MoveNext: this._x = 10; await t1; this._z = 10; { int y = 15; Console.Write(this._x + y + this._x); } 6. Lifted local variables 2. struct AsyncTaskMethodBuilder 4. AwaitOnCompleted()5. struct TaskAwaiter [[PUSH a]] ; (int[]) [[PUSH i]] ; (int[], int) DUP0:DUP1 ; (int[], int, int[], int) LDELEM ; (int[], int, int) POP ; (int[], int) TUPLE.NEW ; (Tuple ) STFLD this._stack 8. Stack spilling async Task FooAsync() { BODY } Console.WriteLine("a"); await Task.Delay(100); Console.WriteLine("b"); int x = 10; await t1; int z = 10; { int y = 15; Console.Write(x + y + z); } await t0; try { await t1; await t2; } finally { Console.Write("f"); } await t3; int[] a; int i; a[i].CompareTo(await t); Async Codegen Lucian Wischik, VB Language PM These slides describe the IL that’s emitted when you use the Async and Await keywords in VB/C#. Task FooAsync() { var sm = new FooAsync_StateMachine(); sm._state = -1;... copy params & this if needed into S.M. sm._builder = AsyncTaskMethodBuilder.Create(); sm._builder.Start(ref sm); return sm._builder.Task; }... RETURN ; do awaiter pattern... AFTERAWAIT0: LDFLD this._stack ; (Tuple ) MAKE_STACK ; (int[], int) CALL t.GetResult ; (int[], int, int) MAKE_LVALUES ; (&int, int) CALL CompareTo ; (bool)
(1) The compiler implicitly generates a state machine for each async method. Each state corresponds to a piece of code between await statements. The MoveNext() method will advance it to the next state. (2) The state-machine is a struct, for efficiency reasons -- so that on the “fast path”, where no awaits were actually needed, it doesn’t need to be allocated on the heap. (3) The state machine implements System.Runtime.CompilerServices.IAsyncStateMachine. This interface is part of the protocol for making it efficient and secure -- detailed later. struct FooAsync_StateMachine : IAsyncStateMachine { //(1,2,3) private int _state; public AsyncTaskMethodBuilder _builder; private void MoveNext() { try { switch (_state) { TRANSFORMED_BODY } catch (Exception ex) { _builder.SetException(ex); return; } _builder.SetResult(); } private void SetStateMachine(IAsyncStateMachine sm) { _builder.SetStateMachine(sm); } 1. State Machine async Task FooAsync() { BODY } Task FooAsync() { var sm = new FooAsync_StateMachine(); sm._state = -1;... copy params & this if needed into S.M. sm._builder = AsyncTaskMethodBuilder.Create(); sm._builder.Start(ref sm); return sm._builder.Task; } Perf tip: The async transformation will add code and will add local variables. If the JIT finds too much code or too many variables, then it degrades drastically. So, you’ll reach this JIT limit sooner with async methods. Perf tip: The async transformation will add code and will add local variables. If the JIT finds too much code or too many variables, then it degrades drastically. So, you’ll reach this JIT limit sooner with async methods.
(2) We use structs a lot… The pattern "void Start<T>(ref T s) where T:I" lets us use a struct via an interface, but without boxing or copying the struct. This comes at the cost of the JIT having to generate a new Start for each TSM. The state-machine is a struct, which contains AsyncTaskMethodBuilder which is a struct. At the first await point, the state machine will be boxed onto the heap, with the builder inside it, and the builder will get a pointer to that boxed state machine. It takes some work to maintain these circular references of structs. async void f() - uses AsyncVoidMethodBuilder async Task<T> f() - uses AsyncTaskMethodBuilder<T> async Task f() - uses AsyncTaskMethodBuilder, which wraps AsyncTaskMethodBuilder<VoidTaskResult> public struct System.Runtime.CompilerServices.AsyncTaskMethodBuilder { internal IAsyncStateMachine m_sm; public void Start (ref TSM sm) where TSM : IAsyncStateMachine { //(2) Thread t = Thread.CurrentThread; ExecutionContextSwitcher ecs = default(ExecutionContextSwitcher); RuntimeHelpers.PrepareConstrainedRegions(); try { ExecutionContext.EstablishCopyOnWriteScope(t, false, ref ecs); sm.MoveNext(); } finally { ecs.Undo(currentThread); } internal struct VoidTaskResult { } 2. struct AsyncTaskMethodBuilder
(4) The await operator is pattern-based. For “await t”, the compiler makes a call to t.GetAwaiter() to get an awaiter. For instance, you could make an extension method “MyAwaiter GetAwaiter(this int i)” to be able to await integers – in which case tmp would have type MyAwaiter rather than TaskAwaiter. WinRT uses this, so you can await an IAsyncInfo. (5) In this case we awaited Task.Delay(100), which won’t have completed yet. But imagine if the task had already completed. Then it would go straight to calling tmp.GetResult(), with no need for heap allocations. (6) The job of tmp.GetResult() is to throw any exceptions from the task (if any), and to return a value (if any). (7) We null-out the temporary variable immediately so it can be garbage-collected. 3. Awaiter Pattern switch (_state) { case 0: goto AFTERAWAIT0; case -1: // fallthrough } Console.WriteLine("a"); TaskAwaiter tmp = Task.Delay(100).GetAwaiter(); //(4) if (!tmp.IsCompleted) { //(5) _state = 0; _awaiter = tmp; _builder.AwaitOnCompleted(ref tmp, ref this); //(8) return; AFTERAWAIT0: //(9) tmp = (TaskAwaiter)_awaiter; _awaiter = default(TaskAwaiter); } tmp.GetResult(); //(6) tmp = default(TaskAwaiter); //(7) Console.WriteLine("b"); Console.WriteLine("a"); await Task.Delay(100); Console.WriteLine("b"); Perf tip: If every “await” is already completed, then it will avoid all heap allocations apart from the final resultant Task. But the overhead is still about 2x that of a non-async method. Perf tip: If every “await” is already completed, then it will avoid all heap allocations apart from the final resultant Task. But the overhead is still about 2x that of a non-async method.
(8) AwaitOnCompleted ultimately calls awaiter.OnCompleted(MoveNext)… First, if needed, it boxes the state machine on the heap, including the AsyncMethodBuilder struct. SetStateMachine gives the builder a back-pointer to the S.M. Its two arguments are both ref parameters so that we don’t need to copy them to pass them. The builder backs up the ExecutionContext before invoking Awaiter.MoveNext. And it allocates & passes a delegate (through class MoveNextRunner) which will restore the ExecutionContext. This is for security. It’s much cheaper if the ExecutionContext is never changed. class AsyncMethodBuilder : public void AwaitOnCompleted (ref TA a, ref TSM sm) where TA : INotifyCompletion where TSM : IAsyncStateMachine { if (m_sm == null) { var ignored = this.Task; // allocate a reference m_sm = (IAsyncStateMachine)sm; // box on first await m_sm.SetStateMachine(m_sm); // tie up lose ends } Action cont; var ctx = ExecutionContext.FastCapture(); if (ctx == ExecutionContext.PreAllocatedDefault) { cont = m_defaultContextAction; if (cont == null) cont = new Action(new MoveNextRunner(ctx, this.m_sm).Run); } else { cont = new MoveNextRunner(ctx, m_sm).Run; } a.OnCompleted(cont); } 4. AwaitOnCompleted() Perf tip: If you avoid modifying ExecutionContext, then it avoids further heap allocations and is faster. Perf tip: If you avoid modifying ExecutionContext, then it avoids further heap allocations and is faster.
(12) The user expects the code after "await" to resume on the same SynchronizationContext. It's the job of each awaiter to save the sync. context, and execute its continuation on that context. Thus: the AsyncMethodBuilder takes care of saving and restoring ExecutionContext (since that would be a security loophole otherwise), and the awaiter takes care of saving and restoring SynchronizationContext. class Task { TaskAwaiter GetAwaiter() { return new TaskAwaiter (this); } struct TaskAwaiter : INotifyCompletion { private var m_delegates = new ConcurrentQueue (); public bool IsCompleted { get; } public T GetResults() { return...; } public void OnCompleted(Action cont) { //(12) var sc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { sc.Post(_ => cont(), null); }); } 5. struct TaskAwaiter Perf tip: You’ll typically make Structure awaiters, to avoid unnecessary heap allocation. They should be immutable: mutations in OnCompleted will be discarded if the state-machine gets boxed! Perf tip: You’ll typically make Structure awaiters, to avoid unnecessary heap allocation. They should be immutable: mutations in OnCompleted will be discarded if the state-machine gets boxed!
(10) But if your awaiter is in a full-trust assembly which AllowsPartiallyTrustedCallers, then this would be a security hole. Therefore, your OnCompleted method has to save and restore ExecutionContext as well. (11) It would be inefficient if both AsyncMethodBuilder and your awaiter had to save+restore ExecutionContext. And so, if your awaiter implements ICriticalNotifyCompletion, then the compiler will instead emit a call to the _builder method AwaitUnsafeOnCompleted, which saves ExecutionContext as before. This will call the awaiter method UnsafeOnCompleted. You’re at liberty here to avoid restoring ExecutionContext, so long as you mark your method [SecurityCritical] – this prevents partially-trusted callers. class Task { TaskAwaiter GetAwaiter() { return new TaskAwaiter (this); } 5. struct TaskAwaiter struct TaskAwaiter : ICriticalNotifyCompletion { private var m_delegates = new ConcurrentQueue (); public bool IsCompleted { get; } public T GetResults() { return...; } public void OnCompleted(Action cont) { //(10) var ec = ExecutionContext.Capture(); var sc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { ExecutionContext.Run(ec, delegate { sc.Post(_ => cont(), null); }, null); }); } [SecurityCritical] //(11) public void UnsafeOnCompleted(Action cont) { var sc = SynchronizationContext.Current; m_delegates.Enqueue(delegate { sc.Post(_ => cont(), null); }); } class AsyncMethodBuilder : public void AwaitUnsafeOnCompleted (ref TA a, ref TSM sm) //(11) where TA : ICriticalNotifyCompletion where TSM : IAsyncStateMachine {... exactly as above a.UnsafeOnCompleted(cont); } Perf tip: If you’re full-trust and allow partially-trusted callers, then use this UnsafeOnCompleted trick. Perf tip: If you’re full-trust and allow partially-trusted callers, then use this UnsafeOnCompleted trick.
3. Awaiter Pattern - dynamic Perf tip: If you’re full-trust and allow partially-trusted callers, then use this UnsafeOnCompleted trick. Perf tip: If you’re full-trust and allow partially-trusted callers, then use this UnsafeOnCompleted trick. Console.WriteLine("a"); dynamic tmp = t.GetAwaiter(); // dynamic method-call if (!tmp.IsCompleted) { // dynamic property-get _state = 0; _awaiter = tmp; var cnc = tmp as ICriticalNotifyCompletion; // CLR cast, not dynamic if (cnc != null) { _builder.AwaitUnsafeOnCompleted(ref cnc, ref this); } else { var nc = (INotifyCompletion)tmp; // CLR cast, not dynamic _builder.AwaitOnCompleted(ref nc, ref this); } return; AFTERAWAIT0: tmp = _awaiter; _awaiter = null; } tmp.GetResult(); // dynamic method-call tmp = null; Console.WriteLine("b"); Console.WriteLine("a"); await (dynamic)t; Console.WriteLine("b"); In the case of a late-bound (dynamic) await call, it attempts to cast the awaiter first as ICriticalNotifyCompletion and then as INotifyCompletion. This is done using CLR casts rather than dynamic casts.
Ideally it’d work like this: “If a local is written before an await, and read after an await, then it must be lifted into the state-machine (either by permanently locating it in the state machine, or by putting it there just before an await and restoring it afterwards, whichever is more efficient).” In practice: If a local’s scope includes an await, then C# will permanently relocate the local into the state machine. This includes “z” in the above code. VB will relocate ALL locals into the state machine. class FooAsync_StateMachine: private int _x, _z; MoveNext: this._x = 10; await t1; this._z = 10; { int y = 15; Console.Write(this._x + y + this._x); } 6. Lifted local variables int x = 10; await t1; int z = 10; { int y = 15; Console.Write(x + y + z); } Perf tip: Everything runs much faster with local variables. Factor out your compute-bound inner loops into separate methods, away from “await”, so they can run faster. Perf tip: Everything runs much faster with local variables. Factor out your compute-bound inner loops into separate methods, away from “await”, so they can run faster.
If there are “try” blocks, then the compiler emits additional switch-blocks for each one. It’s illegal to jump straight into a TRY, so the compiler uses a “staging post” label to fall into the try. Also, VB iterators use the same MoveNext trick: when the iterator’s Dispose method is called, it sets a field _isDisposing=true in the state machine, and jumps into the MoveNext. Each switch block is followed by a test “if (isDisposing) then return false”, which will end up calling each finally block on the way out. 7. Nested “try” blocks static public void MoveNext() { bool _fin = true; switch (_state) { case 0: goto AFTERAWAIT0; case 1,2: goto STAGEPOST; case 3: goto AFTERAWAIT3; case -1: /*fallthrough*/ } if (!t0.IsCompleted) {_state=0; _fin=false; return; AFTERAWAIT0: _state=-1;} STAGEPOST: try { switch (_state) { case 1: goto AFTERAWAIT1; case 2: goto AFTERAWAIT2; case -1: /*fallthrough*/ } if (!t1.IsCompleted) {_state=1; _fin=false; return; AFTERAWAIT1: _state=-1;} if (!t2.IsCompleted) {_state=2; _fin=false; return; AFTERAWAIT2; _state=-1;} } finally { if (_fin) { Console.Write("f"); } } if (!t3.IsCompleted) {_state=3; _fin=false; return; AFTERAWAIT3; _state=-1;} } await t0; try { await t1; await t2; } finally { Console.Write("f"); } await t3;
Normally, the compiler will evaluate each sub-expression in turn (pushing it onto the stack), then it will call the desired operation. If there is an “await” which doesn’t take the fast-path, then it’ll have to save the stack into the state machine before returning. That’s because the stack has to be empty at each RETURN operation. The compiler saves the stack into a Tuple of the appropriate type, stored in a state-machine field “_stack” of type Object. But if there were any managed addresses on the stack, then this isn’t allowed. In the above code, it would normally want to evaluate a[i] as a managed address (lvalue) before calling the CompareTo method; all struct methods are similarly invoked on managed addresses in case the method mutates the struct. In such cases the compiler has to avoid pushing the managed address onto the stack in the first place. Instead it pushes the constituent parts (in this case “a” and “i”). These can be saved into the Tuple okay. Later on, after the “await” has finished and immediately prior to the call to “CompareTo”, it reconstitutes those constituent parts into the address. Note: it still had to issue a dummy LDELEM call in advance, just to shake out any ArrayIndexExceptions that might arise. Managed addresses can come only from “LOCAL”, “rvalue.FIELD”, “rvalue[rvalue]” and from ByRef parameters (disallowed in async methods). Managed addresses are only ever consumed by “lvalue.M(…)”, “lvalue=rvalue”, “lvalue+=rvalue”, “lvalue++”, and passing an lvalue ByRef. [[PUSH a]] ; (int[]) [[PUSH i]] ; (int[], int) DUP0:DUP1 ; (int[], int, int[], int) LDELEM ; (int[], int, int) POP ; (int[], int) TUPLE.NEW ; (Tuple ) STFLD this._stack 8. Stack spilling int[] a; int i; a[i].CompareTo(await t);... RETURN ; do awaiter pattern... 
AFTERAWAIT0: LDFLD this._stack ; (Tuple ) MAKE_STACK ; (int[], int) CALL t.GetResult ; (int[], int, int) MAKE_LVALUES ; (&int, int) CALL CompareTo ; (bool) Perf tip: You can typically do a better job of stack-spilling than the compiler. Avoid “await” that’s nested deep inside expressions. Use it mostly in statements where it’s the first expression to be evaluated, e.g. var x = await t; using (await t) {…} foreach (var i in await t) {…} Perf tip: You can typically do a better job of stack-spilling than the compiler. Avoid “await” that’s nested deep inside expressions. Use it mostly in statements where it’s the first expression to be evaluated, e.g. var x = await t; using (await t) {…} foreach (var i in await t) {…}