Shadow space example

The shadow space must be provided immediately before the call. Think of the shadow space as a relic of the old stdcall/cdecl conventions: for WriteFile you needed five pushes. The shadow space stands in for the last four pushes (the first four arguments). Now you need four registers, the shadow space (just the space; its contents don't matter), and one value on the stack after the shadow space (which is in fact the first push). Currently, the return address to the caller (start) sits in the space that WriteFile will use as its shadow space, so WriteFile may overwrite it and the program crashes.

You can create a new shadow space for the WinAPI functions (GetStdHandle and WriteConsoleA) inside the function write:

;-----------------------------------------------------------------------
; write -- print a buffer to the console with WriteConsoleA
; ABI:   Microsoft x64
; In:    rcx = address of the characters
;        rdx = number of characters to write
; Out:   rax = BOOL returned by WriteConsoleA
; Clobb: rcx, rdx, r8, r9, plus anything the API calls clobber
; Stack: stores rcx/rdx in the 32-byte shadow space that the caller
;        must have reserved above the return address
;-----------------------------------------------------------------------
write:
    push rbp
    mov  rbp, rsp
    sub  rsp, (32 + 16)     ; 32 bytes shadow space for the callees
                            ; + 8 bytes for the 5th argument of WriteConsoleA
                            ; + 8 bytes padding so the total stays a multiple of 16
                            ; (the push above already re-aligned rsp after the call into write)

    ; [rbp] = saved rbp, [rbp+8] = return address,
    ; [rbp+16].. = shadow space provided by the caller (`start`)
    mov  [rbp+16], rcx      ; save incoming buffer address
    mov  [rbp+24], rdx      ; save incoming character count

    mov  rcx, -11           ; nStdHandle = STD_OUTPUT_HANDLE
    call GetStdHandle

    mov  rcx, rax           ; hConsoleOutput
    mov  rdx, [rbp+16]      ; lpBuffer (reloaded: rcx/rdx are volatile across calls)
    mov  r8,  [rbp+24]      ; nNumberOfCharsToWrite
    lea  r9,  [rel empty]   ; lpNumberOfCharsWritten (RIP-relative instead of 64-bit absolute)
    mov  qword [rsp+32], 0  ; lpReserved - 5th argument sits directly above the shadow space
    call WriteConsoleA

    leave                   ; rsp = rbp, then pop rbp
    ret

For completeness, I am posting this here, as it is what I ended up with. It works perfectly and, as far as I can see, barring the UNWIND_INFO/exception-handling requirements of x64 ASM on Windows, it is pretty much spot on. The comments are hopefully accurate too.

EDIT:

This is now updated after Raymond's comment below. I removed the preservation of rbp because it wasn't required and it threw my stack alignment out further than I intended.

; Windows APIs

; GetStdHandle
; ------------
; HANDLE WINAPI GetStdHandle(
;     _In_ DWORD nStdHandle
; ); 
extern GetStdHandle

; WriteFile
; ------------
; BOOL WINAPI WriteFile(
;   _In_        HANDLE       hFile,
;   _In_        LPCVOID      lpBuffer,
;   _In_        DWORD        nNumberOfBytesToWrite,
;   _Out_opt_   LPDWORD      lpNumberOfBytesWritten,
;   _Inout_opt_ LPOVERLAPPED lpOverlapped
; );
extern WriteFile

; ExitProcess
; -----------
; VOID WINAPI ExitProcess(
;     _In_ UINT uExitCode
; );
extern ExitProcess

global start

section .data

    STD_OUTPUT_HANDLE   equ -11     ; nStdHandle value passed to GetStdHandle
    NULL                equ 0

    ; Each length is taken BEFORE the terminating 0, so the NUL byte is not
    ; counted and therefore not written to the output. The terminator is
    ; still stored right after the text.
    msg1                 db "Hello "
    msg1.len             equ $-msg1
                         db 0

    msg2                 db "World!", 10
    msg2.len             equ $-msg2
                         db 0

section .bss

; One DWORD whose address is passed as lpNumberOfBytesWritten to WriteFile.
; Nothing in this listing reads the value back.
empty               resd 1

section .text

;-----------------------------------------------------------------------
; start -- program entry point (exported with `global start`)
; ABI:   Microsoft x64
; Prints msg1 and then msg2 through `write`, then ends the process with
; exit code 0 via ExitProcess. Does not return.
;-----------------------------------------------------------------------
start:

    ; 0x28 = 32 bytes of shadow space for the callees + 8 bytes of padding.
    ; With the 8-byte return address already on the stack this makes
    ; 48 = 16*3, so rsp is 16-byte aligned at every call below.
    sub     rsp, 0x28

    lea     rcx, [rel msg1]         ; buffer (RIP-relative instead of 64-bit absolute)
    mov     rdx, msg1.len           ; byte count
    call    write

    lea     rcx, [rel msg2]
    mov     rdx, msg2.len
    call    write

    xor     ecx, ecx                ; uExitCode = 0
    call    ExitProcess

    ; ExitProcess does not return, so no stack clean-up or ret is needed here.
    ; Trap instead of falling through into `write` if it ever did.
    int3

;-----------------------------------------------------------------------
; write -- write a buffer to standard output with WriteFile
; ABI:   Microsoft x64
; In:    rcx = address of the bytes
;        rdx = number of bytes to write
; Out:   rax = BOOL returned by WriteFile
; Clobb: rcx, rdx, r8, r9, plus anything the API calls clobber
; Stack: stores rcx/rdx in the 32-byte shadow space that the caller
;        must have reserved above the return address
;-----------------------------------------------------------------------
write:

    ; Allocate 40 bytes of stack (with the return address that makes 48, a
    ; multiple of 16): 32 bytes of shadow space for the WinAPI calls plus
    ; 8 bytes for the fifth argument of WriteFile.
    sub     rsp, 0x28

    ; The caller's shadow space starts 0x30 bytes up:
    ; 0x28 allocated above + 8 for the return address.
    mov     [rsp+0x30], rcx         ; save argument 1 (buffer)
    mov     [rsp+0x38], rdx         ; save argument 2 (byte count)

    mov     rcx, STD_OUTPUT_HANDLE  ; nStdHandle
    call    GetStdHandle

    mov     rcx, rax                ; hFile
    mov     rdx, [rsp+0x30]         ; lpBuffer (reloaded: rcx/rdx are volatile across calls)
    mov     r8,  [rsp+0x38]         ; nNumberOfBytesToWrite
    lea     r9,  [rel empty]        ; lpNumberOfBytesWritten (RIP-relative instead of 64-bit absolute)

    ; The 5th argument goes directly above the 32 bytes of shadow space.
    mov     qword [rsp+0x20], 0     ; lpOverlapped = NULL
    call    WriteFile

    add     rsp, 0x28               ; release shadow space + argument slot

    ret

Which results in...:

Finally working!