golang Duff’s devices
I’m just starting to scratch the surface of golang, and so far it has been a pleasant experience.
golang feels solid and fast.
Let me share a few golang internals with you in these notes, as I stumble upon them:
Today I’ve tripped over some golang source code that looks like a copy-paste frenzy, but is a very clever assembler trick to make the processor go as fast as possible:
The code is in runtime/asm_amd64.s and looks like this:
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
...
… repeated 128 (2^7) times
and later:
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
...
… repeated, again, 2^7 = 128 times (512 LOC)
It turns out that this is a clever trick called “Duff’s device”. Wow! Today we’re learning very low-level stuff.
If this is the level of trickery in golang, no wonder it is so fast!
I’ll paste the code at the end of this post, and you can google Duff’s device yourself, but before that, let me share a warning I found in the source comments: keep your hands off vprintf! :)
In runtime/print1.go:
// Very simple printf. Only for debugging prints.
// Do not add to this without checking with Rob.
func vprintf(str string, arg unsafe.Pointer)…*
And now for something completely different: Here is the full code section with the Duff’s devices from runtime/asm_amd64.s, for your viewing pleasure:
// A Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// this routine to zero chunks of memory. Do not
// change this code without also changing the code
// in ../../cmd/6g/ggen.c:clearfat.
//
// The body is 128 identical STOSQ instructions followed by RET.
// Each STOSQ stores the 8-byte value in AX at the address in DI
// and then advances DI by 8 (this assumes the direction flag is
// clear -- TODO confirm the runtime guarantees DF=0 here).
// Entering at the top therefore zeroes 128*8 = 1024 bytes;
// entering k instructions before the RET zeroes k*8 bytes.
// Callers depend on the exact position of every instruction,
// so nothing may be inserted, removed or reordered.
//
// AX: zero
// DI: ptr to memory to be zeroed
// DI is updated as a side effect.
//
// NOSPLIT, $0-0: no stack-split check, zero-size frame and no
// arguments are declared, so presumably no code is emitted ahead
// of the first STOSQ -- confirm before relying on entry offsets.
TEXT runtime·duffzero(SB), NOSPLIT, $0-0
// 128 STOSQ remain below: entering here zeroes 1024 bytes.
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
// 112 STOSQ remain below: entering here zeroes 896 bytes.
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
// 96 STOSQ remain below: entering here zeroes 768 bytes.
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
// 80 STOSQ remain below: entering here zeroes 640 bytes.
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
// 64 STOSQ remain below: entering here zeroes 512 bytes.
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
// 48 STOSQ remain below: entering here zeroes 384 bytes.
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
// 32 STOSQ remain below: entering here zeroes 256 bytes.
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
// 16 STOSQ remain below: entering here zeroes 128 bytes.
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
STOSQ
// Every entry point falls through to this single return.
RET
// A Duff's device for copying memory.
// The compiler jumps to computed addresses within
// this routine to copy chunks of memory. Source
// and destination must not overlap. Do not
// change this code without also changing the code
// in ../../cmd/6g/cgen.c:sgen.
//
// The body is 128 identical four-instruction groups followed
// by RET. Each group loads 8 bytes from (SI) into CX, advances
// SI by 8, stores CX to (DI) and advances DI by 8. Entering at
// the top therefore copies 128*8 = 1024 bytes; entering k groups
// before the RET copies k*8 bytes. Callers depend on the exact
// position of every instruction, so nothing may be inserted,
// removed or reordered.
//
// SI: ptr to source memory
// DI: ptr to destination memory
// SI and DI are updated as a side effect.
// CX is used as the scratch register for each quadword and is
// overwritten; the ADDQ instructions also modify the flags.
// NOTE: this is equivalent to a sequence of MOVSQ but
// for some reason that is 3.5x slower than this code.
// The STOSQ above seem fine, though.
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
// 128 groups remain below: entering here copies 1024 bytes.
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
// 112 groups remain below: entering here copies 896 bytes.
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
// 96 groups remain below: entering here copies 768 bytes.
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
// 80 groups remain below: entering here copies 640 bytes.
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
// 64 groups remain below: entering here copies 512 bytes.
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
// 48 groups remain below: entering here copies 384 bytes.
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
// 32 groups remain below: entering here copies 256 bytes.
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
// 16 groups remain below: entering here copies 128 bytes.
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
MOVQ (SI),CX
ADDQ $8,SI
MOVQ CX,(DI)
ADDQ $8,DI
// Every entry point falls through to this single return.
RET