golang Duff’s devices

I’m just starting to scratch the surface of golang, and so far it has been a pleasant experience.

golang feels solid and fast.

Let me share a few golang internals with you in these notes, as I stumble upon them:

Today I’ve tripped over some golang source code that looks like a copy-paste frenzy, but is a very clever assembler trick to make the processor go as fast as possible:

The code is in runtime/asm_amd64.s and looks like this:

    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    ...

… repeated 128 (2^7) times

and later:

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    ...

… repeated, again, 2^7 = 128 times (512 LOC)

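It turns out that this is a clever trick called a “Duff’s device”: a fully unrolled loop that the caller enters part-way through, so that exactly the required number of steps run without any loop counter or branch. Wow! Today we’re learning very low-level stuff.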

If this is the level of trickery in golang, no wonder it is so fast!

I’ll paste the code at the end of this post, and you can google Duff’s device yourself, but before that, let me share a warning snippet I’ve found in the source comments: Keep your hands off vprintf! :)

In runtime/print1.go:

// Very simple printf. Only for debugging prints.

// Do not add to this without checking with Rob.

func vprintf(str string, arg unsafe.Pointer)…*

And now for something completely different: Here is the full code section with the Duff’s devices from runtime/asm_amd64.s, for your viewing pleasure:

// A Duff's device for zeroing memory.
// The compiler jumps to computed addresses within
// this routine to zero chunks of memory.  Do not
// change this code without also changing the code
// in ../../cmd/6g/ggen.c:clearfat.
//
// The body is 128 identical STOSQ instructions followed by RET.
// Each STOSQ stores the 8-byte value in AX at the address in DI
// and then advances DI by 8 (this relies on the CPU direction
// flag being clear).  Entering the routine k instructions before
// the RET therefore zeroes k*8 bytes; entering at the top zeroes
// 128*8 = 1024 bytes.  Because callers compute their entry point
// from the instruction layout, no instruction may be added,
// removed or replaced here in isolation.
//
// AX: zero
// DI: ptr to memory to be zeroed
// DI is updated as a side effect.
//
// NOSPLIT: no stack-split check is inserted.
// $0-0:    zero-byte frame, zero bytes of stack arguments; the
//          inputs arrive in the registers listed above.
TEXT runtime·duffzero(SB), NOSPLIT, $0-0
    // STOSQ 1-16 of 128
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    // STOSQ 17-32 of 128
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    // STOSQ 33-48 of 128
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    // STOSQ 49-64 of 128
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    // STOSQ 65-80 of 128
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    // STOSQ 81-96 of 128
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    // STOSQ 97-112 of 128
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    // STOSQ 113-128 of 128
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    STOSQ
    // Every entry point falls through to this single return.
    RET

// A Duff's device for copying memory.
// The compiler jumps to computed addresses within
// this routine to copy chunks of memory.  Source
// and destination must not overlap.  Do not
// change this code without also changing the code
// in ../../cmd/6g/cgen.c:sgen.
// SI: ptr to source memory
// DI: ptr to destination memory
// SI and DI are updated as a side effect.
// CX is used as the transfer register and is clobbered;
// the ADDQ instructions also overwrite the arithmetic flags.
//
// The body is 128 identical four-instruction copy units
// followed by RET.  Each unit moves one 8-byte word from
// (SI) to (DI) and advances both pointers by 8.  Entering
// the routine k units before the RET copies k*8 bytes;
// entering at the top copies 128*8 = 1024 bytes.
//
// NOSPLIT: no stack-split check is inserted.
// $0-0:    zero-byte frame, zero bytes of stack arguments; the
//          inputs arrive in the registers listed above.

// NOTE: this is equivalent to a sequence of MOVSQ but
// for some reason that is 3.5x slower than this code.
// The STOSQ above seem fine, though.
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
    // copy units 1-8 of 128
    MOVQ    (SI),CX     // CX = 8 bytes loaded from the source
    ADDQ    $8,SI       // advance source pointer
    MOVQ    CX,(DI)     // store the 8 bytes to the destination
    ADDQ    $8,DI       // advance destination pointer

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 9-16 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 17-24 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 25-32 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 33-40 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 41-48 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 49-56 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 57-64 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 65-72 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 73-80 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 81-88 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 89-96 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 97-104 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 105-112 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 113-120 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // copy units 121-128 of 128
    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    MOVQ    (SI),CX
    ADDQ    $8,SI
    MOVQ    CX,(DI)
    ADDQ    $8,DI

    // Every entry point falls through to this single return.
    RET
 
325
Kudos
 
325
Kudos

Now read this

Keep your coder’s mind at full speed: avoid mental branch mispredictions

Brains and CPUs # One can make an analogy of the human brain and a CPU, but never is the analogy more valid than in the case of a programmer reading source code. The coder’s mind # When reading source code, trying to understand what the... Continue →