Bitfield accessors

Tue Nov 20 17:03:32 PST 2007

Jarrett Billingsley:

> Cool :)

But later I have changed it again. I have changed the first part of the Bitfield code to this, because sometimes you don't need to access the whole data:

template Bitfield(alias data, Args...) {
    static assert(!(Args.length & 1), "Bitfield arguments must be an even number");
    static assert(data.sizeof*8 >= _SumLens!(Args),
                  "The sum of bit fields is bigger than data.sizeof*8");
    const char[] Bitfield = _BitfieldShim!((typeof(data)).stringof, data, Args).Ret;
}

private template _SumLens(Args...) {
    static if(Args.length == 0)
        const uint _SumLens = 0;
    else {
        static assert(Args[1] > 0, "Bitfield '" ~ Args[0] ~ "' is <= 0");
        const uint _SumLens = Args[1] + _SumLens!(Args[2 .. $]);
    }
}

> If you're using it in a struct, I'd be very surprised if the call weren't 
> optimized out, in which case I'd expect the performance to be about the same 
> as C bitfields.

The calls become inlined, but the results may surprise you anyway :-)
For the speed tests I have used this simple D code on a very old PC:

import std.stdio;

// Bitfield code here...

struct INTORFLOAT {
    uint i;
    mixin(Bitfield!(i, "sign", 1, "biasedexponent", 8, "significand", 23));
}

void main() {
    ulong tot;
    INTORFLOAT f;
    const N = 25;

    uint v[N] = [1628676761, 1620574103, 1237153253, 1098880307, 87513741,
                 13181925, 14686126, 7429435, 16286706, 6474381,
                 4879794, 7734725, 3745958, 13353858, 4236193,
                 7587, 4309, 28846, 7313, 14516,
                 126, 143, 171, 221, 156];

    for(int j = 0; j < 10_000_000; j++)
        for(int i = 0; i < N; i++) {
            f.i = v[i];
            f.biasedexponent = f.biasedexponent / 2;
            tot += f.biasedexponent + f.sign;
            //printf("%d %d\n", f.i, tot);
        }

   printf("%d\n", tot);
}

The C code:

#include <stdio.h>

#define N 25

typedef union {
    unsigned int i;
    struct {
        unsigned int sign: 1;
        unsigned int biasedexponent: 8;
        unsigned int significand: 23;
    } bits;
} INTORFLOAT;

int main(void) {
    int i, j;
    unsigned long long tot = 0;
    INTORFLOAT f;

    unsigned int v[N] = {1628676761, 1620574103, 1237153253, 1098880307, 87513741,
                         13181925, 14686126, 7429435, 16286706, 6474381,
                         4879794, 7734725, 3745958, 13353858, 4236193,
                         7587, 4309, 28846, 7313, 14516,
                         126, 143, 171, 221, 156};

    for(j = 0; j < 10000000; j++)
        for(i = 0; i < N; i++) {
            f.i = v[i];
            f.bits.biasedexponent = f.bits.biasedexponent / 2; // line1
            tot += f.bits.biasedexponent + f.bits.sign; // line2
            //printf("%d %d\n", f.i, tot);
        }

   printf("%d\n", tot);
}

I have compiled them with:
DMD: v1.023
dmd -O -inline -release bitfields1.d

gcc: V. 3.4.2 (mingw-special)
gcc -O3 -s bitfields1.c -o bitfields1

Bitfield produces the following getters and setters:
public uint sign() { return (i >> 0x0) & 0x1; }
public void sign(uint val) { i = (i & 0xfffffffffffffffe) | ((val & 0x1) << 0x0); }
public uint biasedexponent() { return (i >> 0x1) & 0xff; }
public void biasedexponent(uint val) { i = (i & 0xfffffffffffffe01) | ((val & 0xff) << 0x1); }
public uint significand() { return (i >> 0x9) & 0x7fffff; }
public void significand(uint val) { i = (i & 0x1ff) | ((val & 0x7fffff) << 0x9); }

The C code runs about 2.5 times faster than the D code (on a faster CPU the difference is probably smaller). (The numerical output of the code is fortunately the same!). I have tested that the methods (member functions) actually become inlined.
Removing line1 and line2 from both the C and D code the running time becomes (much smaller and) equal between D and C, so I presume the D slowdown is just in the reading/writing of the bit fields.

The Bitfield code can be improved (but such improvement of the templates has resulted hairy, because the code has to adapt to data of different sizeof) to produce a method like this:

public void biasedexponent(uint val) { i = (i & 0xfffffe01) | ((val & 0xff) << 0x1); }

With that the running time of the D code becomes just about 2 times the C code.

Just for reference the ASM code of the following C code:

#include <stdio.h>

#define N 25

typedef union {
    unsigned int i;
    struct {
        unsigned int sign: 1;
        unsigned int biasedexponent: 8;
        unsigned int significand: 23;
    } bits;
} INTORFLOAT;

int main(void) {
    int i;
    unsigned long tot = 0;
    INTORFLOAT f;

    unsigned int v[N] = {1628676761, 1620574103, 1237153253, 1098880307, 87513741,
                         13181925, 14686126, 7429435, 16286706, 6474381,
                         4879794, 7734725, 3745958, 13353858, 4236193,
                         7587, 4309, 28846, 7313, 14516,
                         126, 143, 171, 221, 156};

    for(i = 0; i < N; i++) {
        f.i = v[i];
        f.bits.biasedexponent = f.bits.biasedexponent / 2; // line1
        tot += f.bits.biasedexponent + f.bits.sign; // line2
    }

   printf("%d\n", tot);
}

Produced by:
gcc -O3 -S bitfields2.c -o bitfields2.asm

Is:

	.file	"bitfields2.c"
	.def	___main;	.scl	2;	.type	32;	.endef
	.data
	.align 4
LC0:
	.long	1628676761
	.long	1620574103
	.long	1237153253
	.long	1098880307
	.long	87513741
	.long	13181925
	.long	14686126
	.long	7429435
	.long	16286706
	.long	6474381
	.long	4879794
	.long	7734725
	.long	3745958
	.long	13353858
	.long	4236193
	.long	7587
	.long	4309
	.long	28846
	.long	7313
	.long	14516
	.long	126
	.long	143
	.long	171
	.long	221
	.long	156
	.section .rdata,"dr"
LC1:
	.ascii "%d\12\0"
	.text
	.p2align 4,,15
.globl _main
	.def	_main;	.scl	2;	.type	32;	.endef
_main:
	pushl	%ebp
	movl	$16, %eax
	movl	%esp, %ebp
	pushl	%ebx
	subl	$132, %esp
	andl	$-16, %esp
	call	__alloca
	call	___main
	movl	$100, %ecx
	leal	-120(%ebp), %eax
	movl	$LC0, %edx
	movl	%ecx, 8(%esp)
	xorl	%ebx, %ebx
	movl	%edx, 4(%esp)
	movl	%eax, (%esp)
	call	_memcpy
	xorl	%ecx, %ecx
	.p2align 4,,15
L5:
	movl	-120(%ebp,%ecx,4), %edx
	incl	%ecx
	movl	%edx, %eax
	shrl	$2, %eax
	andl	$1, %edx
	andl	$127, %eax
	addl	%edx, %eax
	addl	%eax, %ebx
	cmpl	$24, %ecx
	jle	L5
	movl	%ebx, 4(%esp)
	movl	$LC1, (%esp)
	call	_printf
	movl	-4(%ebp), %ebx
	leave
	ret
	.def	_memcpy;	.scl	3;	.type	32;	.endef
	.def	_printf;	.scl	3;	.type	32;	.endef

I have no good way to see the ASM generated by dmd yet (I may buy Walter's utility). (Maybe some of you can show that asm, but remember to remove the outer j loop as in the C version).

My obvious conclusion is that I hope to see built-in bitfields in D ;-)
The D docs say they aren't used often, but now and then I translate C code to D and I don't want to write many (slow) getter and setter properties like that :-)

Bye,
bearophile