Ceci n'est pas une char

Thu Apr 6 16:50:26 PDT 2006

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Georg Wrede schrieb am 2006-04-06:
> Mike Capp wrote:
>> "Wasteful" is also relative. UTF-32 is certainly wasteful of memory
>> space, but UTF-8 is potentially far more wasteful of CPU cycles and
>> memory bandwidth.
>
> It sure looks like it. Then again, studying the UTF-8 spec, and "why we 
> did it this way" (sorry, no URL here. Anybody?), shows that it actually 
> is _amazingly_ light on CPU cycles! Really.

Have a look at the encoding of Hangul (Korean) and polytonic Greek <g>

> (( I sure wish there was somebody in this NG who could write a 
> Scientifically Valid test to compare the time needed to find the 
> millionth character in UTF-8 vs. UTF-8 first converted to UTF-32. ))

Challenge:
Provide a D implementation that first converts to UTF-32 and has
shorter runtime than the code below:

# size_t codepoint_to_index(size_t codepoint_number, char[] data){ /*
#	Returns the byte offset within `data` at which codepoint number
#	`codepoint_number` (counting from 0) begins.
#
#	The walk reads only the first byte of each sequence and derives the
#	sequence length from that byte's high bits; the following bytes are
#	stepped over without being read or validated.
#
#	Throws Exception("not enough input") when `data` is empty (checked
#	before the codepoint_number == 0 shortcut, so even codepoint 0 of an
#	empty array throws) or when the cursor reaches or passes the end of
#	`data` before arriving at the requested codepoint.
#
#	NOTE(review): size_t values and pointers are held in the 32-bit
#	registers EDX/ECX/EBX, so this presumably targets 32-bit x86 only -
#	confirm before building for another target.
#	NOTE(review): assumes `data` is well-formed UTF-8; a stray
#	continuation byte in lead position is counted as a 2-byte sequence.
# */
#	char* start = data.ptr;
#	char* end = start + data.length;	// one past the last byte of data
#	size_t index;	// written by the asm block below
#
#	if(!data.length){
# insufficent_input:	// also the jump target of the bounds check inside the asm block
#		throw new Exception("not enough input");
#	}
#
#	if(!codepoint_number){
#		return 0;	// codepoint 0 starts at byte 0 of non-empty data
#	}
#
#	asm{
#			mov	EDX,	codepoint_number;	// EDX = codepoints still to skip (non-zero here)
#			mov	ECX,	start;	// ECX = cursor into data
#			mov	EBX,	end;	// EBX = end-of-data pointer
#			
#		next_codepoint:
#			mov	AL,	[ECX];	// AL = first byte of the current sequence
#			inc	ECX;	// step over that byte
#			sal	AL,	1;	// CF = bit 7 of the first byte
#			jnc	end_of_codepoint;	// bit 7 clear: single-byte sequence
#			sal	AL,	1;	// shift out bit 6; its value is not tested
#		inner_loop:
#			inc	ECX;	// step over one more byte of the sequence
#			sal	AL,	1;	// CF = next lower bit of the first byte (bit 5, then 4, ...)
#			jc	inner_loop;	// bit set: the sequence has another byte
#		
#		end_of_codepoint:
#			// bounds check: ECX now points just past the sequence; if that
#			// is at or beyond the end, the next codepoint does not exist
#			cmp	ECX,	EBX;
#			jnb	insufficent_input;	// unsigned >=: leave the asm block and throw
#		
#			// was that the last codepoint to skip?
#			dec	EDX;
#			jnz	next_codepoint;
#
#			// convert the cursor to a byte offset: index = ECX - start
#			mov	EBX,	start;
#			sub	ECX,	EBX;
#			mov	index,	ECX;		// hand the result back to the D local
#	}
#
#	return index;
# }

Thomas


-----BEGIN PGP SIGNATURE-----

iD8DBQFENbZw3w+/yD4P9tIRAjTkAJsEcE6xM0fSLrT3x+iArgdVacZIXgCgsnNa
19AB53HGi6fbH9AuHTMvjq4=
=gZWL
-----END PGP SIGNATURE-----