suggestion: clean white space / end of line definition

Thomas Kuehne thomas-dloop at kuehne.cn
Wed Nov 1 00:20:33 PST 2006


-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Walter Bright schrieb am 2006-11-01:
> There is a problem though with replacing it all with a function - lexing 
> speed. Lexing speed is critically dependent on being able to consume 
> whitespace fast, hence all the inline code to do it. Running the source 
> through two passes makes it half as fast.

Here is a faster mock-up (untested!) using functions.
Use of macros is certainly possible too.

Thomas

# unsigned char* isEndOfLine(unsigned char* input){
#     switch(input[0]){
#	 /* covered by the lexer:
#	  * case 0x0A:    // LINE FEED
#	  */
#	 case 0x0B:    // LINE TABULATION
#	 case 0x0C:    // FORM FEED
#	     return input;
#	 case 0x0D:    // CARRIAGE RETURN
#	     if(input[1] == 0x0A){
#		 return input + 1;
#	     }
#	     return input;
#	 case 0xC2:    // NEXT LINE
#	     if(input[1] == 0x85){
#		 return input + 1;
#	     }
#	     break;
#	 case 0xE2:    // LINE SEPARATOR || PARAGRAPH SEPARATOR
#	     if((input[1] == 0x80) && ((input[2] == 0xA8) || (input[2] == 0xA9))){
#		 return input + 2;
#	     }
#	     break;
#	 default:
#	     break;
#     }
# 
#     return 0;
# }
# 
# /*
#  * Check whether the UTF-8 bytes at input encode a white-space character
#  * other than plain SPACE (0x20), which the lexer handles itself.
#  *
#  * Recognized: U+0009, U+001F, U+00A0, U+1680, U+180E, U+2000..U+200A,
#  * U+202F, U+205F and U+3000.
#  *
#  * Returns a pointer to the LAST byte of the recognized sequence, or 0 when
#  * input does not start with one. A continuation byte is only read after
#  * the byte before it matched a non-zero value, so a 0-terminated buffer is
#  * never read past its terminator.
#  */
# unsigned char* isSpace(unsigned char* input){
#     switch(input[0]){
#         /* covered by the lexer:
#          * case 0x20:    // SPACE
#          */
#         case 0x09:    // CHARACTER TABULATION
#         case 0x1F:    // INFORMATION SEPARATOR ONE
#             return input;
#         case 0xC2:
#             if(input[1] == 0xA0){
#                 // NO-BREAK SPACE (U+00A0)
#                 return input + 1;
#             }
#             break;
#         case 0xE1:
#             switch(input[1]){
#                 case 0x9A:
#                     // U+1680 encodes as E1 9A 80. The second byte was
#                     // previously tested against 0xA9, which never matched
#                     // OGHAM SPACE MARK and accepted U+1A40 instead.
#                     if(input[2] == 0x80){
#                         // OGHAM SPACE MARK (U+1680)
#                         return input + 2;
#                     }
#                     break;
#                 case 0xA0:
#                     if(input[2] == 0x8E){
#                         // MONGOLIAN VOWEL SEPARATOR (U+180E)
#                         // NOTE(review): newer Unicode versions appear to
#                         // classify this as a format character rather than
#                         // a space; kept as proposed - confirm.
#                         return input + 2;
#                     }
#                     break;
#                 default:
#                     break;
#             }
#             break;
#         case 0xE2:
#             switch(input[1]){
#                 case 0x80:
#                     if((0x80 <= input[2]) && (input[2] <= 0x8A)){
#                         // EN QUAD..HAIR SPACE (U+2000..U+200A)
#                         return input + 2;
#                     }else if(input[2] == 0xAF){
#                         // NARROW NO-BREAK SPACE (U+202F)
#                         return input + 2;
#                     }
#                     break;
#                 case 0x81:
#                     if(input[2] == 0x9F){
#                         // MEDIUM MATHEMATICAL SPACE (U+205F)
#                         return input + 2;
#                     }
#                     break;
#                 default:
#                     break;
#             }
#             break;
#         case 0xE3:
#             if((input[1] == 0x80) && (input[2] == 0x80)){
#                 // IDEOGRAPHIC SPACE (U+3000)
#                 return input + 2;
#             }
#             break;
#         default:
#             break;
#     }
#     return 0;
# }
# 
# /*
#  * Sketch of the lexer's main loop, showing how isEndOfLine() and isSpace()
#  * plug into the existing switch on the current byte.
#  *
#  * Plain ' ' and '\n' stay as inline cases. Every other byte reaches the
#  * default branch, which asks the two helpers. On a match p is moved to the
#  * pointer they return and control jumps to the matching label, whose p++
#  * then steps to the next byte.
#  *
#  * This is a mock-up: p is declared without an initializer here, and the
#  * rest of the lexer is elided.
#  */
# void lexer(){
#     unsigned char* p;
#     unsigned char* tmp;
#     while (1)
#     {
#	 switch (*p)
#	 {
#	     /* jump target for white space found by isSpace() */
#	     Lspace:
#	     case ' ':
#		 p++;
#		 continue;	    // skip white space
# 
#	     /* jump target for line ends found by isEndOfLine() */
#	     Lnew_line:
#	     case '\n':
#		 p++;
#		 //loc.linnum++;
#		 continue;	    // skip white space
# 
# /* a lot more code goes here */
# 
#	     default:
#		 /* line ends are tested before other white space */
#		 if((tmp = isEndOfLine(p))){
#		     p = tmp;
#		     goto Lnew_line;
#		 }
#		 if((tmp = isSpace(p))){
#		     p = tmp;
#		     goto Lspace;
#		 }
#		 /* no match: as shown, control leaves the switch with p unchanged */
# 
# /* a lot more code goes here */
#	 }
#     }
# }

-----BEGIN PGP SIGNATURE-----

iD8DBQFFSGXwLK5blCcjpWoRAvzLAKCO0gfLsLKj0nLykQoYOobQ1TKJXwCfUwg+
mSqDFqxJiaBcbCh5LR1Cae4=
=mpgd
-----END PGP SIGNATURE-----



More information about the Digitalmars-d mailing list