parsing fastq files with D

rikki cattermole via Digitalmars-d-learn digitalmars-d-learn at puremagic.com
Wed Mar 23 23:34:51 PDT 2016


As a little fun thing to do I implemented it for you.

It won't allocate, which should make it a good fit for you.
With a bit of work you could make Result keep its own buffers for the 
record instead of slicing the input array; that would allow the source 
to be an input range itself.

I made this up on dpaste and single quotes were not playing nicely 
there. So you'll see "\r"[0] as a workaround.

/// One record of a FASTQ file.
///
/// All three fields are slices of the text handed to `parse`; nothing is
/// copied, so they are only valid for as long as that text is.
struct FastQRecord {
	/// Header line without the leading '@' and without the line ending.
	const(char)[] sequenceId;
	/// The sequence line, without the line ending.
	const(char)[] sequenceLetters;
	/// The quality line, without the line ending.
	const(char)[] quality;
	
	/// Returns an input range of `FastQRecord`s lazily parsed from `from`.
	///
	/// Each record is expected to occupy exactly four lines (header starting
	/// with '@', sequence, separator starting with '+', quality). Lines may
	/// end in "\n" or "\r\n", and the final line may lack a line ending.
	/// Whitespace/control characters (0 .. ' ') before the first record and
	/// between records are skipped. Empty or whitespace-only input yields an
	/// empty range.
	///
	/// Structural problems (missing '@' or '+', truncated record) are
	/// reported through `assert`, i.e. only in non-release builds.
	static auto parse(const(char)[] from) {
		struct Result {
			private {
				const(char)[] source;	// text not yet consumed
				FastQRecord value;	// record currently exposed by front
				bool isEmpty;
			}
			
			this(const(char)[] source) {
				this.source = source;
				// Prime the range so that front is valid immediately.
				popFront;
			}
			
			@property {
				FastQRecord front() {
					return value;
				}
				
				bool empty() {
					return isEmpty;
				}
			}
			
			void popFront() {
				skipBlanks();
				
				// Nothing left (also covers "" and whitespace-only input):
				// flag the range as exhausted instead of exposing a stale or
				// default-initialised record.
				if (source.length == 0) {
					source = null;
					isEmpty = true;
					return;
				}
				
				// sequenceId
				assert(source[0] == '@', "FASTQ record must start with '@'");
				// The header line is never empty here because skipBlanks left
				// a non-blank first character, so dropping one char is safe.
				value.sequenceId = takeLine()[1 .. $];
				
				// sequenceLetters
				value.sequenceLetters = takeLine();
				
				// +sequenceId (content after the '+' is ignored)
				const(char)[] separator = takeLine();
				assert(separator.length > 0 && separator[0] == '+',
					"FASTQ separator line must start with '+'");
				
				// quality
				value.quality = takeLine();
				
				// Drop blank space up to the next record so that trailing
				// whitespace at the end of the input does not look like data.
				skipBlanks();
			}
			
			private {
				/// Advances source past every leading character in 0 .. ' '.
				void skipBlanks() {
					size_t i = 0;
					while (i < source.length && source[i] <= ' ')
						++i;
					source = source[i .. $];
				}
				
				/// Removes the next line from source and returns it without
				/// its "\n" or "\r\n" terminator. The last line of the input
				/// does not need a terminator.
				const(char)[] takeLine() {
					import std.string : indexOf;
					
					// Same workaround as the original post: a char obtained
					// without writing a single-quoted literal.
					enum char cr = "\r"[0];
					
					assert(source.length > 0, "truncated FASTQ record");
					
					const(char)[] line;
					ptrdiff_t end = source.indexOf("\n");
					
					if (end < 0) {
						// Final line without a trailing newline.
						line = source;
						source = null;
					} else {
						line = source[0 .. end];
						source = source[end + 1 .. $];
					}
					
					if (line.length > 0 && line[$-1] == cr)
						line = line[0 .. $-1];
					
					return line;
				}
			}
		}
		
		return Result(from);
	}
}

/// Intentionally empty entry point: the file only exists to be compiled
/// and run with unit tests enabled.
void main()
{
}

// Parses a three-record sample, checks the extracted fields and prints each
// record.
unittest {
	import std.stdio : writeln;
	
	// A single ordinary string literal: D has no triple-quoted strings, and
	// the implicit concatenation of adjacent literals that the former
	// """...""" spelling relied on is no longer accepted by the compiler.
	// The slice drops the newline that directly follows the opening quote.
	string input = "
@seq1
TTATTTTAAT
+
?+BBB/DHH@
@seq2
GACCCTTTGCA
+
?+BHB/DIH@
@SEQ_ID
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65		
"[1 .. $];
	
	// Fixed-size storage keeps the test itself allocation free.
	FastQRecord[3] records;
	size_t count = 0;
	
	foreach(record; FastQRecord.parse(input)) {
		assert(count < records.length, "parser produced too many records");
		records[count++] = record;
		writeln(record);
	}
	
	assert(count == 3);
	
	assert(records[0].sequenceId == "seq1");
	assert(records[0].sequenceLetters == "TTATTTTAAT");
	assert(records[0].quality == "?+BBB/DHH@");
	
	assert(records[1].sequenceId == "seq2");
	assert(records[1].sequenceLetters == "GACCCTTTGCA");
	assert(records[1].quality == "?+BHB/DIH@");
	
	assert(records[2].sequenceId == "SEQ_ID");
	assert(records[2].sequenceLetters.length == 60);
	// The sample's last quality line carries trailing whitespace, so only
	// its leading character is checked.
	assert(records[2].quality.length >= 60);
	assert(records[2].quality[0] == '!');
}

---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus



More information about the Digitalmars-d-learn mailing list