parsing fastq files with D
rikki cattermole via Digitalmars-d-learn
digitalmars-d-learn at puremagic.com
Wed Mar 23 23:34:51 PDT 2016
As a little fun thing to do I implemented it for you.
It won't allocate, which makes it perfect for you.
With a bit of work you could make Result keep its own buffers for the result instead
of slicing the input array, which would allow the source to be an input range itself.
I made this up on dpaste and single quotes were not playing nicely
there. So you'll see "\r"[0] as a workaround.
/**
 * One record of a four-line FASTQ entry.
 *
 * Every field is a slice of the text that was handed to `parse`; nothing is
 * copied, so the records are only valid while that text is alive.
 */
struct FastQRecord {
    /// Header line without the leading '@' and without the line ending.
    const(char)[] sequenceId;
    /// Second line of the record (the raw sequence), without the line ending.
    const(char)[] sequenceLetters;
    /// Fourth line of the record (the quality string), without the line ending.
    const(char)[] quality;

    /**
     * Lazily splits `from` into FASTQ records.
     *
     * Each record is expected to occupy exactly four lines: "@id", sequence,
     * a separator line, and quality. Lines may end in "\n" or "\r\n", and the
     * very last line of the input may omit its line ending. Whitespace and
     * control characters between records are skipped.
     *
     * Params:
     *     from = the complete FASTQ text; may be null or blank
     *
     * Returns: an input range (`empty` / `front` / `popFront`) of `FastQRecord`.
     *
     * Throws: `Exception` when a record does not start with '@' or when one of
     *         its four lines is missing or empty. Well-formed input is parsed
     *         without any allocation.
     */
    static auto parse(const(char)[] from) {
        struct Result {
            private {
                const(char)[] source; // text that has not been consumed yet
                FastQRecord value;    // record currently exposed through front
                bool isEmpty;
            }

            this(const(char)[] source) {
                this.source = source;
                // Prime the range so that front is valid right away.
                popFront;
            }

            @property {
                FastQRecord front() {
                    return value;
                }

                bool empty() {
                    return isEmpty;
                }
            }

            void popFront() {
                import std.exception : enforce;

                skipBlanks();
                if (source.length == 0) {
                    // Nothing but blanks left (or nothing at all): the range is
                    // exhausted. This also covers blank and empty input, which
                    // must not produce a default-initialised record.
                    source = null;
                    isEmpty = true;
                    return;
                }

                enforce(source[0] == '@', "FASTQ record does not start with '@'");
                // The header line is known to hold at least the '@', so the
                // slice below is always in bounds; the id itself may be empty.
                value.sequenceId = takeLine("FASTQ record has an empty header line")[1 .. $];
                value.sequenceLetters = takeLine("FASTQ record is missing its sequence line");
                // The separator line is consumed but its content is not kept.
                takeLine("FASTQ record is missing its '+' separator line");
                value.quality = takeLine("FASTQ record is missing its quality line");
            }

            /// Drops leading characters in the range 0 .. ' ' (controls and space).
            private void skipBlanks() {
                size_t firstUseful = 0;
                while (firstUseful < source.length && source[firstUseful] <= ' ')
                    ++firstUseful;
                source = source[firstUseful .. $];
            }

            /**
             * Removes the next line from `source` and returns it without its
             * "\n" or "\r\n" terminator. A final line lacking a terminator is
             * accepted. Throws `Exception(problem)` if the line is empty or the
             * input has run out.
             */
            private const(char)[] takeLine(string problem) {
                import std.exception : enforce;
                import std.string : indexOf;

                const(char)[] line;
                immutable ptrdiff_t newline = source.indexOf("\n");
                if (newline < 0) {
                    // Last line of the input without a trailing newline.
                    line = source;
                    source = null;
                } else {
                    line = source[0 .. newline];
                    source = source[newline + 1 .. $];
                }

                // Strip the carriage return of a Windows line ending; the length
                // check keeps this safe for empty lines.
                if (line.length > 0 && line[$ - 1] == "\r"[0])
                    line = line[0 .. $ - 1];

                enforce(line.length > 0, problem);
                return line;
            }
        }

        return Result(from);
    }
}
// Intentionally empty entry point: the parser is exercised only by the unittest block in this file.
void main() {}
// Parses three well-formed records, prints them, and checks every field.
unittest {
    // One ordinary multi-line string literal. The [1 .. $] slice drops the
    // newline that directly follows the opening quote, so the text starts at
    // "@seq1".
    string input = "
@seq1
TTATTTTAAT
+
?+BBB/DHH@
@seq2
GACCCTTTGCA
+
?+BHB/DIH@
@SEQ_ID
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
"[1 .. $];

    FastQRecord[] records;
    foreach(record; FastQRecord.parse(input)) {
        import std.stdio;
        writeln(record);
        records ~= record;
    }

    assert(records.length == 3);

    assert(records[0].sequenceId == "seq1");
    assert(records[0].sequenceLetters == "TTATTTTAAT");
    assert(records[0].quality == "?+BBB/DHH@");

    assert(records[1].sequenceId == "seq2");
    assert(records[1].sequenceLetters == "GACCCTTTGCA");
    assert(records[1].quality == "?+BHB/DIH@");

    assert(records[2].sequenceId == "SEQ_ID");
    assert(records[2].sequenceLetters
        == "GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT");
    assert(records[2].quality
        == "!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65");
}
---
This email has been checked for viruses by Avast antivirus software.
https://www.avast.com/antivirus
More information about the Digitalmars-d-learn
mailing list