Python-like slicing and handling UTF-8 strings as a bonus
FG
home at fgda.pl
Sat Dec 29 14:25:15 PST 2012
Slices are great but not really what I had expected, coming from Python.
I've seen code like s[a..$-b] used without checking the values, just to end up
with a Range violation. But there are 3 constraints to check here:
a >= 0 && a + b <= s.length && b >= 0
That's way too much coding for a simple program/script that just shortens a
string before printing it on the screen. If I can't write s[0..80] without
fear, then let there at least be a function that does it like Python would.
Additionally, as strings are UTF-8-encoded, I'd like such a function to give me
proper substrings, without multibyte characters cut in the middle, where
s[0..80] would mean 80 characters on the screen and not 80 bytes.
I would envision it being part of std.string eventually.
Forgive me if such a function already exists -- I couldn't find it.
I also still don't speak D too well, so don't laugh. :)
import std.array, std.range, std.stdio;
/**
 * Returns a Python-style slice of s.
 *
 * Negative indices count from the end, and out-of-range indices are clamped
 * instead of causing a range violation. For arrays of char or wchar the
 * indices count decoded characters (as produced by foreach with a dchar
 * loop variable) rather than code units, so a multi-unit character is never
 * cut in the middle. For every other element type the indices are plain
 * element indices.
 *
 * Params:
 *     s     = the array or string to slice
 *     start = index of the first element/character; negative values count
 *             from the end
 *     end   = index one past the last element/character; negative values
 *             count from the end; when omitted the slice runs to the end
 *
 * Returns: a slice of s (not a copy); empty if the range selects nothing.
 */
auto getSlice(T)(T[] s, ptrdiff_t start, ptrdiff_t end = ptrdiff_t.max)
pure @safe
{
    // True when indices are interpreted as character positions and have to
    // be translated to code-unit offsets by walking the string.
    enum bool decodes = is(T == char) || is(T == immutable(char)) ||
                        is(T : wchar) || is(T : immutable(wchar));

    bool start_from_back, end_from_back;
    immutable size_t full_len = s.length;
    ptrdiff_t len;
    if (full_len > ptrdiff_t.max)
        len = ptrdiff_t.max;
    else
        len = cast(ptrdiff_t) full_len;

    if (end < 0)
    {
        end_from_back = true;
        end += len;
    }
    if (end > len) end = len;

    if (start < 0)
    {
        if (0 - start >= len)
            start = 0;
        else
        {
            start += len;
            start_from_back = true;
        }
    }
    // Still negative only when negating start overflowed (ptrdiff_t.min).
    if (start < 0) start = 0;

    // For decoded strings a back-relative index has been rebased with the
    // code-unit length while a front-relative index is still a character
    // index. Comparing the two is only conclusive when start is
    // front-relative or both are back-relative; a back-relative start can
    // exceed a front-relative end numerically and still precede it in the
    // text. That case is settled after the real offsets are known.
    immutable bool orderKnown = !decodes || !start_from_back || end_from_back;
    if (start >= len || end <= 0 || (orderKnown && start > end))
        return s[0..0];

    static if (decodes)
    {
        ptrdiff_t real_start = -1, real_end = -1, loop, last_pos;

        // Forward pass: resolve the front-relative indices.
        if (!start_from_back || !end_from_back)
        {
            foreach (ptrdiff_t i, dchar c; s)
            {
                if (!start_from_back && loop >= start && real_start < 0)
                    real_start = i;
                if (!end_from_back && loop >= end && real_end < 0)
                    real_end = i;
                if ((start_from_back || real_start > -1) &&
                    (end_from_back || real_end > -1 || end == len))
                    break;
                loop++;
            }
        }

        // Undo the rebasing so back-relative indices are negative counts
        // again (-1 is the last character).
        start -= len;
        end -= len;
        loop = -1;

        // Backward pass: resolve the back-relative indices.
        if (start_from_back || end_from_back)
        {
            foreach_reverse (ptrdiff_t i, dchar c; s)
            {
                if (start_from_back && loop <= start && real_start < 0)
                    real_start = i;
                if (end_from_back && loop <= end && real_end < 0)
                    real_end = i;
                if ((!start_from_back || real_start > -1) &&
                    (!end_from_back || real_end > -1))
                    break;
                loop--;
            }
        }

        // An index that was never reached lies beyond the text: clamp it to
        // the front (back-relative) or to the end (front-relative).
        if (real_end < 0) real_end = (end_from_back ? 0 : len);
        if (real_start < 0) real_start = (start_from_back ? 0 : len);
        if (real_start > real_end) real_start = real_end = 0;
        return s[real_start..real_end];
    }
    else return s[start..end];
}

// Regression: back-relative start combined with front-relative end on
// multi-byte text.
unittest {
    assert("łłłł".getSlice(-1, 4) == "ł");
    assert("ąęc".getSlice(-1, 3) == "c");
    assert("ąb".getSlice(-1, 1) == "");
    assert("okrągły stół".getSlice(-1, 12) == "ł");
}
unittest {
    // The same text as UTF-8 (multi-byte characters) and as UTF-32.
    string utf8 = "okrągły stół";
    dstring utf32 = "okrągły stół"d;
    auto numbers = [0, 1, 2, 3, 4];

    // Plain arrays: negative and out-of-range indices.
    assert(getSlice(numbers, 0, -1) == [0, 1, 2, 3]);
    assert(getSlice(numbers, 1, -2) == [1, 2]);
    assert(getSlice(numbers, -4, -2) == [1, 2]);
    assert(getSlice(numbers, -5, 7) == [0, 1, 2, 3, 4]);

    // UTF-8 string: indices count characters, not bytes.
    assert(getSlice(utf8, 0, 0) == "");
    assert(getSlice(utf8, 0, 1) == "o");
    assert(getSlice(utf8, 0) == utf8);
    assert(getSlice(utf8, 8) == "stół");
    assert(getSlice(utf8, 8, -1) == "stó");
    assert(getSlice(utf8, 8, -2) == "st");
    assert(getSlice(utf8, 8, -4) == "");
    assert(getSlice(utf8, 10, 11) == "ó");
    assert(getSlice(utf8, 10, -1) == "ó");
    assert(getSlice(utf8, 10, 12) == "ół");
    assert(getSlice(utf8, 11, 12) == "ł");
    assert(getSlice(utf8, 11, 15) == "ł");

    // UTF-32 string: the same expectations must hold.
    assert(getSlice(utf32, 0, 0) == ""d);
    assert(getSlice(utf32, 0, 1) == "o"d);
    assert(getSlice(utf32, 0) == utf32);
    assert(getSlice(utf32, 8) == "stół"d);
    assert(getSlice(utf32, 8, -1) == "stó"d);
    assert(getSlice(utf32, 8, -2) == "st"d);
    assert(getSlice(utf32, 8, -4) == ""d);
    assert(getSlice(utf32, 10, 11) == "ó"d);
    assert(getSlice(utf32, 10, -1) == "ó"d);
    assert(getSlice(utf32, 10, 12) == "ół"d);
    assert(getSlice(utf32, 11, 12) == "ł"d);
    assert(getSlice(utf32, 11, 15) == "ł"d);
}
More information about the Digitalmars-d
mailing list