1 // Written in the D programming language.
2
3 /**
4 Classes and functions for handling and transcoding between various encodings.
5
6 For cases where the encoding is known at compile-time, functions are provided
7 for arbitrary encoding and decoding of characters, arbitrary transcoding
8 between strings of different type, as well as validation and sanitization.
9
10 Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250, WINDOWS-1251
12 and WINDOWS-1252.
13
14 $(SCRIPT inhibitQuickIndex = 1;)
15 $(DIVC quickindex,
16 $(BOOKTABLE,
17 $(TR $(TH Category) $(TH Functions))
18 $(TR $(TD Decode) $(TD
19 $(LREF codePoints)
20 $(LREF decode)
21 $(LREF decodeReverse)
22 $(LREF safeDecode)
23 ))
24 $(TR $(TD Conversion) $(TD
25 $(LREF codeUnits)
26 $(LREF sanitize)
27 $(LREF transcode)
28 ))
29 $(TR $(TD Classification) $(TD
30 $(LREF canEncode)
31 $(LREF isValid)
32 $(LREF isValidCodePoint)
33 $(LREF isValidCodeUnit)
34 ))
35 $(TR $(TD BOM) $(TD
36 $(LREF BOM)
37 $(LREF BOMSeq)
38 $(LREF getBOM)
39 $(LREF utfBOM)
40 ))
41 $(TR $(TD Length & Index) $(TD
42 $(LREF firstSequence)
43 $(LREF encodedLength)
44 $(LREF index)
45 $(LREF lastSequence)
46 $(LREF validLength)
47 ))
48 $(TR $(TD Encoding schemes) $(TD
49 $(LREF encodingName)
50 $(LREF EncodingScheme)
51 $(LREF EncodingSchemeASCII)
52 $(LREF EncodingSchemeLatin1)
53 $(LREF EncodingSchemeLatin2)
54 $(LREF EncodingSchemeUtf16Native)
55 $(LREF EncodingSchemeUtf32Native)
56 $(LREF EncodingSchemeUtf8)
57 $(LREF EncodingSchemeWindows1250)
58 $(LREF EncodingSchemeWindows1251)
59 $(LREF EncodingSchemeWindows1252)
60 ))
61 $(TR $(TD Representation) $(TD
62 $(LREF AsciiChar)
63 $(LREF AsciiString)
64 $(LREF Latin1Char)
65 $(LREF Latin1String)
66 $(LREF Latin2Char)
67 $(LREF Latin2String)
68 $(LREF Windows1250Char)
69 $(LREF Windows1250String)
70 $(LREF Windows1251Char)
71 $(LREF Windows1251String)
72 $(LREF Windows1252Char)
73 $(LREF Windows1252String)
74 ))
75 $(TR $(TD Exceptions) $(TD
76 $(LREF INVALID_SEQUENCE)
77 $(LREF EncodingException)
78 ))
79 ))
80
81 For cases where the encoding is not known at compile-time, but is
known at run-time, the abstract class $(LREF EncodingScheme)
and its subclasses are provided. To construct a run-time encoder/decoder,
84 one does e.g.
85
86 ----------------------------------------------------
87 auto e = EncodingScheme.create("utf-8");
88 ----------------------------------------------------
89
90 This library supplies $(LREF EncodingScheme) subclasses for ASCII,
91 ISO-8859-1 (also known as LATIN-1), ISO-8859-2 (LATIN-2), WINDOWS-1250,
92 WINDOWS-1251, WINDOWS-1252, UTF-8, and (on little-endian architectures)
93 UTF-16LE and UTF-32LE; or (on big-endian architectures) UTF-16BE and UTF-32BE.
94
95 This library provides a mechanism whereby other modules may add $(LREF
96 EncodingScheme) subclasses for any other encoding.
97
98 Copyright: Copyright Janice Caron 2008 - 2009.
99 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
100 Authors: Janice Caron
101 Source: $(PHOBOSSRC std/encoding.d)
102 */
103 /*
104 Copyright Janice Caron 2008 - 2009.
105 Distributed under the Boost Software License, Version 1.0.
106 (See accompanying file LICENSE_1_0.txt or copy at
107 http://www.boost.org/LICENSE_1_0.txt)
108 */
109 module std.encoding;
110
111 import std.range.primitives;
112 import std.traits;
113 import std.typecons;
114
// Module-level self test: checks UTF-8 validation and sanitization against
// tables of well-formed and malformed byte sequences, then round-trips every
// valid string through UTF-8/UTF-16/UTF-32 (forwards and backwards) and
// finally exercises the single-byte encodings, encodedLength and encode.
@system unittest
{
    static ubyte[][] validStrings =
    [
        // Plain ASCII
        cast(ubyte[])"hello",

        // First possible sequence of a certain length
        [ 0x00 ],                       // U+00000000   one byte
        [ 0xC2, 0x80 ],                 // U+00000080   two bytes
        [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
        [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

        // Last possible sequence of a certain length
        [ 0x7F ],                       // U+0000007F   one byte
        [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

        // Other boundary conditions
        [ 0xED, 0x9F, 0xBF ],
        // U+0000D7FF   Last character before surrogates
        [ 0xEE, 0x80, 0x80 ],
        // U+0000E000   First character after surrogates
        [ 0xEF, 0xBF, 0xBD ],
        // U+0000FFFD   Unicode replacement character
        [ 0xF4, 0x8F, 0xBF, 0xBF ],
        // U+0010FFFF   Very last character

        // Non-character code points
        /*  NOTE: These are legal in UTF, and may be converted from
            one UTF to another, however they do not represent Unicode
            characters. These code points have been reserved by
            Unicode as non-character code points. They are permissible
            for data exchange within an application, but they are
            not permitted to be used as characters. Since this module
            deals with UTF, and not with Unicode per se, we choose to
            accept them here. */
        // The three-byte forms below are the actual UTF-8 encodings of
        // U+FFFE and U+FFFF (the two-byte 0xDF 0xBE / 0xDF 0xBF pair used
        // previously encodes U+07FE / U+07FF instead).
        [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
        [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
    ];

    static ubyte[][] invalidStrings =
    [
        // First possible sequence of a certain length, but greater
        // than U+10FFFF
        [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],        // U+00200000   five bytes
        [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],  // U+04000000   six bytes

        // Last possible sequence of a certain length, but greater than U+10FFFF
        [ 0xF7, 0xBF, 0xBF, 0xBF ],              // U+001FFFFF   four bytes
        [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],        // U+03FFFFFF   five bytes
        [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],  // U+7FFFFFFF   six bytes

        // Other boundary conditions
        [ 0xF4, 0x90, 0x80, 0x80 ],              // U+00110000
                                                 // First code
                                                 // point after
                                                 // last character

        // Unexpected continuation bytes
        [ 0x80 ],
        [ 0xBF ],
        [ 0x20, 0x80, 0x20 ],
        [ 0x20, 0xBF, 0x20 ],
        [ 0x80, 0x9F, 0xA0 ],

        // Lonely start bytes
        [ 0xC0 ],
        [ 0xCF ],
        [ 0x20, 0xC0, 0x20 ],
        [ 0x20, 0xCF, 0x20 ],
        [ 0xD0 ],
        [ 0xDF ],
        [ 0x20, 0xD0, 0x20 ],
        [ 0x20, 0xDF, 0x20 ],
        [ 0xE0 ],
        [ 0xEF ],
        [ 0x20, 0xE0, 0x20 ],
        [ 0x20, 0xEF, 0x20 ],
        [ 0xF0 ],
        [ 0xF1 ],
        [ 0xF2 ],
        [ 0xF3 ],
        [ 0xF4 ],
        [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
        [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

        [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
        [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
        [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

        // Impossible bytes
        [ 0xF8 ],
        [ 0xF9 ],
        [ 0xFA ],
        [ 0xFB ],
        [ 0xFC ],
        [ 0xFD ],
        [ 0xFE ],
        [ 0xFF ],
        [ 0x20, 0xF8, 0x20 ],
        [ 0x20, 0xF9, 0x20 ],
        [ 0x20, 0xFA, 0x20 ],
        [ 0x20, 0xFB, 0x20 ],
        [ 0x20, 0xFC, 0x20 ],
        [ 0x20, 0xFD, 0x20 ],
        [ 0x20, 0xFE, 0x20 ],
        [ 0x20, 0xFF, 0x20 ],

        // Overlong sequences, all representing U+002F
        /*  With a safe UTF-8 decoder, all of the following five overlong
            representations of the ASCII character slash ("/") should be
            rejected like a malformed UTF-8 sequence */
        [ 0xC0, 0xAF ],
        [ 0xE0, 0x80, 0xAF ],
        [ 0xF0, 0x80, 0x80, 0xAF ],
        [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

        // Maximum overlong sequences
        /*  Below you see the highest Unicode value that is still resulting in
            an overlong sequence if represented with the given number of bytes.
            This is a boundary test for safe UTF-8 decoders. All five
            characters should be rejected like malformed UTF-8 sequences. */
        [ 0xC1, 0xBF ],                             // U+0000007F
        [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
        [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
        [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
        [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

        // Overlong representation of the NUL character
        /*  The following five sequences should also be rejected like malformed
            UTF-8 sequences and should not be treated like the ASCII NUL
            character. */
        [ 0xC0, 0x80 ],
        [ 0xE0, 0x80, 0x80 ],
        [ 0xF0, 0x80, 0x80, 0x80 ],
        [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
        [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

        // Illegal code positions
        /*  The following UTF-8 sequences should be rejected like malformed
            sequences, because they never represent valid ISO 10646 characters
            and a UTF-8 decoder that accepts them might introduce security
            problems comparable to overlong UTF-8 sequences. */
        [ 0xED, 0xA0, 0x80 ],       // U+D800
        [ 0xED, 0xAD, 0xBF ],       // U+DB7F
        [ 0xED, 0xAE, 0x80 ],       // U+DB80
        [ 0xED, 0xAF, 0xBF ],       // U+DBFF
        [ 0xED, 0xB0, 0x80 ],       // U+DC00
        [ 0xED, 0xBE, 0x80 ],       // U+DF80
        [ 0xED, 0xBF, 0xBF ],       // U+DFFF
    ];

    // Expected result of sanitize() for each entry of invalidStrings,
    // in the same order (the lengths are asserted to match below).
    static string[] sanitizedStrings =
    [
        "\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
        "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
        " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
        " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
    ];

    // HELPER FUNCTIONS
    // we can probably do this better...

    // Maps the low four bits of n to an upper-case hexadecimal digit.
    static char toHexDigit(int n)
    {
        return "0123456789ABCDEF"[n & 0xF];
    }

    // Renders s inside double quotes, escaping every byte outside the
    // printable ASCII range 0x20 .. 0x7F as \xHH, for assertion messages.
    static string makeReadable(string s)
    {
        string r = "\"";
        foreach (char c; s)
        {
            if (c >= 0x20 && c < 0x80)
            {
                r ~= c;
            }
            else
            {
                r ~= "\\x";
                r ~= toHexDigit(c >> 4);
                r ~= toHexDigit(c);
            }
        }
        r ~= "\"";
        return r;
    }

    // Transcodes s into r by walking the code points, and the code units of
    // each code point, back to front and prepending to r.
    void transcodeReverse(Src,Dst)(immutable(Src)[] s, out immutable(Dst)[] r)
    {
        static if (is(Src == Dst))
        {
            // Same encoding: hand the input back through the out parameter
            // (a `return s;` is not valid in a void function).
            r = s;
        }
        else static if (is(Src == AsciiChar))
        {
            transcodeReverse!(char,Dst)(cast(string) s,r);
        }
        else
        {
            foreach_reverse (d;codePoints(s))
            {
                foreach_reverse (c;codeUnits!(Dst)(d))
                {
                    r = c ~ r;
                }
            }
        }
    }

    // Make sure everything that should be valid, is
    foreach (a;validStrings)
    {
        string s = cast(string) a;
        assert(isValid(s),"Failed to validate: "~makeReadable(s));
    }

    // Make sure everything that shouldn't be valid, isn't
    foreach (a;invalidStrings)
    {
        string s = cast(string) a;
        assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
    }

    // Make sure we can sanitize everything bad
    assert(invalidStrings.length == sanitizedStrings.length);
    foreach (i, invalid; invalidStrings)
    {
        string s = cast(string) invalid;
        string t = sanitize(s);
        assert(isValid(t));
        assert(t == sanitizedStrings[i]);
        // Every sanitized string also takes part in the round trips below.
        ubyte[] u = cast(ubyte[]) t;
        validStrings ~= u;
    }

    // Make sure all transcodings work in both directions, using both forward
    // and reverse iteration
    foreach (a; validStrings)
    {
        string s = cast(string) a;
        string s2;
        wstring ws, ws2;
        dstring ds, ds2;

        transcode(s,ws);
        assert(isValid(ws));
        transcode(ws,s2);
        assert(s == s2);

        transcode(s,ds);
        assert(isValid(ds));
        transcode(ds,s2);
        assert(s == s2);

        transcode(ws,s);
        assert(isValid(s));
        transcode(s,ws2);
        assert(ws == ws2);

        transcode(ws,ds);
        assert(isValid(ds));
        transcode(ds,ws2);
        assert(ws == ws2);

        transcode(ds,s);
        assert(isValid(s));
        transcode(s,ds2);
        assert(ds == ds2);

        transcode(ds,ws);
        assert(isValid(ws));
        transcode(ws,ds2);
        assert(ds == ds2);

        transcodeReverse(s,ws);
        assert(isValid(ws));
        transcodeReverse(ws,s2);
        assert(s == s2);

        transcodeReverse(s,ds);
        assert(isValid(ds));
        transcodeReverse(ds,s2);
        assert(s == s2);

        transcodeReverse(ws,s);
        assert(isValid(s));
        transcodeReverse(s,ws2);
        assert(ws == ws2);

        transcodeReverse(ws,ds);
        assert(isValid(ds));
        transcodeReverse(ds,ws2);
        assert(ws == ws2);

        transcodeReverse(ds,s);
        assert(isValid(s));
        transcodeReverse(s,ds2);
        assert(ds == ds2);

        transcodeReverse(ds,ws);
        assert(isValid(ws));
        transcodeReverse(ws,ds2);
        assert(ds == ds2);
    }

    // Make sure the non-UTF encodings work too
    {
        auto s = "\u20AC100";
        Windows1252String t;
        transcode(s,t);
        assert(t == cast(Windows1252Char[])[0x80, '1', '0', '0']);
        string u;
        transcode(s,u);
        assert(s == u);
        Latin1String v;
        transcode(s,v);
        assert(cast(string) v == "?100");
        AsciiString w;
        transcode(v,w);
        assert(cast(string) w == "?100");
        s = "\u017Dlu\u0165ou\u010Dk\u00FD k\u016F\u0148";
        Latin2String x;
        transcode(s,x);
        assert(x == cast(Latin2Char[])[0xae, 'l', 'u', 0xbb, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        Windows1250String y;
        transcode(s,y);
        assert(y == cast(Windows1250Char[])[0x8e, 'l', 'u', 0x9d, 'o', 'u', 0xe8, 'k', 0xfd, ' ', 'k', 0xf9, 0xf2]);
        s = "\u0402lu\u0403ou\u201D\u045C k\u0414\u044F";
        Windows1251String s51;
        transcode(s,s51);
        assert(s51 == cast(Windows1251Char[])[0x80, 'l', 'u', 0x81, 'o', 'u', 0x94, 0x9d, ' ', 'k', 0xc4, 0xff]);
    }

    // Make sure we can count properly
    {
        assert(encodedLength!(char)('A') == 1);
        assert(encodedLength!(char)('\u00E3') == 2);
        assert(encodedLength!(char)('\u2028') == 3);
        assert(encodedLength!(char)('\U0010FFF0') == 4);
        assert(encodedLength!(wchar)('A') == 1);
        assert(encodedLength!(wchar)('\U0010FFF0') == 2);
    }

    // Make sure we can write into mutable arrays
    {
        char[4] buffer;
        auto n = encode(cast(dchar)'\u00E3',buffer);
        assert(n == 2);
        assert(buffer[0] == 0xC3);
        assert(buffer[1] == 0xA3);
    }
}
477
478 //=============================================================================
479
/**
 * Sentinel produced by `safeDecode` in place of a decoded character when the
 * input could not be decoded. Its value, `0xFFFFFFFF`, lies above the last
 * Unicode code point (U+10FFFF).
 */
enum dchar INVALID_SEQUENCE = cast(dchar) uint.max;
482
/*
 * Boilerplate mixed into each encoder. The host scope has to supply the
 * code unit type `E` together with the primitives `encodeViaWrite`,
 * `skipViaRead`, `decodeViaRead`, `safeDecodeViaRead` and
 * `decodeReverseViaRead`; those primitives are written against abstract
 * `read`/`peek`/`canRead`/`write` operations, and the nested templates below
 * bind them to a concrete source or destination.
 */
template EncoderFunctions()
{
    // ---- Sources: consume code units from the slice `s` in scope ----

    // Front-to-back consumption.
    template ReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[0];
        }

        E read() @safe pure @nogc nothrow
        {
            E head = s[0];
            s = s[1..$];
            return head;
        }
    }

    // Back-to-front consumption.
    template ReverseReadFromString()
    {
        @property bool canRead()
        {
            return s.length != 0;
        }

        E peek() @safe pure @nogc nothrow
        {
            return s[$-1];
        }

        E read() @safe pure @nogc nothrow
        {
            E tail = s[$-1];
            s = s[0..$-1];
            return tail;
        }
    }

    // ---- Sinks: where encoded code units end up ----

    // Appends to a freshly declared buffer `s`.
    template WriteToString()
    {
        E[] s;

        void write(E c) @safe pure nothrow
        {
            s ~= c;
        }
    }

    // Stores into the front of `array` and then shrinks it by one element.
    template WriteToArray()
    {
        void write(E c) @safe pure @nogc nothrow
        {
            array[0] = c;
            array = array[1..$];
        }
    }

    // Forwards each code unit to the delegate `dg`.
    template WriteToDelegate()
    {
        void write(E c)
        {
            dg(c);
        }
    }

    // ---- Adapters giving the host's primitives their exported names ----

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;

        void encode(dchar c)
        {
            encodeViaWrite(c);
        }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;

        void skip() @safe pure @nogc nothrow
        {
            skipViaRead();
        }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;

        dchar decode() @safe pure @nogc nothrow
        {
            return decodeViaRead();
        }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;

        dchar safeDecode() @safe pure @nogc nothrow
        {
            return safeDecodeViaRead();
        }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;

        dchar decodeReverse() @safe pure @nogc nothrow
        {
            return decodeReverseViaRead();
        }
    }

    // ---- Sink + encoder combinations ----

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // ---- Source + decoder combinations ----

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================

    // Entry points ultimately exposed to the user. Each one mixes the
    // matching combination into its own body, so the mixed-in code operates
    // directly on that function's parameters (`s`, `array`, `dg`).

    // Encodes c and returns the resulting code units as a new array.
    E[] encode(dchar c) @safe pure nothrow
    {
        mixin EncodeToString impl;
        impl.encode(c);
        return impl.s;
    }

    // Encodes c into the front of array; array is advanced past what was
    // written.
    void encode(dchar c, ref E[] array) @safe pure nothrow
    {
        mixin EncodeToArray impl;
        impl.encode(c);
    }

    // Encodes c, passing every code unit to dg.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate impl;
        impl.encode(c);
    }

    // Advances s past its first encoded character.
    void skip(ref const(E)[] s) @safe pure nothrow
    {
        mixin SkipFromString impl;
        impl.skip();
    }

    // Decodes and removes the first character of s.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString impl;
        return impl.decode();
    }

    // Same as decode, but goes through the host's safeDecodeViaRead.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString impl;
        return impl.safeDecode();
    }

    // Decodes and removes the last character of s.
    dchar decodeReverse(ref const(E)[] s) @safe pure nothrow
    {
        mixin DecodeReverseFromString impl;
        return impl.decodeReverse();
    }
}
644
645 //=========================================================================
646
/*
 * foreach-able view over the code points of an encoded string.
 * Iterating consumes the stored slice `s`: forward iteration decodes from
 * the front, reverse iteration from the back.
 */
struct CodePoints(E)
{
    const(E)[] s;

    // The input is required (by contract) to be valid for its encoding.
    this(const(E)[] s)
    in
    {
        assert(isValid(s));
    }
    do
    {
        this.s = s;
    }

    // foreach (dchar c; ...)
    int opApply(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar point = decode(s);
            if (auto rc = dg(point))
                return rc;
        }
        return 0;
    }

    // foreach (size_t i, dchar c; ...) - i is the code unit offset at which
    // the current code point starts.
    int opApply(scope int delegate(ref size_t, ref dchar) dg)
    {
        size_t offset = 0;
        while (s.length != 0)
        {
            immutable before = s.length;
            dchar point = decode(s);
            // Pass a copy so the delegate cannot disturb our running offset.
            size_t shown = offset;
            if (auto rc = dg(shown, point))
                return rc;
            offset += before - s.length;
        }
        return 0;
    }

    // foreach_reverse (dchar c; ...)
    int opApplyReverse(scope int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar point = decodeReverse(s);
            if (auto rc = dg(point))
                return rc;
        }
        return 0;
    }

    // foreach_reverse (size_t i, dchar c; ...) - i is the number of code
    // units left in front of the code point just decoded.
    int opApplyReverse(scope int delegate(ref size_t, ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar point = decodeReverse(s);
            size_t remaining = s.length;
            if (auto rc = dg(remaining, point))
                return rc;
        }
        return 0;
    }
}
714
/*
 * foreach-able view over the code units that encode a single code point.
 * The code point is encoded once, in the constructor, and kept in `s`.
 */
struct CodeUnits(E)
{
    E[] s;

    // The code point is required (by contract) to be valid.
    this(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    do
    {
        s = encode!(E)(d);
    }

    // foreach (E c; ...) - first code unit to last.
    int opApply(scope int delegate(ref E) dg)
    {
        foreach (E unit; s)
        {
            if (auto rc = dg(unit))
                return rc;
        }
        return 0;
    }

    // foreach_reverse (E c; ...) - last code unit to first.
    int opApplyReverse(scope int delegate(ref E) dg)
    {
        foreach_reverse (E unit; s)
        {
            if (auto rc = dg(unit))
                return rc;
        }
        return 0;
    }
}
751
752 //=============================================================================
753
// Fallback for code unit types without a dedicated specialization:
// instantiating it fails at compile time with a message naming the type.
template EncoderInstance(E)
{
    static assert(false, "Cannot instantiate EncoderInstance for type " ~ E.stringof);
}
759
/*
 * Table-driven implementation shared by the single-byte encodings.
 *
 * The host scope must provide:
 *   E, EString       - code unit type and string type;
 *   m_charMapStart,
 *   m_charMapEnd     - inclusive range of code units translated via charMap;
 *   charMap          - code point for each code unit of that range, with
 *                      0xFFFD marking an unassigned code unit;
 *   bstMap           - (code point, code unit) pairs laid out as a binary
 *                      search tree in an array: the children of node i are
 *                      at 2*i + 1 (smaller keys) and 2*i + 2 (larger keys).
 */
private template GenericEncoder()
{
    // True if c has a code unit in this encoding.
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        // Outside the table range, code points below 0x100 map to themselves.
        immutable identityMapped =
            c < m_charMapStart || (c > m_charMapEnd && c < 0x100);
        if (identityMapped)
            return true;
        if (c >= 0xFFFD)
            return false;

        for (size_t node = 0; node < bstMap.length; )
        {
            immutable key = bstMap[node][0];
            if (key == c)
                return true;
            node = 2 * node + (key > c ? 1 : 2); // descend left or right
        }
        return false;
    }

    // True unless c is a table entry marked as unassigned (0xFFFD).
    bool isValidCodeUnit(E c) @safe pure @nogc nothrow
    {
        if (c >= m_charMapStart && c <= m_charMapEnd)
            return charMap[c-m_charMapStart] != 0xFFFD;
        return true;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    // Writes the code unit for c, or '?' if c cannot be encoded.
    void encodeViaWrite()(dchar c)
    {
        immutable identityMapped =
            c < m_charMapStart || (c > m_charMapEnd && c < 0x100);
        if (!identityMapped)
        {
            if (c < 0xFFFD)
            {
                for (size_t node = 0; node < bstMap.length; )
                {
                    immutable key = bstMap[node][0];
                    if (key == c)
                    {
                        write(cast(E) bstMap[node][1]);
                        return;
                    }
                    node = 2 * node + (key > c ? 1 : 2); // descend left or right
                }
            }
            // Not found in the tree, or >= 0xFFFD: substitute.
            c = '?';
        }
        write(cast(E) c);
    }

    // One character is always one code unit.
    void skipViaRead()()
    {
        read();
    }

    // Reads one code unit and translates it through charMap when it lies
    // inside the table range; otherwise the code unit is its own code point.
    dchar decodeViaRead()()
    {
        E unit = read();
        immutable inTable = unit >= m_charMapStart && unit <= m_charMapEnd;
        return inTable ? charMap[unit-m_charMapStart] : unit;
    }

    // Like decodeViaRead, but reports unassigned code units (0xFFFD in
    // charMap) as INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable E unit = read();
        immutable inTable = unit >= m_charMapStart && unit <= m_charMapEnd;
        immutable dchar decoded = inTable ? charMap[unit-m_charMapStart] : unit;
        return decoded == 0xFFFD ? INVALID_SEQUENCE : decoded;
    }

    // Same translation as decodeViaRead; the direction is determined by the
    // `read` that is in scope.
    dchar decodeReverseViaRead()()
    {
        E unit = read();
        immutable inTable = unit >= m_charMapStart && unit <= m_charMapEnd;
        return inTable ? charMap[unit-m_charMapStart] : unit;
    }

    // The string "?" expressed in this encoding.
    @property EString replacementSequence() @safe pure @nogc nothrow
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
845
846 //=============================================================================
847 // ASCII
848 //=============================================================================
849
/** Defines an ASCII-encoded character (a distinct type backed by `ubyte`). */
enum AsciiChar : ubyte
{
    _init
}

/// Defines an ASCII-encoded string (as an array of `immutable(AsciiChar)`).
alias AsciiString = immutable(AsciiChar)[];
854
// Encoder for ASCII: code points below 0x80 map one-to-one onto code units;
// anything else is written as '?'.
template EncoderInstance(CharType : AsciiChar)
{
    alias E = AsciiChar;
    alias EString = AsciiString;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    // Only the 7-bit range is representable.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    // Code units with the high bit set are not ASCII.
    bool isValidCodeUnit(AsciiChar c) @safe pure nothrow @nogc
    {
        return c <= 0x7F;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    // Writes c (or '?' when c is not encodable) to anything with a
    // write(AsciiChar) method.
    void encodeX(Range)(dchar c, Range r)
    {
        r.write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    // Writes c (or '?' when c is not encodable) via the `write` in scope.
    void encodeViaWrite()(dchar c)
    {
        write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    // One character is always one code unit.
    void skipViaRead()()
    {
        read();
    }

    // The code unit is returned unchanged as the code point.
    dchar decodeViaRead()()
    {
        return read();
    }

    // As decodeViaRead, but code units outside ASCII yield INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!canEncode(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    // Same as decodeViaRead; the direction comes from the `read` in scope.
    dchar decodeReverseViaRead()()
    {
        return read();
    }

    // The string "?" expressed in this encoding.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
925
926 //=============================================================================
927 // ISO-8859-1
928 //=============================================================================
929
/** Defines a Latin1-encoded character (a distinct type backed by `ubyte`). */
enum Latin1Char : ubyte
{
    _init
}

/**
Defines a Latin1-encoded string (as an array of $(D
immutable(Latin1Char))).
 */
alias Latin1String = immutable(Latin1Char)[];
937
// Encoder for ISO-8859-1: code points below 0x100 map one-to-one onto code
// units; anything else is written as '?'.
template EncoderInstance(CharType : Latin1Char)
{
    alias E = Latin1Char;
    alias EString = Latin1String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    // Exactly the code points that fit in one byte are representable.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return c <= 0xFF;
    }

    // Every code unit is accepted.
    bool isValidCodeUnit(Latin1Char c) @safe pure nothrow @nogc
    {
        return true;
    }

    // Every encodable code point occupies exactly one code unit.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    // Writes c (or '?' when c is not encodable) via the `write` in scope.
    void encodeViaWrite()(dchar c)
    {
        write(cast(Latin1Char)(canEncode(c) ? c : '?'));
    }

    // One character is always one code unit.
    void skipViaRead()()
    {
        read();
    }

    // The code unit is returned unchanged as the code point.
    dchar decodeViaRead()()
    {
        return read();
    }

    // No code unit is rejected, so this is identical to decodeViaRead.
    dchar safeDecodeViaRead()()
    {
        return read();
    }

    // Same as decodeViaRead; the direction comes from the `read` in scope.
    dchar decodeReverseViaRead()()
    {
        return read();
    }

    // The string "?" expressed in this encoding.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
1001
1002 //=============================================================================
1003 // ISO-8859-2
1004 //=============================================================================
1005
/// Defines a Latin2-encoded character (a distinct type backed by `ubyte`).
enum Latin2Char : ubyte
{
    _init
}

/**
 * Defines a Latin2-encoded string (as an array of $(D
 * immutable(Latin2Char))).
 */
alias Latin2String = immutable(Latin2Char)[];
1014
// Encoder for ISO-8859-2. Supplies the name, the table range and the two
// lookup tables; all encoding/decoding logic comes from GenericEncoder,
// mixed in at the end.
private template EncoderInstance(CharType : Latin2Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Latin2Char;
    alias EString = Latin2String;

    @property string encodingName() @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // Inclusive range of code units that are translated through charMap.
    private static immutable dchar m_charMapStart = 0xa1;
    private static immutable dchar m_charMapEnd = 0xff;

    // Decoding table: entry [unit - m_charMapStart] is the code point for
    // code unit `unit` (one entry for each of 0xA1 .. 0xFF).
    private immutable wstring charMap =
        "\u0104\u02D8\u0141\u00A4\u013D\u015A\u00A7\u00A8"~
        "\u0160\u015E\u0164\u0179\u00AD\u017D\u017B\u00B0"~
        "\u0105\u02DB\u0142\u00B4\u013E\u015B\u02C7\u00B8"~
        "\u0161\u015F\u0165\u017A\u02DD\u017E\u017C\u0154"~
        "\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7\u010C"~
        "\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E\u0110"~
        "\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7\u0158"~
        "\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF\u0155"~
        "\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7\u010D"~
        "\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F\u0111"~
        "\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7\u0159"~
        "\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";

    // Encoding table: (code point, code unit) pairs. GenericEncoder searches
    // it as a binary search tree stored in an array, stepping from index i
    // to 2*i + 1 or 2*i + 2, so the ORDER of the entries is significant and
    // must not be changed (e.g. by sorting).
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0148','\xF2'), tuple('\u00F3','\xF3'), tuple('\u0165','\xBB'),
        tuple('\u00D3','\xD3'), tuple('\u010F','\xEF'), tuple('\u015B','\xB6'),
        tuple('\u017C','\xBF'), tuple('\u00C1','\xC1'), tuple('\u00E1','\xE1'),
        tuple('\u0103','\xE3'), tuple('\u013A','\xE5'), tuple('\u0155','\xE0'),
        tuple('\u0161','\xB9'), tuple('\u0171','\xFB'), tuple('\u02D8','\xA2'),
        tuple('\u00AD','\xAD'), tuple('\u00C9','\xC9'), tuple('\u00DA','\xDA'),
        tuple('\u00E9','\xE9'), tuple('\u00FA','\xFA'), tuple('\u0107','\xE6'),
        tuple('\u0119','\xEA'), tuple('\u0142','\xB3'), tuple('\u0151','\xF5'),
        tuple('\u0159','\xF8'), tuple('\u015F','\xBA'), tuple('\u0163','\xFE'),
        tuple('\u016F','\xF9'), tuple('\u017A','\xBC'), tuple('\u017E','\xBE'),
        tuple('\u02DB','\xB2'), tuple('\u00A7','\xA7'), tuple('\u00B4','\xB4'),
        tuple('\u00C4','\xC4'), tuple('\u00CD','\xCD'), tuple('\u00D6','\xD6'),
        tuple('\u00DD','\xDD'), tuple('\u00E4','\xE4'), tuple('\u00ED','\xED'),
        tuple('\u00F6','\xF6'), tuple('\u00FD','\xFD'), tuple('\u0105','\xB1'),
        tuple('\u010D','\xE8'), tuple('\u0111','\xF0'), tuple('\u011B','\xEC'),
        tuple('\u013E','\xB5'), tuple('\u0144','\xF1'), tuple('\u0150','\xD5'),
        tuple('\u0154','\xC0'), tuple('\u0158','\xD8'), tuple('\u015A','\xA6'),
        tuple('\u015E','\xAA'), tuple('\u0160','\xA9'), tuple('\u0162','\xDE'),
        tuple('\u0164','\xAB'), tuple('\u016E','\xD9'), tuple('\u0170','\xDB'),
        tuple('\u0179','\xAC'), tuple('\u017B','\xAF'), tuple('\u017D','\xAE'),
        tuple('\u02C7','\xB7'), tuple('\u02D9','\xFF'), tuple('\u02DD','\xBD'),
        tuple('\u00A4','\xA4'), tuple('\u00A8','\xA8'), tuple('\u00B0','\xB0'),
        tuple('\u00B8','\xB8'), tuple('\u00C2','\xC2'), tuple('\u00C7','\xC7'),
        tuple('\u00CB','\xCB'), tuple('\u00CE','\xCE'), tuple('\u00D4','\xD4'),
        tuple('\u00D7','\xD7'), tuple('\u00DC','\xDC'), tuple('\u00DF','\xDF'),
        tuple('\u00E2','\xE2'), tuple('\u00E7','\xE7'), tuple('\u00EB','\xEB'),
        tuple('\u00EE','\xEE'), tuple('\u00F4','\xF4'), tuple('\u00F7','\xF7'),
        tuple('\u00FC','\xFC'), tuple('\u0102','\xC3'), tuple('\u0104','\xA1'),
        tuple('\u0106','\xC6'), tuple('\u010C','\xC8'), tuple('\u010E','\xCF'),
        tuple('\u0110','\xD0'), tuple('\u0118','\xCA'), tuple('\u011A','\xCC'),
        tuple('\u0139','\xC5'), tuple('\u013D','\xA5'), tuple('\u0141','\xA3'),
        tuple('\u0143','\xD1'), tuple('\u0147','\xD2')
    ];

    mixin GenericEncoder!();
}
1081
1082 //=============================================================================
1083 // WINDOWS-1250
1084 //=============================================================================
1085
/// Defines a Windows1250-encoded character (a distinct type backed by `ubyte`).
enum Windows1250Char : ubyte
{
    _init
}

/**
 * Defines a Windows1250-encoded string (as an array of $(D
 * immutable(Windows1250Char))).
 */
alias Windows1250String = immutable(Windows1250Char)[];
1094
1095 private template EncoderInstance(CharType : Windows1250Char)
1096 {
1097 import std.typecons : Tuple, tuple;
1098
1099 alias E = Windows1250Char;
1100 alias EString = Windows1250String;
1101
1102 @property string encodingName() @safe pure nothrow @nogc
1103 {
1104 return "windows-1250";
1105 }
1106
1107 private static immutable dchar m_charMapStart = 0x80;
1108 private static immutable dchar m_charMapEnd = 0xff;
1109
1110 private immutable wstring charMap =
1111 "\u20AC\uFFFD\u201A\uFFFD\u201E\u2026\u2020\u2021"~
1112 "\uFFFD\u2030\u0160\u2039\u015A\u0164\u017D\u0179"~
1113 "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
1114 "\uFFFD\u2122\u0161\u203A\u015B\u0165\u017E\u017A"~
1115 "\u00A0\u02C7\u02D8\u0141\u00A4\u0104\u00A6\u00A7"~
1116 "\u00A8\u00A9\u015E\u00AB\u00AC\u00AD\u00AE\u017B"~
1117 "\u00B0\u00B1\u02DB\u0142\u00B4\u00B5\u00B6\u00B7"~
1118 "\u00B8\u0105\u015F\u00BB\u013D\u02DD\u013E\u017C"~
1119 "\u0154\u00C1\u00C2\u0102\u00C4\u0139\u0106\u00C7"~
1120 "\u010C\u00C9\u0118\u00CB\u011A\u00CD\u00CE\u010E"~
1121 "\u0110\u0143\u0147\u00D3\u00D4\u0150\u00D6\u00D7"~
1122 "\u0158\u016E\u00DA\u0170\u00DC\u00DD\u0162\u00DF"~
1123 "\u0155\u00E1\u00E2\u0103\u00E4\u013A\u0107\u00E7"~
1124 "\u010D\u00E9\u0119\u00EB\u011B\u00ED\u00EE\u010F"~
1125 "\u0111\u0144\u0148\u00F3\u00F4\u0151\u00F6\u00F7"~
1126 "\u0159\u016F\u00FA\u0171\u00FC\u00FD\u0163\u02D9";
1127
1128 private immutable Tuple!(wchar, char)[] bstMap = [
1129 tuple('\u011A','\xCC'), tuple('\u00DC','\xDC'), tuple('\u0179','\x8F'),
1130 tuple('\u00B7','\xB7'), tuple('\u00FC','\xFC'), tuple('\u0158','\xD8'),
1131 tuple('\u201C','\x93'), tuple('\u00AC','\xAC'), tuple('\u00CB','\xCB'),
1132 tuple('\u00EB','\xEB'), tuple('\u010C','\xC8'), tuple('\u0143','\xD1'),
1133 tuple('\u0162','\xDE'), tuple('\u02D9','\xFF'), tuple('\u2039','\x8B'),
1134 tuple('\u00A7','\xA7'), tuple('\u00B1','\xB1'), tuple('\u00C2','\xC2'),
1135 tuple('\u00D4','\xD4'), tuple('\u00E2','\xE2'), tuple('\u00F4','\xF4'),
1136 tuple('\u0104','\xA5'), tuple('\u0110','\xD0'), tuple('\u013D','\xBC'),
1137 tuple('\u0150','\xD5'), tuple('\u015E','\xAA'), tuple('\u016E','\xD9'),
1138 tuple('\u017D','\x8E'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
1139 tuple('\u20AC','\x80'), tuple('\u00A4','\xA4'), tuple('\u00A9','\xA9'),
1140 tuple('\u00AE','\xAE'), tuple('\u00B5','\xB5'), tuple('\u00BB','\xBB'),
1141 tuple('\u00C7','\xC7'), tuple('\u00CE','\xCE'), tuple('\u00D7','\xD7'),
1142 tuple('\u00DF','\xDF'), tuple('\u00E7','\xE7'), tuple('\u00EE','\xEE'),
1143 tuple('\u00F7','\xF7'), tuple('\u0102','\xC3'), tuple('\u0106','\xC6'),
1144 tuple('\u010E','\xCF'), tuple('\u0118','\xCA'), tuple('\u0139','\xC5'),
1145 tuple('\u0141','\xA3'), tuple('\u0147','\xD2'), tuple('\u0154','\xC0'),
1146 tuple('\u015A','\x8C'), tuple('\u0160','\x8A'), tuple('\u0164','\x8D'),
1147 tuple('\u0170','\xDB'), tuple('\u017B','\xAF'), tuple('\u02C7','\xA1'),
1148 tuple('\u02DD','\xBD'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
1149 tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
1150 tuple('\u00A0','\xA0'), tuple('\u00A6','\xA6'), tuple('\u00A8','\xA8'),
1151 tuple('\u00AB','\xAB'), tuple('\u00AD','\xAD'), tuple('\u00B0','\xB0'),
1152 tuple('\u00B4','\xB4'), tuple('\u00B6','\xB6'), tuple('\u00B8','\xB8'),
1153 tuple('\u00C1','\xC1'), tuple('\u00C4','\xC4'), tuple('\u00C9','\xC9'),
1154 tuple('\u00CD','\xCD'), tuple('\u00D3','\xD3'), tuple('\u00D6','\xD6'),
1155 tuple('\u00DA','\xDA'), tuple('\u00DD','\xDD'), tuple('\u00E1','\xE1'),
1156 tuple('\u00E4','\xE4'), tuple('\u00E9','\xE9'), tuple('\u00ED','\xED'),
1157 tuple('\u00F3','\xF3'), tuple('\u00F6','\xF6'), tuple('\u00FA','\xFA'),
1158 tuple('\u00FD','\xFD'), tuple('\u0103','\xE3'), tuple('\u0105','\xB9'),
1159 tuple('\u0107','\xE6'), tuple('\u010D','\xE8'), tuple('\u010F','\xEF'),
1160 tuple('\u0111','\xF0'), tuple('\u0119','\xEA'), tuple('\u011B','\xEC'),
1161 tuple('\u013A','\xE5'), tuple('\u013E','\xBE'), tuple('\u0142','\xB3'),
1162 tuple('\u0144','\xF1'), tuple('\u0148','\xF2'), tuple('\u0151','\xF5'),
1163 tuple('\u0155','\xE0'), tuple('\u0159','\xF8'), tuple('\u015B','\x9C'),
1164 tuple('\u015F','\xBA'), tuple('\u0161','\x9A'), tuple('\u0163','\xFE'),
1165 tuple('\u0165','\x9D'), tuple('\u016F','\xF9'), tuple('\u0171','\xFB'),
1166 tuple('\u017A','\x9F'), tuple('\u017C','\xBF'), tuple('\u017E','\x9E'),
1167 tuple('\u02D8','\xA2'), tuple('\u02DB','\xB2'), tuple('\u2013','\x96'),
1168 tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
1169 tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
1170 ];
1171
1172 mixin GenericEncoder!();
1173 }
1174
1175 //=============================================================================
1176 // WINDOWS-1251
1177 //=============================================================================
1178
/**
 * Defines a Windows1251-encoded character.
 *
 * The type is an enum backed by `ubyte`, so each value occupies exactly one
 * byte. `_init` is its only named member.
 */
enum Windows1251Char : ubyte { _init }

/**
 * Defines a Windows1251-encoded string (as an array of $(D
 * immutable(Windows1251Char))).
 */
alias Windows1251String = immutable(Windows1251Char)[];
1187
/*
 * Encoder configuration for WINDOWS-1251.
 *
 * This template only supplies data: the code unit type, the encoding name and
 * two lookup tables. The encode/decode routines themselves are presumably
 * provided by the `GenericEncoder` mixin at the end of the template, which is
 * defined elsewhere in this file.
 */
private template EncoderInstance(CharType : Windows1251Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1251Char;
    alias EString = Windows1251String;

    // Name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1251";
    }

    // Inclusive range of code units described by `charMap`
    // (0x80 .. 0xFF is 128 values, matching the 128 entries below).
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0xff;

    // Code unit -> code point table; entry i belongs to code unit 0x80 + i.
    // U+FFFD stands in for the one slot (0x98) that has no entry in `bstMap`;
    // the unittests in this file expect that code unit to be invalid.
    private immutable wstring charMap =
        "\u0402\u0403\u201A\u0453\u201E\u2026\u2020\u2021"~
        "\u20AC\u2030\u0409\u2039\u040A\u040C\u040B\u040F"~
        "\u0452\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\uFFFD\u2122\u0459\u203A\u045A\u045C\u045B\u045F"~
        "\u00A0\u040E\u045E\u0408\u00A4\u0490\u00A6\u00A7"~
        "\u0401\u00A9\u0404\u00AB\u00AC\u00AD\u00AE\u0407"~
        "\u00B0\u00B1\u0406\u0456\u0491\u00B5\u00B6\u00B7"~
        "\u0451\u2116\u0454\u00BB\u0458\u0405\u0455\u0457"~
        "\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417"~
        "\u0418\u0419\u041A\u041B\u041C\u041D\u041E\u041F"~
        "\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427"~
        "\u0428\u0429\u042A\u042B\u042C\u042D\u042E\u042F"~
        "\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437"~
        "\u0438\u0439\u043A\u043B\u043C\u043D\u043E\u043F"~
        "\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447"~
        "\u0448\u0449\u044A\u044B\u044C\u044D\u044E\u044F";

    // Code point -> code unit pairs, one per assigned character (127 entries).
    // NOTE(review): the name and the deliberately unsorted order suggest an
    // array-encoded binary search tree walked by GenericEncoder -- confirm
    // before reordering or inserting entries.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u0432','\xE2'),tuple('\u0412','\xC2'),tuple('\u0453','\x83'),
        tuple('\u0401','\xA8'),tuple('\u0422','\xD2'),tuple('\u0442','\xF2'),
        tuple('\u2018','\x91'),tuple('\u00AD','\xAD'),tuple('\u0409','\x8A'),
        tuple('\u041A','\xCA'),tuple('\u042A','\xDA'),tuple('\u043A','\xEA'),
        tuple('\u044A','\xFA'),tuple('\u045B','\x9E'),tuple('\u2022','\x95'),
        tuple('\u00A7','\xA7'),tuple('\u00B5','\xB5'),tuple('\u0405','\xBD'),
        tuple('\u040E','\xA1'),tuple('\u0416','\xC6'),tuple('\u041E','\xCE'),
        tuple('\u0426','\xD6'),tuple('\u042E','\xDE'),tuple('\u0436','\xE6'),
        tuple('\u043E','\xEE'),tuple('\u0446','\xF6'),tuple('\u044E','\xFE'),
        tuple('\u0457','\xBF'),tuple('\u0490','\xA5'),tuple('\u201D','\x94'),
        tuple('\u203A','\x9B'),tuple('\u00A4','\xA4'),tuple('\u00AB','\xAB'),
        tuple('\u00B0','\xB0'),tuple('\u00B7','\xB7'),tuple('\u0403','\x81'),
        tuple('\u0407','\xAF'),tuple('\u040B','\x8E'),tuple('\u0410','\xC0'),
        tuple('\u0414','\xC4'),tuple('\u0418','\xC8'),tuple('\u041C','\xCC'),
        tuple('\u0420','\xD0'),tuple('\u0424','\xD4'),tuple('\u0428','\xD8'),
        tuple('\u042C','\xDC'),tuple('\u0430','\xE0'),tuple('\u0434','\xE4'),
        tuple('\u0438','\xE8'),tuple('\u043C','\xEC'),tuple('\u0440','\xF0'),
        tuple('\u0444','\xF4'),tuple('\u0448','\xF8'),tuple('\u044C','\xFC'),
        tuple('\u0451','\xB8'),tuple('\u0455','\xBE'),tuple('\u0459','\x9A'),
        tuple('\u045E','\xA2'),tuple('\u2013','\x96'),tuple('\u201A','\x82'),
        tuple('\u2020','\x86'),tuple('\u2030','\x89'),tuple('\u2116','\xB9'),
        tuple('\u00A0','\xA0'),tuple('\u00A6','\xA6'),tuple('\u00A9','\xA9'),
        tuple('\u00AC','\xAC'),tuple('\u00AE','\xAE'),tuple('\u00B1','\xB1'),
        tuple('\u00B6','\xB6'),tuple('\u00BB','\xBB'),tuple('\u0402','\x80'),
        tuple('\u0404','\xAA'),tuple('\u0406','\xB2'),tuple('\u0408','\xA3'),
        tuple('\u040A','\x8C'),tuple('\u040C','\x8D'),tuple('\u040F','\x8F'),
        tuple('\u0411','\xC1'),tuple('\u0413','\xC3'),tuple('\u0415','\xC5'),
        tuple('\u0417','\xC7'),tuple('\u0419','\xC9'),tuple('\u041B','\xCB'),
        tuple('\u041D','\xCD'),tuple('\u041F','\xCF'),tuple('\u0421','\xD1'),
        tuple('\u0423','\xD3'),tuple('\u0425','\xD5'),tuple('\u0427','\xD7'),
        tuple('\u0429','\xD9'),tuple('\u042B','\xDB'),tuple('\u042D','\xDD'),
        tuple('\u042F','\xDF'),tuple('\u0431','\xE1'),tuple('\u0433','\xE3'),
        tuple('\u0435','\xE5'),tuple('\u0437','\xE7'),tuple('\u0439','\xE9'),
        tuple('\u043B','\xEB'),tuple('\u043D','\xED'),tuple('\u043F','\xEF'),
        tuple('\u0441','\xF1'),tuple('\u0443','\xF3'),tuple('\u0445','\xF5'),
        tuple('\u0447','\xF7'),tuple('\u0449','\xF9'),tuple('\u044B','\xFB'),
        tuple('\u044D','\xFD'),tuple('\u044F','\xFF'),tuple('\u0452','\x90'),
        tuple('\u0454','\xBA'),tuple('\u0456','\xB3'),tuple('\u0458','\xBC'),
        tuple('\u045A','\x9C'),tuple('\u045C','\x9D'),tuple('\u045F','\x9F'),
        tuple('\u0491','\xB4'),tuple('\u2014','\x97'),tuple('\u2019','\x92'),
        tuple('\u201C','\x93'),tuple('\u201E','\x84'),tuple('\u2021','\x87'),
        tuple('\u2026','\x85'),tuple('\u2039','\x8B'),tuple('\u20AC','\x88'),
        tuple('\u2122','\x99')
    ];

    mixin GenericEncoder!();
}
1269
1270 //=============================================================================
1271 // WINDOWS-1252
1272 //=============================================================================
1273
/**
 * Defines a Windows1252-encoded character.
 *
 * The type is an enum backed by `ubyte`, so each value occupies exactly one
 * byte. `_init` is its only named member.
 */
enum Windows1252Char : ubyte { _init }

/**
 * Defines a Windows1252-encoded string (as an array of $(D
 * immutable(Windows1252Char))).
 */
alias Windows1252String = immutable(Windows1252Char)[];
1282
/*
 * Encoder configuration for WINDOWS-1252.
 *
 * This template only supplies data: the code unit type, the encoding name and
 * two lookup tables. The encode/decode routines themselves are presumably
 * provided by the `GenericEncoder` mixin at the end of the template, which is
 * defined elsewhere in this file.
 */
template EncoderInstance(CharType : Windows1252Char)
{
    import std.typecons : Tuple, tuple;

    alias E = Windows1252Char;
    alias EString = Windows1252String;

    // Name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // Inclusive range of code units described by `charMap`
    // (0x80 .. 0x9F is 32 values, matching the 32 entries below).
    private static immutable dchar m_charMapStart = 0x80;
    private static immutable dchar m_charMapEnd = 0x9f;

    // Code unit -> code point table; entry i belongs to code unit 0x80 + i.
    // U+FFFD stands in for the five slots (0x81, 0x8D, 0x8F, 0x90, 0x9D) that
    // have no entry in `bstMap`.
    private immutable wstring charMap =
        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"~
        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"~
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"~
        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";

    // Code point -> code unit pairs, one per assigned character (27 entries).
    // NOTE(review): the name and the deliberately unsorted order suggest an
    // array-encoded binary search tree walked by GenericEncoder -- confirm
    // before reordering or inserting entries.
    private immutable Tuple!(wchar, char)[] bstMap = [
        tuple('\u201C','\x93'), tuple('\u0192','\x83'), tuple('\u2039','\x8B'),
        tuple('\u0161','\x9A'), tuple('\u2014','\x97'), tuple('\u2021','\x87'),
        tuple('\u20AC','\x80'), tuple('\u0153','\x9C'), tuple('\u017D','\x8E'),
        tuple('\u02DC','\x98'), tuple('\u2019','\x92'), tuple('\u201E','\x84'),
        tuple('\u2026','\x85'), tuple('\u203A','\x9B'), tuple('\u2122','\x99'),
        tuple('\u0152','\x8C'), tuple('\u0160','\x8A'), tuple('\u0178','\x9F'),
        tuple('\u017E','\x9E'), tuple('\u02C6','\x88'), tuple('\u2013','\x96'),
        tuple('\u2018','\x91'), tuple('\u201A','\x82'), tuple('\u201D','\x94'),
        tuple('\u2020','\x86'), tuple('\u2022','\x95'), tuple('\u2030','\x89')
    ];

    mixin GenericEncoder!();
}
1318
1319 //=============================================================================
1320 // UTF-8
1321 //=============================================================================
1322
/*
 * Encoder for UTF-8, selected when the code unit type is `char`.
 *
 * The `*ViaRead` / `*ViaWrite` members are written against `read`, `peek`,
 * `canRead` and `write`, none of which are defined in this template; they are
 * presumably supplied by the code mixed in through `EncoderFunctions`.
 */
template EncoderInstance(CharType : char)
{
    alias E = char;
    alias EString = immutable(char)[];

    // Name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    // A code point is encodable exactly when it is a valid code point.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Accepts every byte except 0xC0, 0xC1 and 0xF5 .. 0xFF.
    bool isValidCodeUnit(char c) @safe pure nothrow @nogc
    {
        if (c < 0xC0)
            return true;
        return c >= 0xC2 && c <= 0xF4;
    }

    // Number of trailing bytes announced by a byte, indexed by (byte - 0x80).
    immutable ubyte[128] tailTable =
    [
        // 0x80 .. 0xBF
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        // 0xC0 .. 0xDF
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        // 0xE0 .. 0xEF
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        // 0xF0 .. 0xFF
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
    ];

    // Looks up `tailTable` for a byte that is at least 0x80.
    private int tails(char c) @safe pure nothrow @nogc
    in
    {
        assert(c >= 0x80);
    }
    do
    {
        immutable slot = c - 0x80;
        return tailTable[slot];
    }

    // Number of UTF-8 code units (1 to 4) needed for `c`.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return c < 0x80    ? 1
             : c < 0x800   ? 2
             : c < 0x10000 ? 3
             :               4;
    }

    // Emits the UTF-8 form of `c` through `write`, lead byte first.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            // one byte
            write(cast(char) c);
            return;
        }

        if (c < 0x800)
        {
            // two bytes
            write(cast(char)((c >> 6) + 0xC0));
            write(cast(char)((c & 0x3F) + 0x80));
            return;
        }

        if (c < 0x10000)
        {
            // three bytes
            write(cast(char)((c >> 12) + 0xE0));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
            return;
        }

        // four bytes
        write(cast(char)((c >> 18) + 0xF0));
        write(cast(char)(((c >> 12) & 0x3F) + 0x80));
        write(cast(char)(((c >> 6) & 0x3F) + 0x80));
        write(cast(char)((c & 0x3F) + 0x80));
    }

    // Consumes one encoded sequence without decoding it.
    void skipViaRead()()
    {
        auto lead = read();
        if (lead >= 0xC0)
        {
            for (int remaining = tails(cast(char) lead); remaining > 0; --remaining)
                read();
        }
    }

    // Decodes one sequence; performs no validation of the input.
    dchar decodeViaRead()()
    {
        dchar result = read();
        if (result < 0xC0) return result;

        immutable int trailing = tails(cast(char) result);
        // Keep only the payload bits of the lead byte.
        result &= (1 << (6 - trailing)) - 1;
        for (int remaining = trailing; remaining > 0; --remaining)
            result = (result << 6) + (read() & 0x3F);
        return result;
    }

    // Decodes one sequence, returning INVALID_SEQUENCE for malformed input.
    dchar safeDecodeViaRead()()
    {
        dchar result = read();
        if (result < 0x80) return result;

        immutable int trailing = tails(cast(char) result);
        if (trailing == 0 || !canRead) return INVALID_SEQUENCE;

        // The lead byte plus the first trailing byte are enough to recognise
        // every ill-formed range; the verdict is applied only after the
        // trailing bytes have been consumed.
        size_t next = peek();
        immutable overlong2  = result < 0xC2;                                // overlong 2-byte sequences
        immutable tooLong    = result > 0xF4;                                // overlong 4-6-byte sequences
        immutable overlong3  = result == 0xE0 && ((next & 0xE0) == 0x80);    // overlong 3-byte sequences
        immutable surrogate  = result == 0xED && ((next & 0xE0) == 0xA0);    // surrogates
        immutable overlong4  = result == 0xF0 && ((next & 0xF0) == 0x80);    // overlong 4-byte sequences
        immutable outOfRange = result == 0xF4 && ((next & 0xF0) >= 0x90);    // code points > 0x10FFFF
        immutable malformed =
            overlong2 || tooLong || overlong3 || surrogate || overlong4 || outOfRange;

        result &= (1 << (6 - trailing)) - 1;
        for (int remaining = trailing; remaining > 0; --remaining)
        {
            if (!canRead) return INVALID_SEQUENCE;
            next = peek();
            // Stop in front of anything that is not a 10xxxxxx byte.
            if ((next & 0xC0) != 0x80) return INVALID_SEQUENCE;
            result = (result << 6) + (read() & 0x3F);
        }

        return malformed ? INVALID_SEQUENCE : result;
    }

    // Decodes one sequence whose bytes are delivered last-to-first by `read`.
    dchar decodeReverseViaRead()()
    {
        dchar result = read();
        if (result < 0x80) return result;

        result &= 0x3F;
        // Each further byte contributes bits six positions higher; at most
        // four more bytes are examined.
        for (size_t shift = 6; shift <= 24; shift += 6)
        {
            auto unit = read();
            size_t trailing = tails(cast(char) unit);
            immutable mask = trailing == 0 ? 0x3F : (1 << (6 - trailing)) - 1;
            result += ((unit & mask) << shift);
            // A non-zero tail count marks the lead byte: sequence complete.
            if (trailing != 0) break;
        }
        return result;
    }

    // U+FFFD, encoded as UTF-8.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD";
    }

    mixin EncoderFunctions;
}
1484
1485 //=============================================================================
1486 // UTF-16
1487 //=============================================================================
1488
/*
 * Encoder for UTF-16, selected when the code unit type is `wchar`.
 *
 * The `*ViaRead` / `*ViaWrite` members are written against `read`, `peek`,
 * `canRead` and `write`, none of which are defined in this template; they are
 * presumably supplied by the code mixed in through `EncoderFunctions`.
 */
template EncoderInstance(CharType : wchar)
{
    alias E = wchar;
    alias EString = immutable(wchar)[];

    // Name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-16";
    }

    // A code point is encodable exactly when it is a valid code point.
    bool canEncode(dchar c) @safe pure nothrow @nogc
    {
        return isValidCodePoint(c);
    }

    // Every 16-bit value is accepted as a code unit.
    bool isValidCodeUnit(wchar c) @safe pure nothrow @nogc
    {
        return true;
    }

    // One code unit below U+10000, two (a surrogate pair) otherwise.
    size_t encodedLength(dchar c) @safe pure nothrow @nogc
    in
    {
        assert(canEncode(c));
    }
    do
    {
        if (c < 0x10000)
            return 1;
        return 2;
    }

    // Emits `c` through `write`, as a single unit or as a surrogate pair.
    void encodeViaWrite()(dchar c)
    {
        if (c >= 0x10000)
        {
            size_t offset = c - 0x10000;
            write(cast(wchar)(0xD800 + (offset >> 10)));
            write(cast(wchar)(0xDC00 + (offset & 0x3FF)));
            return;
        }
        write(cast(wchar) c);
    }

    // Consumes one encoded sequence without decoding it.
    void skipViaRead()()
    {
        immutable unit = read();
        immutable isSurrogate = unit >= 0xD800 && unit < 0xE000;
        if (isSurrogate)
            read();
    }

    // Decodes one sequence; performs no validation of the input.
    dchar decodeViaRead()()
    {
        wchar first = read();
        if (!(first >= 0xD800 && first < 0xE000)) return cast(dchar) first;

        wchar second = read();
        first &= 0x3FF;
        second &= 0x3FF;
        return 0x10000 + (first << 10) + second;
    }

    // Decodes one sequence, returning INVALID_SEQUENCE for malformed input.
    dchar safeDecodeViaRead()()
    {
        wchar first = read();
        if (!(first >= 0xD800 && first < 0xE000)) return cast(dchar) first;

        // A unit in 0xDC00 .. 0xDFFF cannot open a pair, and an opening unit
        // needs something after it.
        if (first >= 0xDC00 || !canRead) return INVALID_SEQUENCE;

        // Only consume the second unit if it is in 0xDC00 .. 0xDFFF.
        wchar second = peek();
        if (!(second >= 0xDC00 && second < 0xE000)) return INVALID_SEQUENCE;
        second = read();

        first &= 0x3FF;
        second &= 0x3FF;
        return 0x10000 + (first << 10) + second;
    }

    // Decodes one sequence whose units are delivered last-to-first by `read`.
    dchar decodeReverseViaRead()()
    {
        wchar last = read();
        if (!(last >= 0xD800 && last < 0xE000)) return cast(dchar) last;

        // The unit read second precedes `last` in the string, so it carries
        // the high ten bits.
        wchar previous = read();
        last &= 0x3FF;
        previous &= 0x3FF;
        return 0x10000 + (previous << 10) + last;
    }

    // U+FFFD, encoded as UTF-16.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}
1581
1582 //=============================================================================
1583 // UTF-32
1584 //=============================================================================
1585
/*
 * Encoder for UTF-32, selected when the code unit type is `dchar`.
 *
 * Every sequence is exactly one code unit long, so the members below are thin
 * wrappers around `read` and `write` (which are not defined in this template;
 * presumably supplied through `EncoderFunctions`).
 */
template EncoderInstance(CharType : dchar)
{
    alias E = dchar;
    alias EString = immutable(dchar)[];

    // Name reported for this encoding.
    @property string encodingName() @safe pure nothrow @nogc
    {
        return "UTF-32";
    }

    // A code point is encodable exactly when it is a valid code point.
    bool canEncode(dchar c) @safe pure @nogc nothrow
    {
        return isValidCodePoint(c);
    }

    // A code unit is valid exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c) @safe pure @nogc nothrow
    {
        return isValidCodePoint(c);
    }

    // Always a single code unit.
    size_t encodedLength(dchar c) @safe pure @nogc nothrow
    in
    {
        assert(canEncode(c));
    }
    do
    {
        return 1;
    }

    // Emits `c` unchanged.
    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    // Consumes one code unit.
    void skipViaRead()()
    {
        read();
    }

    // Returns the next code unit as a code point, unchecked.
    dchar decodeViaRead()()
    {
        return cast(dchar) read();
    }

    // Returns the next code unit, or INVALID_SEQUENCE if it is not a valid
    // code point.
    dchar safeDecodeViaRead()()
    {
        immutable unit = read();
        if (!isValidCodePoint(unit))
            return INVALID_SEQUENCE;
        return unit;
    }

    // Same as decodeViaRead: a single unit reads the same in either direction.
    dchar decodeReverseViaRead()()
    {
        return cast(dchar) read();
    }

    // U+FFFD, encoded as UTF-32.
    @property EString replacementSequence() @safe pure nothrow @nogc
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}
1649
1650 //=============================================================================
1651 // Below are forwarding functions which expose the function to the user
1652
/**
 Tests whether `c` is a valid code point.

 The accepted values are U+0000 .. U+D7FF and U+E000 .. U+10FFFF. Note that
 this includes the non-character code points U+FFFE and U+FFFF, since these
 are valid code points (even though they are not valid characters).

 Supersedes:
 This function supersedes `std.utf.startsValidDchar()`.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be tested

 Returns:
    `true` if `c` is a valid code point, `false` otherwise
 */
bool isValidCodePoint(dchar c) @safe pure nothrow @nogc
{
    // Anything from 0x110000 upwards is rejected outright.
    if (c >= 0x110000)
        return false;

    // 0xD800 .. 0xDFFF is the one excluded block below that limit.
    immutable inExcludedBlock = c >= 0xD800 && c < 0xE000;
    return !inExcludedBlock;
}
1673
/**
 Returns the name of an encoding.

 The encoding is selected by the code unit type `T`. Because there is no
 function argument to deduce it from, `T` must be specified explicitly.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252
 */
@property string encodingName(T)()
{
    alias Encoder = EncoderInstance!(T);
    return Encoder.encodingName;
}

///
@safe unittest
{
    // The Unicode encodings are selected by the built-in character types.
    assert(encodingName!char == "UTF-8");
    assert(encodingName!wchar == "UTF-16");
    assert(encodingName!dchar == "UTF-32");

    // The single-byte encodings are selected by their dedicated enum types.
    assert(encodingName!AsciiChar == "ASCII");
    assert(encodingName!Latin1Char == "ISO-8859-1");
    assert(encodingName!Latin2Char == "ISO-8859-2");
    assert(encodingName!Windows1250Char == "windows-1250");
    assert(encodingName!Windows1251Char == "windows-1251");
    assert(encodingName!Windows1252Char == "windows-1252");
}
1701
/**
 Returns true iff the specified code point can be represented in the
 encoding selected by the code unit type `E`.

 Because there is no function argument to deduce it from, `E` must be
 specified explicitly.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be tested
 */
bool canEncode(E)(dchar c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.canEncode(c);
}

///
@safe pure unittest
{
    // Plain letters and the no-break space
    assert( canEncode!Latin1Char('A'));
    assert( canEncode!Latin2Char('A'));
    assert(!canEncode!AsciiChar('\u00A0'));
    assert( canEncode!Latin1Char('\u00A0'));
    assert( canEncode!Latin2Char('\u00A0'));

    // Windows code pages: a mapped character, an unmapped one, and U+FFFD
    assert( canEncode!Windows1250Char('\u20AC'));
    assert(!canEncode!Windows1250Char('\u20AD'));
    assert(!canEncode!Windows1250Char('\uFFFD'));
    assert( canEncode!Windows1251Char('\u0402'));
    assert(!canEncode!Windows1251Char('\u20AD'));
    assert(!canEncode!Windows1251Char('\uFFFD'));
    assert( canEncode!Windows1252Char('\u20AC'));
    assert(!canEncode!Windows1252Char('\u20AD'));
    assert(!canEncode!Windows1252Char('\uFFFD'));

    // Values past U+10FFFF are never encodable
    assert(!canEncode!char(cast(dchar) 0x110000));
}

/// How to check an entire string
@safe pure unittest
{
    import std.algorithm.searching : find;
    import std.utf : byDchar;

    // The search for a non-encodable code point comes back empty.
    assert("The quick brown fox"
        .byDchar
        .find!(x => !canEncode!AsciiChar(x))
        .empty);
}
1748
/**
 Returns true if the code unit is legal in the encoding identified by its
 type. For example, the byte 0x80 would not be legal in ASCII, because ASCII
 code units must always be in the range 0x00 to 0x7F.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.isValidCodeUnit(c);
}

///
@system pure unittest
{
    // UTF-8, UTF-16 and UTF-32
    assert(!isValidCodeUnit(cast(char) 0xC0));
    assert(!isValidCodeUnit(cast(char) 0xFF));
    assert( isValidCodeUnit(cast(wchar) 0xD800));
    assert(!isValidCodeUnit(cast(dchar) 0xD800));

    // Single-byte encodings
    assert(!isValidCodeUnit(cast(AsciiChar) 0xA0));
    assert( isValidCodeUnit(cast(Windows1250Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1250Char) 0x81));
    assert( isValidCodeUnit(cast(Windows1251Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1251Char) 0x98));
    assert( isValidCodeUnit(cast(Windows1252Char) 0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char) 0x81));
}
1780
/**
 Returns true if the string is encoded correctly, i.e. if its valid prefix
 (see `validLength`) spans the whole string.

 Supersedes:
 This function supersedes std.utf.validate(), however note that this
 function returns a bool indicating whether the input was valid or not,
 whereas the older function would throw an exception.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be tested
 */
bool isValid(E)(const(E)[] s)
{
    immutable validPrefix = validLength(s);
    return validPrefix == s.length;
}

///
@system pure unittest
{
    assert( isValid("\u20AC100"));
    assert(!isValid(cast(char[3])[167, 133, 175]));
}
1806
/**
 Returns the length of the longest possible substring, starting from
 the first code unit, which is validly encoded.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be tested

 Returns:
    the number of code units in the valid prefix of `s`
 */
size_t validLength(E)(const(E)[] s)
{
    size_t consumed = 0;
    while (s.length > 0)
    {
        // safeDecode shortens `s`; the difference in length is the size of
        // the sequence it just removed.
        immutable lengthBefore = s.length;
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            return consumed;
        consumed += lengthBefore - s.length;
    }
    return consumed;
}
1828
/**
 Sanitizes a string by replacing malformed code unit sequences with valid
 code unit sequences. The result is guaranteed to be valid for this encoding.

 If the input string is already valid, this function returns the original.
 Otherwise it constructs a new string in which every illegal code unit
 sequence is replaced with the encoding's replacement sequence: the Unicode
 replacement character (U+FFFD) if the character repertoire contains it,
 otherwise '?'.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be sanitized
 */
immutable(E)[] sanitize(E)(immutable(E)[] s)
{
    immutable validPrefix = validLength(s);
    if (validPrefix == s.length) return s;

    auto replacement = EncoderInstance!(E).replacementSequence;

    // Pass 1: work out an upper bound for the result length. The bound is
    // the input length plus one replacement per invalid sequence; the code
    // units of the invalid sequences are not subtracted, so it overestimates,
    // which is harmless.
    size_t capacity = s.length;
    for (const(E)[] rest = s[validPrefix .. $];
         rest.length != 0;
         rest = rest[validLength(rest) .. $])
    {
        // `rest` always starts at an invalid sequence here.
        immutable decoded = EncoderInstance!(E).safeDecode(rest);
        assert(decoded == INVALID_SEQUENCE);
        capacity += replacement.length;
    }

    // Pass 2: copy the valid runs, dropping in a replacement wherever an
    // invalid sequence was removed.
    E[] buffer = new E[capacity];
    buffer[0 .. validPrefix] = s[0 .. validPrefix];
    size_t pos = validPrefix;

    const(E)[] rest = s[validPrefix .. $];
    while (rest.length != 0)
    {
        immutable decoded = EncoderInstance!(E).safeDecode(rest);
        assert(decoded == INVALID_SEQUENCE);

        immutable afterReplacement = pos + replacement.length;
        buffer[pos .. afterReplacement] = replacement[];

        immutable run = validLength(rest);
        buffer[afterReplacement .. afterReplacement + run] = rest[0 .. run];

        pos = afterReplacement + run;
        rest = rest[run .. $];
    }
    return cast(immutable(E)[]) buffer[0 .. pos];
}

///
@system pure unittest
{
    // The truncated sequence F0 80 is replaced by U+FFFD (EF BF BD in UTF-8).
    assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
}
1890
/**
 Returns the length, in code units, of the first encoded sequence.

 The string must be non-empty and must start with a validly encoded
 sequence. This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t firstSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    // Decode from a copy so that the check does not consume anything.
    const(E)[] u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
do
{
    immutable lengthBefore = s.length;
    EncoderInstance!(E).skip(s);
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(firstSequence("\u20AC1000") == "\u20AC".length);
    assert(firstSequence("hel") == "h".length);
}
1923
/**
 Returns the length, in code units, of the last encoded sequence.

 The string must be non-empty and validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be sliced
 */
size_t lastSequence(E)(const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
do
{
    // decodeReverse trims the final sequence off `s`; the decoded value
    // itself is not needed.
    immutable lengthBefore = s.length;
    EncoderInstance!(E).decodeReverse(s);
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(lastSequence("1000\u20AC") == "\u20AC".length);
    assert(lastSequence("hellö") == "ö".length);
}
1955
/**
 Returns the array index at which the (n+1)th code point begins.

 The string must be validly encoded and `n` must not be negative.
 This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.toUTFindex().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string to be counted
    n = the current code point index
 */
ptrdiff_t index(E)(const(E)[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
do
{
    immutable lengthBefore = s.length;

    // Step over n sequences; whatever was removed from the front of `s`
    // is the offset being asked for.
    size_t skipped = 0;
    while (skipped < n)
    {
        EncoderInstance!(E).skip(s);
        ++skipped;
    }
    return lengthBefore - s.length;
}

///
@system pure unittest
{
    assert(index("\u20AC100",1) == 3);
    assert(index("hällo",2) == 3);
}
1991
/**
 Decodes a single code point.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 The string must be non-empty and must start with a validly encoded
 sequence. This is enforced by the function's in-contract.

 Supersedes:
 This function supersedes std.utf.decode(), however, note that the
 function codePoints() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    // Decode from a copy so that the check does not consume anything.
    auto u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
do
{
    // The encoding is chosen by the element type of the string.
    alias Unit = typeof(s[0]);
    return EncoderInstance!(Unit).decode(s);
}
2022
/**
 Decodes a single code point from the end of a string.

 This function removes one or more code units from the end of a string,
 and returns the decoded code point which those code units represent.

 The string must be non-empty and validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string whose last code point is to be decoded
 */
dchar decodeReverse(E)(ref const(E)[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
do
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.decodeReverse(s);
}
2048
/**
 Decodes a single code point. The input does not have to be valid.

 This function removes one or more code units from the start of a string,
 and returns the decoded code point which those code units represent.

 An invalidly encoded string is accepted as input: if an invalid sequence
 is found at the start of the string, this function will remove it, and
 return the value INVALID_SEQUENCE.

 The string must be non-empty; this is checked by the in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
do
{
    // The encoding is chosen by the element type of the string.
    alias Unit = typeof(s[0]);
    return EncoderInstance!(Unit).safeDecode(s);
}
2074
/**
 Returns the number of code units required to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
size_t encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.encodedLength(c);
}
2099
/**
 Encodes a single code point.

 This function encodes a single code point into one or more code units.
 It returns a string containing those code units.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be encoded
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
do
{
    alias Encoder = EncoderInstance!(E);
    return Encoder.encode(c);
}
2131
/**
 Encodes a single code point into an array.

 This function encodes a single code point into one or more code units.
 The code units are stored in a user-supplied fixed-size array,
 which must be passed by reference.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The type of the output cannot be deduced. Therefore, it is necessary to
 explicitly specify the encoding as a template parameter.

 Supersedes:
 This function supersedes std.utf.encode(), however, note that the
 function codeUnits() supersedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
 WINDOWS-1251, WINDOWS-1252

 Params:
    c = the code point to be encoded
    array = the destination array

 Returns:
    the number of code units written to the array
 */
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
do
{
    // The encoder is handed a second slice over the same memory; the result
    // is how much shorter that slice is afterwards.
    E[] remaining = array;
    EncoderInstance!(E).encode(c, remaining);
    immutable written = array.length - remaining.length;
    return written;
}
2170
/*
Encodes `c` in units of type `E` and writes the result to the
output range `R`. Returns the number of `E`s written.

Only `char` (UTF-8), `wchar` (UTF-16) and `dchar` (UTF-32) are accepted as
`E`; any other unit type is rejected at compile time. On the UTF-8 path a
value above U+10FFFF halts the program via `assert(0)`.
 */
size_t encode(E, R)(dchar c, auto ref R range)
if (isNativeOutputRange!(R, E))
{
    static if (is(immutable E == immutable char))
    {
        immutable uint cp = c;

        // One unit: plain ASCII.
        if (cp <= 0x7F)
        {
            put(range, cast(char) cp);
            return 1;
        }

        // Two units: lead byte 110xxxxx followed by one continuation byte.
        if (cp <= 0x7FF)
        {
            put(range, cast(char)(0xC0 | (cp >> 6)));
            put(range, cast(char)(0x80 | (cp & 0x3F)));
            return 2;
        }

        // Three units: lead byte 1110xxxx followed by two continuation bytes.
        if (cp <= 0xFFFF)
        {
            put(range, cast(char)(0xE0 | (cp >> 12)));
            put(range, cast(char)(0x80 | ((cp >> 6) & 0x3F)));
            put(range, cast(char)(0x80 | (cp & 0x3F)));
            return 3;
        }

        // Anything beyond the last code point cannot be encoded; this is
        // assert(0), so it also halts in release builds.
        if (cp > 0x10FFFF)
        {
            assert(0);
        }

        // Four units: lead byte 11110xxx followed by three continuation bytes.
        put(range, cast(char)(0xF0 | (cp >> 18)));
        put(range, cast(char)(0x80 | ((cp >> 12) & 0x3F)));
        put(range, cast(char)(0x80 | ((cp >> 6) & 0x3F)));
        put(range, cast(char)(0x80 | (cp & 0x3F)));
        return 4;
    }
    else static if (is(immutable E == immutable wchar))
    {
        // Values up to 0xFFFF are emitted as a single unit.
        if (c <= 0xFFFF)
        {
            range.put(cast(wchar) c);
            return 1;
        }

        // Everything else becomes a surrogate pair built from the 20-bit
        // offset above 0x10000: high ten bits first, low ten bits second.
        immutable uint offset = c - 0x10000;
        range.put(cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800));
        range.put(cast(wchar) ((offset & 0x3FF) + 0xDC00));
        return 2;
    }
    else static if (is(immutable E == immutable dchar))
    {
        // UTF-32 stores the code point as is.
        range.put(c);
        return 1;
    }
    else
    {
        static assert(0);
    }
}
2232
@safe pure unittest
{
    import std.array : Appender;

    // An ASCII code point is reported as exactly one code unit for each of
    // the three UTF unit types.
    Appender!(char[]) sink;
    assert(encode!char('T', sink) == 1);
    assert(encode!wchar('T', sink) == 1);
    assert(encode!dchar('T', sink) == 1);
}
2241
2242 /**
2243 Encodes a single code point to a delegate.
2244
2245 This function encodes a single code point into one or more code units.
2246 The code units are passed one at a time to the supplied delegate.
2247
2248 The input to this function MUST be a valid code point.
2249 This is enforced by the function's in-contract.
2250
2251 The type of the output cannot be deduced. Therefore, it is necessary to
2252 explicitly specify the encoding as a template parameter.
2253
2254 Supersedes:
2255 This function supersedes std.utf.encode(), however, note that the
2256 function codeUnits() supersedes it more conveniently.
2257
2258 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2259 WINDOWS-1251, WINDOWS-1252
2260
2261 Params:
2262 c = the code point to be encoded
2263 dg = the delegate to invoke for each code unit
2264 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    // Callers must hand in a valid code point.
    assert(isValidCodePoint(c));
}
do
{
    // The encoder selected by E feeds the code units to the delegate.
    alias Encoder = EncoderInstance!E;
    Encoder.encode(c, dg);
}
2274
/**
Encodes the contents of `s` in units of type `Tgt`, writing the result to an
output range.

The input is decoded one code point at a time according to the encoding
implied by `Src`, and each code point is then re-encoded as `Tgt`. Decoding
(rather than widening every code unit to a `dchar`) is what makes multi-unit
sequences, such as non-ASCII UTF-8, come out correctly.

The input to this function MUST be validly encoded.

Returns: The number of `Tgt` elements written.
Params:
Tgt = Element type of `range`.
s = Input array.
range = Output range.
*/
size_t encode(Tgt, Src, R)(in Src[] s, R range)
{
    size_t result;

    // `decode` consumes code units from the front of the slice it is given,
    // so work on a rebindable view of the (const) input.
    const(Src)[] rest = s;
    while (rest.length != 0)
    {
        result += encode!(Tgt)(decode(rest), range);
    }
    return result;
}
2294
2295 /**
2296 Returns a foreachable struct which can bidirectionally iterate over all
2297 code points in a string.
2298
2299 The input to this function MUST be validly encoded.
2300 This is enforced by the function's in-contract.
2301
2302 You can foreach either
2303 with or without an index. If an index is specified, it will be initialized
2304 at each iteration with the offset into the string at which the code point
2305 begins.
2306
2307 Supersedes:
2308 This function supersedes std.utf.decode().
2309
2310 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2311 WINDOWS-1251, WINDOWS-1252
2312
2313 Params:
2314 s = the string to be decoded
2315
2316 Example:
2317 --------------------------------------------------------
2318 string s = "hello world";
2319 foreach (c;codePoints(s))
2320 {
2321 // do something with c (which will always be a dchar)
2322 }
2323 --------------------------------------------------------
2324
Note that, currently, foreach (c;codePoints(s)) is superior to foreach (c;s)
2326 in that the latter will fall over on encountering U+FFFF.
2327 */
CodePoints!(E) codePoints(E)(immutable(E)[] s)
in
{
    // The whole string has to be validly encoded.
    assert(isValid(s));
}
do
{
    // Wrap the string in the foreachable adapter; nothing is decoded here.
    alias Result = CodePoints!E;
    return Result(s);
}
2337
///
@system unittest
{
    string original = "hello";
    string rebuilt;

    // Every code point of an ASCII string fits in a single char.
    foreach (cp; codePoints(original))
    {
        rebuilt ~= cast(char) cp;
    }
    assert(original == rebuilt);
}
2349
2350 /**
2351 Returns a foreachable struct which can bidirectionally iterate over all
2352 code units in a code point.
2353
2354 The input to this function MUST be a valid code point.
2355 This is enforced by the function's in-contract.
2356
2357 The type of the output cannot be deduced. Therefore, it is necessary to
2358 explicitly specify the encoding type in the template parameter.
2359
2360 Supersedes:
2361 This function supersedes std.utf.encode().
2362
2363 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2364 WINDOWS-1251, WINDOWS-1252
2365
2366 Params:
2367 c = the code point to be encoded
2368 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    // Callers must hand in a valid code point.
    assert(isValidCodePoint(c));
}
do
{
    // Wrap the code point in the foreachable adapter for encoding E.
    alias Result = CodeUnits!E;
    return Result(c);
}
2378
///
@system unittest
{
    char[] units;

    // U+20AC (the euro sign) takes three code units in UTF-8.
    foreach (unit; codeUnits!char(cast(dchar)'\u20AC'))
    {
        units ~= unit;
    }
    assert(units.length == 3);
    assert(units == "\xE2\x82\xAC");
}
2392
2393 /**
2394 Convert a string from one encoding to another.
2395
2396 Supersedes:
2397 This function supersedes std.utf.toUTF8(), std.utf.toUTF16() and
2398 std.utf.toUTF32()
2399 (but note that to!() supersedes it more conveniently).
2400
2401 Standards: Unicode 5.0, ASCII, ISO-8859-1, ISO-8859-2, WINDOWS-1250,
2402 WINDOWS-1251, WINDOWS-1252
2403
2404 Params:
2405 s = Source string. $(B Must) be validly encoded.
2406 This is enforced by the function's in-contract.
2407 r = Destination string
2408
2409 See_Also:
2410 $(REF to, std,conv)
2411 */
void transcode(Src, Dst)(Src[] s, out Dst[] r)
in
{
    // The source has to be validly encoded.
    assert(isValid(s));
}
do
{
    static if (is(Src == Dst) && is(Src == immutable))
    {
        // Same immutable element type on both sides: hand back the input
        // itself instead of copying it.
        r = s;
    }
    else static if (is(immutable Src == immutable AsciiChar))
    {
        // Reinterpret ASCII input as UTF-8 and go through that path.
        transcode(cast(const(char)[]) s, r);
    }
    else
    {
        alias Unit = Unqual!Dst;

        // Free space that must be available before encoding one more code
        // point into the output.
        static if (is(immutable Dst == immutable wchar))
            enum size_t headroom = 2;
        else static if (is(immutable Dst == immutable dchar))
            enum size_t headroom = 1;
        else
            enum size_t headroom = 6;

        // `storage` owns the output; `avail` is its still-unwritten tail,
        // which the encoder shortens as it writes.
        auto storage = new Unit[s.length];
        auto avail = storage;

        while (s.length != 0)
        {
            if (avail.length < headroom)
            {
                // Grow the output, then re-derive the unwritten tail at the
                // same offset inside the (possibly relocated) array.
                immutable size_t written = storage.length - avail.length;
                storage.length += s.length + headroom;
                avail = storage[written .. $];
            }
            EncoderInstance!Unit.encode(decode(s), avail);
        }

        // Only the written prefix is handed to the caller.
        r = cast(Dst[]) storage[0 .. storage.length - avail.length];
    }
}
2459
///
@system pure unittest
{
    // UTF-8 to UTF-16
    wstring utf16;
    transcode("hello world", utf16);
    assert(utf16 == "hello world"w);

    // UTF-16 to ISO-8859-1
    Latin1String latin1;
    transcode(utf16, latin1);
    assert(latin1 == "hello world");
}
2473
@system pure unittest
{
    import std.meta;
    import std.range;

    // ASCII-only text must survive a trip through any two string types.
    {
        import std.conv : to;

        string asciiCharString = to!string(iota(0, 128, 1));

        alias Types = AliasSeq!(string, Latin1String, Latin2String, AsciiString,
            Windows1250String, Windows1251String, Windows1252String, dstring, wstring);
        foreach (First; Types)
        {
            foreach (Second; Types)
            {
                First viaFirst;
                Second viaSecond;
                string roundTripped;

                transcode(asciiCharString, viaFirst);
                transcode(viaFirst, viaSecond);
                transcode(viaSecond, roundTripped);
                assert(asciiCharString == roundTripped);
            }
        }
    }

    // Non-ASCII text must survive a trip through any two UTF string types.
    {
        string czechChars = "Příliš žluťoučký kůň úpěl ďábelské ódy.";
        alias Types = AliasSeq!(string, dstring, wstring);
        foreach (First; Types)
        {
            foreach (Second; Types)
            {
                First viaFirst;
                Second viaSecond;
                string roundTripped;

                transcode(czechChars, viaFirst);
                transcode(viaFirst, viaSecond);
                transcode(viaSecond, roundTripped);
                assert(czechChars == roundTripped);
            }
        }
    }
}
2513
@system unittest // mutable/const input/output
{
    import std.meta : AliasSeq;

    // Every combination of input and output qualifier has to work.
    static foreach (Out; AliasSeq!(Latin1Char, const Latin1Char, immutable Latin1Char))
    {{
        Out[] result;

        char[] fromMutable = "äbc".dup;
        transcode(fromMutable, result);
        assert(result == [0xE4, 'b', 'c']);

        const char[] fromConst = "öbc";
        transcode(fromConst, result);
        assert(result == [0xF6, 'b', 'c']);

        immutable char[] fromImmutable = "übc";
        transcode(fromImmutable, result);
        assert(result == [0xFC, 'b', 'c']);
    }}

    // Make sure that const/mutable input is copied.
    static foreach (Unit; AliasSeq!(char, const char))
    {{
        Unit[] source = "foo".dup;
        Unit[] copy;
        transcode(source, copy);
        assert(source == copy);
        assert(source !is copy);
    }}

    // But immutable input should not be copied.
    string source = "foo";
    string same;
    transcode(source, same);
    assert(source is same);
}
2551
2552 //=============================================================================
2553
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    // Passes the message on to Exception unchanged.
    this(string msg) @safe pure
    {
        super(msg);
    }
}
2556
class UnrecognizedEncodingException : EncodingException
{
    // The constructor is private, so instances are only created from
    // inside this module.
    private this(string msg) @safe pure
    {
        super(msg);
    }
}
2561
/** Abstract base class of all encoding schemes */
abstract class EncodingScheme
{
    import std.uni : toLower;

    /**
     * Registers a subclass of EncodingScheme.
     *
     * This function allows user-defined subclasses of EncodingScheme to
     * be declared in other modules.
     *
     * Every name reported by the subclass's `names()` is registered in
     * lower case, so lookup through `create` is case-insensitive.
     *
     * Params:
     *     Klass = The subclass of EncodingScheme to register.
     *
     * Example:
     * ----------------------------------------------
     * class Amiga1251 : EncodingScheme
     * {
     *     shared static this()
     *     {
     *         EncodingScheme.register!Amiga1251;
     *     }
     * }
     * ----------------------------------------------
     */
    static void register(Klass:EncodingScheme)()
    {
        // A throw-away instance is only needed to ask for the names.
        scope scheme = new Klass();
        foreach (encodingName;scheme.names())
        {
            supported[toLower(encodingName)] = () => new Klass();
        }
    }

    deprecated("Please pass the EncodingScheme subclass as template argument instead.")
    static void register(string className)
    {
        auto scheme = instantiate(className);
        foreach (encodingName;scheme.names())
        {
            supportedFactories[toLower(encodingName)] = className;
        }
    }

    // Instantiates the EncodingScheme subclass called `className` through
    // runtime class lookup. Throws EncodingException when the class is
    // unknown, cannot be default-constructed, or is not an EncodingScheme.
    private static EncodingScheme instantiate(string className)
    {
        EncodingScheme scheme;

        // ClassInfo.find yields null for an unknown name; calling create()
        // on that result unchecked would dereference null.
        if (auto info = ClassInfo.find(className))
            scheme = cast(EncodingScheme) info.create();

        if (scheme is null)
            throw new EncodingException("Unable to create class "~className);
        return scheme;
    }

    /**
     * Obtains a subclass of EncodingScheme which is capable of encoding
     * and decoding the named encoding scheme.
     *
     * This function is only aware of EncodingSchemes which have been
     * registered with the register() function.
     *
     * Params:
     *     encodingName = name of the encoding scheme (case-insensitive)
     *
     * Throws:
     *     EncodingException if the name is not registered (the concrete type
     *     is then UnrecognizedEncodingException) or if the class registered
     *     under that name cannot be instantiated.
     *
     * Example:
     * ---------------------------------------------------
     * auto scheme = EncodingScheme.create("Amiga-1251");
     * ---------------------------------------------------
     */
    static EncodingScheme create(string encodingName)
    {
        static bool registerDefaultEncodings()
        {
            EncodingScheme.register!EncodingSchemeASCII;
            EncodingScheme.register!EncodingSchemeLatin1;
            EncodingScheme.register!EncodingSchemeLatin2;
            EncodingScheme.register!EncodingSchemeWindows1250;
            EncodingScheme.register!EncodingSchemeWindows1251;
            EncodingScheme.register!EncodingSchemeWindows1252;
            EncodingScheme.register!EncodingSchemeUtf8;
            EncodingScheme.register!EncodingSchemeUtf16Native;
            EncodingScheme.register!EncodingSchemeUtf32Native;
            return true;
        }

        // The built-in schemes are registered once, on first use.
        static shared bool initialized;
        import std.concurrency : initOnce;
        initOnce!initialized(registerDefaultEncodings());
        encodingName = toLower(encodingName);

        // Schemes registered through the template overload come first.
        if (auto p = encodingName in supported)
            return (*p)();

        // Otherwise fall back to schemes registered by class name.
        auto p = encodingName in supportedFactories;
        if (p is null)
            throw new UnrecognizedEncodingException("Unrecognized Encoding: "~encodingName);
        return instantiate(*p);
    }

    const
    {
        /**
         * Returns the standard name of the encoding scheme
         */
        abstract override string toString();

        /**
         * Returns an array of all known names for this encoding scheme
         */
        abstract string[] names();

        /**
         * Returns true if the character c can be represented
         * in this encoding scheme.
         */
        abstract bool canEncode(dchar c);

        /**
         * Returns the number of ubytes required to encode this code point.
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c = the code point to be encoded
         *
         * Returns:
         *    the number of ubytes required.
         */
        abstract size_t encodedLength(dchar c);

        /**
         * Encodes a single code point into a user-supplied, fixed-size buffer.
         *
         * This function encodes a single code point into one or more ubytes.
         * The supplied buffer must be code unit aligned.
         * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
         * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
         *
         * The input to this function MUST be a valid code point.
         *
         * Params:
         *    c      = the code point to be encoded
         *    buffer = the destination array
         *
         * Returns:
         *    the number of ubytes written.
         */
        abstract size_t encode(dchar c, ubyte[] buffer);

        /**
         * Decodes a single code point.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * The input to this function MUST be validly encoded.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar decode(ref const(ubyte)[] s);

        /**
         * Decodes a single code point. The input does not have to be valid.
         *
         * This function removes one or more ubytes from the start of an array,
         * and returns the decoded code point which those ubytes represent.
         *
         * This function will accept an invalidly encoded array as input.
         * If an invalid sequence is found at the start of the string, this
         * function will remove it, and return the value INVALID_SEQUENCE.
         *
         * Params:
         *    s = the array whose first code point is to be decoded
         */
        abstract dchar safeDecode(ref const(ubyte)[] s);

        /**
         * Returns the sequence of ubytes to be used to represent
         * any character which cannot be represented in the encoding scheme.
         *
         * Normally this will be a representation of some substitution
         * character, such as U+FFFD or '?'.
         */
        abstract @property immutable(ubyte)[] replacementSequence();
    }

    /**
     * Returns true if the array is encoded correctly
     *
     * Params:
     *    s = the array to be tested
     */
    bool isValid(const(ubyte)[] s)
    {
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE)
                return false;
        }
        return true;
    }

    /**
     * Returns the length of the longest possible substring, starting from
     * the first element, which is validly encoded.
     *
     * Params:
     *    s = the array to be tested
     */
    size_t validLength()(const(ubyte)[] s)
    {
        const(ubyte)[] r = s;
        // t trails s: it marks the input left after the last good sequence.
        const(ubyte)[] t = s;
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE) break;
            t = s;
        }
        return r.length - t.length;
    }

    /**
     * Sanitizes an array by replacing malformed ubyte sequences with valid
     * ubyte sequences. The result is guaranteed to be valid for this
     * encoding scheme.
     *
     * If the input array is already valid, this function returns the
     * original, otherwise it constructs a new array by replacing all illegal
     * sequences with the encoding scheme's replacement sequence.
     *
     * Params:
     *    s = the string to be sanitized
     */
    immutable(ubyte)[] sanitize()(immutable(ubyte)[] s)
    {
        auto n = validLength(s);
        if (n == s.length) return s;

        auto repSeq = replacementSequence;

        // Count how long the string needs to be.
        // Overestimating is not a problem
        auto len = s.length;
        const(ubyte)[] t = s[n..$];
        while (t.length != 0)
        {
            // t always starts at an invalid sequence here: skip it, then
            // skip the valid run that follows.
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            len += repSeq.length;
            t = t[validLength(t)..$];
        }

        // Now do the write
        ubyte[] array = new ubyte[len];
        array[0 .. n] = s[0 .. n];
        auto offset = n;

        t = s[n..$];
        while (t.length != 0)
        {
            // Replace the invalid sequence ...
            immutable c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            array[offset .. offset+repSeq.length] = repSeq[];
            offset += repSeq.length;
            // ... and copy the valid run after it unchanged.
            n = validLength(t);
            array[offset .. offset+n] = t[0 .. n];
            offset += n;
            t = t[n..$];
        }
        return cast(immutable(ubyte)[])array[0 .. offset];
    }

    /**
     * Returns the length of the first encoded sequence.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be sliced
     */
    size_t firstSequence()(const(ubyte)[] s)
    in
    {
        assert(s.length != 0);
        const(ubyte)[] u = s;
        assert(safeDecode(u) != INVALID_SEQUENCE);
    }
    do
    {
        const(ubyte)[] t = s;
        decode(s);
        return t.length - s.length;
    }

    /**
     * Returns the total number of code points encoded in a ubyte array.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     */
    size_t count()(const(ubyte)[] s)
    in
    {
        assert(isValid(s));
    }
    do
    {
        size_t n = 0;
        while (s.length != 0)
        {
            decode(s);
            ++n;
        }
        return n;
    }

    /**
     * Returns the array index at which the (n+1)th code point begins.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be indexed
     *    n = the number of code points to skip from the start of s
     */
    ptrdiff_t index()(const(ubyte)[] s, size_t n)
    in
    {
        // n is unsigned, so there is no lower bound to check.
        assert(isValid(s));
    }
    do
    {
        const(ubyte)[] t = s;
        for (size_t i=0; i<n; ++i) decode(s);
        return t.length - s.length;
    }

    // Lower-cased encoding name -> factory, filled by register!Klass.
    __gshared EncodingScheme function()[string] supported;
    // Lower-cased encoding name -> class name, filled by the deprecated
    // register(string).
    __gshared string[string] supportedFactories;
}
2901
2902 /**
2903 EncodingScheme to handle ASCII
2904
2905 This scheme recognises the following names:
2906 "ANSI_X3.4-1968",
2907 "ANSI_X3.4-1986",
2908 "ASCII",
2909 "IBM367",
2910 "ISO646-US",
2911 "ISO_646.irv:1991",
2912 "US-ASCII",
2913 "cp367",
"csASCII",
2915 "iso-ir-6",
2916 "us"
2917 */
class EncodingSchemeASCII : EncodingScheme
{
    // No registration happens here; EncodingScheme.create registers the
    // built-in schemes on first use.

    // All names under which this scheme is registered.
    override string[] names() const @safe pure nothrow
    {
        return
        [
            "ANSI_X3.4-1968",
            "ANSI_X3.4-1986",
            "ASCII",
            "IBM367",
            "ISO646-US",
            "ISO_646.irv:1991",
            "US-ASCII",
            "cp367",
            "csASCII",
            "iso-ir-6",
            "us"
        ];
    }

    // Standard name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "ASCII";
    }

    // The methods below forward to the module-level templates instantiated
    // for AsciiChar; full qualification avoids picking up the member of the
    // same name.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!AsciiChar(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!AsciiChar(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        // View the byte buffer as code units of this encoding.
        return std.encoding.encode(c, cast(AsciiChar[]) buffer);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(AsciiChar)[]) s;
        immutable dchar cp = std.encoding.decode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(AsciiChar)[]) s;
        immutable dchar cp = std.encoding.safeDecode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    // Bytes substituted for anything that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
2989
2990 /**
2991 EncodingScheme to handle Latin-1
2992
2993 This scheme recognises the following names:
2994 "CP819",
2995 "IBM819",
2996 "ISO-8859-1",
2997 "ISO_8859-1",
2998 "ISO_8859-1:1987",
2999 "csISOLatin1",
3000 "iso-ir-100",
3001 "l1",
3002 "latin1"
3003 */
class EncodingSchemeLatin1 : EncodingScheme
{
    // No registration happens here; EncodingScheme.create registers the
    // built-in schemes on first use.

    // All names under which this scheme is registered.
    override string[] names() const @safe pure nothrow
    {
        return
        [
            "CP819",
            "IBM819",
            "ISO-8859-1",
            "ISO_8859-1",
            "ISO_8859-1:1987",
            "csISOLatin1",
            "iso-ir-100",
            "l1",
            "latin1"
        ];
    }

    // Standard name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "ISO-8859-1";
    }

    // The methods below forward to the module-level templates instantiated
    // for Latin1Char; full qualification avoids picking up the member of the
    // same name.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Latin1Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Latin1Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        // View the byte buffer as code units of this encoding.
        return std.encoding.encode(c, cast(Latin1Char[]) buffer);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Latin1Char)[]) s;
        immutable dchar cp = std.encoding.decode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Latin1Char)[]) s;
        immutable dchar cp = std.encoding.safeDecode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    // Bytes substituted for anything that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
3073
3074 /**
3075 EncodingScheme to handle Latin-2
3076
3077 This scheme recognises the following names:
3078 "Latin 2",
3079 "ISO-8859-2",
3080 "ISO_8859-2",
3081 "ISO_8859-2:1999",
3082 "Windows-28592"
3083 */
class EncodingSchemeLatin2 : EncodingScheme
{
    // No registration happens here; EncodingScheme.create registers the
    // built-in schemes on first use.

    // All names under which this scheme is registered.
    override string[] names() const @safe pure nothrow
    {
        return
        [
            "Latin 2",
            "ISO-8859-2",
            "ISO_8859-2",
            "ISO_8859-2:1999",
            "windows-28592"
        ];
    }

    // Standard name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "ISO-8859-2";
    }

    // The methods below forward to the module-level templates instantiated
    // for Latin2Char; full qualification avoids picking up the member of the
    // same name.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Latin2Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Latin2Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        // View the byte buffer as code units of this encoding.
        return std.encoding.encode(c, cast(Latin2Char[]) buffer);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Latin2Char)[]) s;
        immutable dchar cp = std.encoding.decode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Latin2Char)[]) s;
        immutable dchar cp = std.encoding.safeDecode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    // Bytes substituted for anything that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
3149
3150 /**
3151 EncodingScheme to handle Windows-1250
3152
3153 This scheme recognises the following names:
3154 "windows-1250"
3155 */
class EncodingSchemeWindows1250 : EncodingScheme
{
    // No registration happens here; EncodingScheme.create registers the
    // built-in schemes on first use.

    // All names under which this scheme is registered.
    override string[] names() const @safe pure nothrow
    {
        return
        [
            "windows-1250"
        ];
    }

    // Standard name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1250";
    }

    // The methods below forward to the module-level templates instantiated
    // for Windows1250Char; full qualification avoids picking up the member
    // of the same name.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Windows1250Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Windows1250Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        // View the byte buffer as code units of this encoding.
        return std.encoding.encode(c, cast(Windows1250Char[]) buffer);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1250Char)[]) s;
        immutable dchar cp = std.encoding.decode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1250Char)[]) s;
        immutable dchar cp = std.encoding.safeDecode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    // Bytes substituted for anything that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
3217
3218 /**
3219 EncodingScheme to handle Windows-1251
3220
3221 This scheme recognises the following names:
3222 "windows-1251"
3223 */
class EncodingSchemeWindows1251 : EncodingScheme
{
    // No registration happens here; EncodingScheme.create registers the
    // built-in schemes on first use.

    // All names under which this scheme is registered.
    override string[] names() const @safe pure nothrow
    {
        return
        [
            "windows-1251"
        ];
    }

    // Standard name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1251";
    }

    // The methods below forward to the module-level templates instantiated
    // for Windows1251Char; full qualification avoids picking up the member
    // of the same name.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Windows1251Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Windows1251Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        // View the byte buffer as code units of this encoding.
        return std.encoding.encode(c, cast(Windows1251Char[]) buffer);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1251Char)[]) s;
        immutable dchar cp = std.encoding.decode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1251Char)[]) s;
        immutable dchar cp = std.encoding.safeDecode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    // Bytes substituted for anything that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
3285
3286 /**
3287 EncodingScheme to handle Windows-1252
3288
3289 This scheme recognises the following names:
3290 "windows-1252"
3291 */
class EncodingSchemeWindows1252 : EncodingScheme
{
    // No registration happens here; EncodingScheme.create registers the
    // built-in schemes on first use.

    // All names under which this scheme is registered.
    override string[] names() const @safe pure nothrow
    {
        return
        [
            "windows-1252"
        ];
    }

    // Standard name of the scheme.
    override string toString() const @safe pure nothrow @nogc
    {
        return "windows-1252";
    }

    // The methods below forward to the module-level templates instantiated
    // for Windows1252Char; full qualification avoids picking up the member
    // of the same name.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!Windows1252Char(c);
    }

    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!Windows1252Char(c);
    }

    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        // View the byte buffer as code units of this encoding.
        return std.encoding.encode(c, cast(Windows1252Char[]) buffer);
    }

    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1252Char)[]) s;
        immutable dchar cp = std.encoding.decode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto rest = cast(const(Windows1252Char)[]) s;
        immutable dchar cp = std.encoding.safeDecode(rest);
        // Drop from `s` exactly what the decoder consumed from `rest`.
        s = s[$ - rest.length .. $];
        return cp;
    }

    // Bytes substituted for anything that cannot be represented.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[]) "?";
    }
}
3353
@system unittest
{
    static string[] schemeNames =
    [
        "ASCII",
        "ISO-8859-1",
        "ISO-8859-2",
        "windows-1250",
        "windows-1251",
        "windows-1252"
    ];

    // One scheme instance per name, in the same order as schemeNames.
    EncodingScheme[] schemes;
    foreach (schemeName; schemeNames)
    {
        schemes ~= EncodingScheme.create(schemeName);
    }

    ubyte[1] buffer;

    // valid[i] holds code points that schemes[i] must be able to encode.
    static dchar[][] valid =
    [
        //Valid ASCII
        ['\u0001','\u0020','\u0040','\u0060','\u007F'],
        //Valid 8859-1
        ['\u0001','\u0020','\u0070','\u00DA','\u00FF'],
        //Valid 8859-2
        ['\u0020','\u00D7','\u00DF','\u010F','\u02D9'],
        //Valid 1250
        ['\u0020','\u20AC','\u201E','\u2021','\u2039'],
        //Valid 1251
        ['\u0402','\u00A4','\u0415','\u0439','\u044F'],
        //Valid 1252
        ['\u20AC','\u0160','\u2019','\u2122','\u0178'],
    ];

    // Each scheme consumes one byte from the front of this array, so the
    // order of the bytes matches the order of the schemes.
    static const(ubyte)[] invalid = [0xA0,0xFF,0xFF,0x81,0x98,0x81];

    foreach (i, scheme; schemes)
    {
        assert(scheme.toString() == schemeNames[i],"Error in the name of encoding scheme"~schemeNames[i]);
        assert(!scheme.canEncode('\uFFFD'));
        assert(scheme.encodedLength('A') == 1);

        // Encode every sample into the one-byte buffer, collect the bytes
        // and decode each of them straight back.
        const(ubyte)[] encoded;
        dchar[] decoded;
        foreach (sample; valid[i])
        {
            assert(scheme.encode(sample, buffer) == 1);
            encoded ~= buffer;
            const(ubyte)[] oneUnit = buffer;
            decoded ~= scheme.decode(oneUnit);
        }

        assert(scheme.isValid(encoded),"Not correctly encoded UTF => " ~ schemeNames[i]);
        assert(valid[i] == decoded,"Error encode/decode UTF8 <=> " ~ schemeNames[i]);

        // The two ISO schemes are expected to decode their byte; every
        // other scheme is expected to reject it.
        immutable bool expectDecodable =
            schemeNames[i] == "ISO-8859-1" || schemeNames[i] == "ISO-8859-2";
        immutable dchar outcome = scheme.safeDecode(invalid);
        assert((outcome != INVALID_SEQUENCE) == expectDecodable);

        assert(scheme.replacementSequence() == cast(immutable(ubyte)[])"?");
    }

    // Every scheme took exactly one byte.
    assert(invalid.length == 0);
}
3422
/**
   EncodingScheme to handle UTF-8

   This scheme recognises the following names:
                 "UTF-8"
 */
class EncodingSchemeUtf8 : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf8");
    }*/

    /// Returns: the names under which this scheme is known.
    override string[] names() const @safe pure nothrow
    {
        return [ "UTF-8" ];
    }

    /// Returns: the canonical name of this scheme, `"UTF-8"`.
    override string toString() const @safe pure nothrow @nogc
    {
        return "UTF-8";
    }

    /// Forwards to the `char` instantiation of the module-level `canEncode`.
    override bool canEncode(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.canEncode!char(c);
    }

    /// Forwards to the `char` instantiation of the module-level
    /// `encodedLength`.
    override size_t encodedLength(dchar c) const @safe pure nothrow @nogc
    {
        return std.encoding.encodedLength!char(c);
    }

    /// Encodes `c` into `buffer`, viewed as `char[]`, and returns whatever
    /// the module-level `encode` reports for that write.
    override size_t encode(dchar c, ubyte[] buffer) const @safe pure nothrow @nogc
    {
        auto target = cast(char[]) buffer;
        return std.encoding.encode(c, target);
    }

    /// Decodes one code point from the front of `s` and advances `s` past it.
    override dchar decode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto remaining = cast(const(char)[]) s;
        immutable dchar decoded = std.encoding.decode(remaining);
        // Whatever the decoder left unread is the new tail of `s`.
        s = s[$ - remaining.length .. $];
        return decoded;
    }

    /// Same as `decode`, but goes through the module-level `safeDecode`.
    override dchar safeDecode(ref const(ubyte)[] s) const @safe pure nothrow @nogc
    {
        auto remaining = cast(const(char)[]) s;
        immutable dchar decoded = std.encoding.safeDecode(remaining);
        // Whatever the decoder left unread is the new tail of `s`.
        s = s[$ - remaining.length .. $];
        return decoded;
    }

    /// Returns: the bytes of the string literal `"\uFFFD"`.
    override @property immutable(ubyte)[] replacementSequence() const @safe pure nothrow @nogc
    {
        return cast(immutable(ubyte)[])"\uFFFD";
    }
}
3490
/**
   EncodingScheme to handle UTF-16 in native byte order

   This scheme recognises the following names:
                 "UTF-16LE" (little-endian architecture only)
                 "UTF-16BE" (big-endian architecture only)
 */
class EncodingSchemeUtf16Native : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf16Native");
    }*/

    const
    {
        // The scheme name depends on the byte order of the target.
        version (LittleEndian) { enum string NAME = "UTF-16LE"; }
        version (BigEndian)    { enum string NAME = "UTF-16BE"; }

        /// Returns: the names under which this scheme is known.
        override string[] names() @safe pure nothrow
        {
            return [ NAME ];
        }

        /// Returns: the canonical name of this scheme.
        override string toString() @safe pure nothrow @nogc
        {
            return NAME;
        }

        /// Forwards to the `wchar` instantiation of the module-level
        /// `canEncode`.
        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!(wchar)(c);
        }

        /**
           Returns: the number of $(B bytes) needed to encode `c`.

           The module-level `encodedLength!(wchar)` result is scaled by
           `wchar.sizeof`, in the same way `encode` scales its return value,
           so a buffer of this many `ubyte`s is large enough for `encode`.
         */
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return wchar.sizeof * std.encoding.encodedLength!(wchar)(c);
        }

        /// Encodes `c` into `buffer`, viewed as `wchar[]`.
        /// Returns: the result of the module-level `encode`, scaled by
        /// `wchar.sizeof`.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto r = cast(wchar[]) buffer;
            return wchar.sizeof * std.encoding.encode(c,r);
        }

        /// Decodes one code point from the front of `s` and advances `s`
        /// past it. `s.length` must be even.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            assert((s.length & 1) == 0);
        }
        do
        {
            auto t = cast(const(wchar)[]) s;
            dchar c = std.encoding.decode(t);
            // `t` counts wchars; convert the unread remainder back to bytes.
            s = s[$-t.length * wchar.sizeof..$];
            return c;
        }

        /// Same as `decode`, but goes through the module-level `safeDecode`.
        /// `s.length` must be even.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            assert((s.length & 1) == 0);
        }
        do
        {
            auto t = cast(const(wchar)[]) s;
            dchar c = std.encoding.safeDecode(t);
            // `t` counts wchars; convert the unread remainder back to bytes.
            s = s[$-t.length * wchar.sizeof..$];
            return c;
        }

        /// Returns: the bytes of the UTF-16 string literal `"\uFFFD"w`.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[])"\uFFFD"w;
        }
    }
}
// safeDecode on the native UTF-16 scheme must yield the first code point and
// consume exactly one 2-byte code unit.
@system unittest
{
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-16le");
        ubyte[6] raw = [154,1, 155,1, 156,1];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-16be");
        ubyte[6] raw = [1,154, 1,155, 1,156];
    }

    const(ubyte)[] pending = cast(const(ubyte)[]) raw;
    immutable dchar first = scheme.safeDecode(pending);

    assert(first == 410);       // 0x019A, the first of the three samples
    assert(pending.length == 4); // two code units remain
}
3586
/**
   EncodingScheme to handle UTF-32 in native byte order

   This scheme recognises the following names:
                 "UTF-32LE" (little-endian architecture only)
                 "UTF-32BE" (big-endian architecture only)
 */
class EncodingSchemeUtf32Native : EncodingScheme
{
    /* // moved to std.internal.phobosinit
    shared static this()
    {
        EncodingScheme.register("std.encoding.EncodingSchemeUtf32Native");
    }*/

    const
    {
        // The scheme name depends on the byte order of the target.
        version (LittleEndian) { enum string NAME = "UTF-32LE"; }
        version (BigEndian)    { enum string NAME = "UTF-32BE"; }

        /// Returns: the names under which this scheme is known.
        override string[] names() @safe pure nothrow
        {
            return [ NAME ];
        }

        /// Returns: the canonical name of this scheme.
        override string toString() @safe pure nothrow @nogc
        {
            return NAME;
        }

        /// Forwards to the `dchar` instantiation of the module-level
        /// `canEncode`.
        override bool canEncode(dchar c) @safe pure nothrow @nogc
        {
            return std.encoding.canEncode!(dchar)(c);
        }

        /**
           Returns: the number of $(B bytes) needed to encode `c`.

           The module-level `encodedLength!(dchar)` result is scaled by
           `dchar.sizeof`, in the same way `encode` scales its return value,
           so a buffer of this many `ubyte`s is large enough for `encode`.
         */
        override size_t encodedLength(dchar c) @safe pure nothrow @nogc
        {
            return dchar.sizeof * std.encoding.encodedLength!(dchar)(c);
        }

        /// Encodes `c` into `buffer`, viewed as `dchar[]`.
        /// Returns: the result of the module-level `encode`, scaled by
        /// `dchar.sizeof`.
        override size_t encode(dchar c, ubyte[] buffer) @safe pure nothrow @nogc
        {
            auto r = cast(dchar[]) buffer;
            return dchar.sizeof * std.encoding.encode(c,r);
        }

        /// Decodes one code point from the front of `s` and advances `s`
        /// past it. `s.length` must be a multiple of 4.
        override dchar decode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            assert((s.length & 3) == 0);
        }
        do
        {
            auto t = cast(const(dchar)[]) s;
            dchar c = std.encoding.decode(t);
            // `t` counts dchars; convert the unread remainder back to bytes.
            s = s[$-t.length * dchar.sizeof..$];
            return c;
        }

        /// Same as `decode`, but goes through the module-level `safeDecode`.
        /// `s.length` must be a multiple of 4.
        override dchar safeDecode(ref const(ubyte)[] s) @safe pure nothrow @nogc
        in
        {
            assert((s.length & 3) == 0);
        }
        do
        {
            auto t = cast(const(dchar)[]) s;
            dchar c = std.encoding.safeDecode(t);
            // `t` counts dchars; convert the unread remainder back to bytes.
            s = s[$-t.length * dchar.sizeof..$];
            return c;
        }

        /// Returns: the bytes of the UTF-32 string literal `"\uFFFD"d`.
        override @property immutable(ubyte)[] replacementSequence() @safe pure nothrow @nogc
        {
            return cast(immutable(ubyte)[])"\uFFFD"d;
        }
    }
}
// safeDecode on the native UTF-32 scheme must yield the first code point and
// consume exactly one 4-byte code unit.
@system unittest
{
    version (LittleEndian)
    {
        auto scheme = EncodingScheme.create("utf-32le");
        ubyte[12] raw = [154,1,0,0, 155,1,0,0, 156,1,0,0];
    }
    version (BigEndian)
    {
        auto scheme = EncodingScheme.create("utf-32be");
        ubyte[12] raw = [0,0,1,154, 0,0,1,155, 0,0,1,156];
    }

    const(ubyte)[] pending = cast(const(ubyte)[]) raw;
    immutable dchar first = scheme.safeDecode(pending);

    assert(first == 410);       // 0x019A, the first of the three samples
    assert(pending.length == 8); // two code units remain
}
3682
3683 //=============================================================================
3684
3685
/** Definitions of common Byte Order Marks.
The elements of the `enum` can be used as indices into `bomTable` to get
the matching `BOMSeq`.
*/
enum BOM
{
    /// no BOM was found
    none      = 0,
    /// [0x00, 0x00, 0xFE, 0xFF]
    utf32be   = 1,
    /// [0xFF, 0xFE, 0x00, 0x00]
    utf32le   = 2,
    /** [0x2B, 0x2F, 0x76, 0x38],
        [0x2B, 0x2F, 0x76, 0x39],
        [0x2B, 0x2F, 0x76, 0x2B],
        [0x2B, 0x2F, 0x76, 0x2F],
        [0x2B, 0x2F, 0x76, 0x38, 0x2D]
    */
    utf7      = 3,
    /// [0xF7, 0x64, 0x4C]
    utf1      = 8,
    /// [0xDD, 0x73, 0x66, 0x73]
    utfebcdic = 9,
    /// [0x0E, 0xFE, 0xFF]
    scsu      = 10,
    /// [0xFB, 0xEE, 0x28]
    bocu1     = 11,
    /// [0x84, 0x31, 0x95, 0x33]
    gb18030   = 12,
    /// [0xEF, 0xBB, 0xBF]
    utf8      = 13,
    /// [0xFE, 0xFF]
    utf16be   = 14,
    /// [0xFF, 0xFE]
    utf16le   = 15
}
3710
/// The type stored inside `bomTable`.
alias BOMSeq = Tuple!(BOM, "schema", ubyte[], "sequence");

/** Mapping of a byte sequence to $(B Byte Order Mark (BOM))
*/
immutable bomTable = [
    // index 0: placeholder entry for "no BOM"
    BOMSeq(BOM.none,      null),
    BOMSeq(BOM.utf32be,   cast(ubyte[]) [0x00, 0x00, 0xFE, 0xFF]),
    BOMSeq(BOM.utf32le,   cast(ubyte[]) [0xFF, 0xFE, 0x00, 0x00]),
    // indices 3 .. 7: the five UTF-7 variants; the 5-byte form is listed
    // ahead of the 4-byte form that is its prefix
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x39]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x2B]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x2F]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x38, 0x2D]),
    BOMSeq(BOM.utf7,      cast(ubyte[]) [0x2B, 0x2F, 0x76, 0x38]),
    BOMSeq(BOM.utf1,      cast(ubyte[]) [0xF7, 0x64, 0x4C]),
    BOMSeq(BOM.utfebcdic, cast(ubyte[]) [0xDD, 0x73, 0x66, 0x73]),
    BOMSeq(BOM.scsu,      cast(ubyte[]) [0x0E, 0xFE, 0xFF]),
    BOMSeq(BOM.bocu1,     cast(ubyte[]) [0xFB, 0xEE, 0x28]),
    BOMSeq(BOM.gb18030,   cast(ubyte[]) [0x84, 0x31, 0x95, 0x33]),
    BOMSeq(BOM.utf8,      cast(ubyte[]) [0xEF, 0xBB, 0xBF]),
    // the 2-byte UTF-16 marks come last; [0xFF, 0xFE] is a prefix of the
    // UTF-32LE mark listed above
    BOMSeq(BOM.utf16be,   cast(ubyte[]) [0xFE, 0xFF]),
    BOMSeq(BOM.utf16le,   cast(ubyte[]) [0xFF, 0xFE])
];
3734
/** Returns a `BOMSeq` for a given `input`.
If no `BOM` is present the `BOMSeq` for `BOM.none` is
returned. The `BOM` sequence at the beginning of the range will
not be consumed from the passed range. If you pass a reference type
range make sure that `save` creates a deep copy.

Params:
    input = The sequence to check for the `BOM`

Returns:
    the found `BOMSeq` corresponding to the passed `input`.
*/
immutable(BOMSeq) getBOM(Range)(Range input)
if (isForwardRange!Range && is(immutable ElementType!Range == immutable ubyte))
{
    import std.algorithm.searching : startsWith;

    // Slot 0 is the `BOM.none` placeholder, so the search starts at slot 1.
    // The first table entry whose sequence prefixes `input` wins; each probe
    // works on a saved copy of the range.
    foreach (slot; 1 .. bomTable.length)
    {
        if (startsWith(input.save, bomTable[slot].sequence))
            return bomTable[slot];
    }

    return bomTable[0];
}
3761
///
@system unittest
{
    import std.format : format;

    // A UTF-32 string that starts with U+FEFF, viewed as raw bytes.
    auto text = dchar(0x0000FEFF) ~ "Hello World"d;
    auto found = getBOM(cast(ubyte[]) text);

    // The detected byte order is that of the machine running the test.
    version (BigEndian)
        assert(found.schema == BOM.utf32be, format("%s", found.schema));
    else
        assert(found.schema == BOM.utf32le, format("%s", found.schema));
}
3779
// Every sequence in `bomTable`, followed by payload bytes, must be detected
// as the schema it is registered under.
@system unittest
{
    foreach (idx, it; bomTable)
    {
        auto s = it[1] ~ cast(ubyte[])"hello world";
        auto i = getBOM(s);
        assert(i[0] == bomTable[idx][0]);

        // Table slots 3 .. 7 all map to BOM.utf7, so only outside 4 .. 7 does
        // the slot number equal the enum value and the lookup return this
        // exact sequence.
        if (idx < 4 || idx > 7) // get around the multiple utf7 bom's
        {
            assert(i[0] == BOM.init + idx);
            assert(i[1] == it[1]);
        }
    }
}
3797
// getBOM must work on a non-array range and must leave the caller's range
// untouched, whether or not a BOM is found.
@safe pure unittest
{
    // Minimal byte range that is not an array; `save` returns a copy of the
    // struct.
    struct BOMInputRange
    {
        ubyte[] arr;

        @property ubyte front()
        {
            return this.arr.front;
        }

        @property bool empty()
        {
            return this.arr.empty;
        }

        void popFront()
        {
            this.arr = this.arr[1 .. $];
        }

        @property typeof(this) save()
        {
            return this;
        }
    }

    static assert( isInputRange!BOMInputRange);
    // getBOM's template constraint demands a forward range.
    static assert( isForwardRange!BOMInputRange);
    static assert(!isArray!BOMInputRange);

    ubyte[] dummyEnd = [0,0,0,0];

    foreach (it; bomTable[1 .. $])
    {
        // Input is exactly the BOM: it must be recognised and not consumed.
        {
            auto ir = BOMInputRange(it.sequence.dup);

            auto b = getBOM(ir);
            assert(b.schema == it.schema);
            assert(ir.arr == it.sequence);
        }

        // Input is only the first BOM byte plus zero padding: no BOM must be
        // reported, and the range handed to getBOM must be left intact.
        {
            auto noBom = it.sequence[0 .. 1].dup ~ dummyEnd;
            size_t oldLen = noBom.length;
            assert(oldLen - 4 < it.sequence.length);

            auto ir = BOMInputRange(noBom.dup);
            auto b = getBOM(ir);
            assert(b.schema == BOM.none);
            // Check the range that was actually passed, not the untouched
            // source array.
            assert(ir.arr == noBom);
            assert(noBom.length == oldLen);
        }
    }
}
3852
/** The byte order mark as a fully decoded code point (U+FEFF) */
enum dchar utfBOM = '\uFEFF';