1 // Written in the D programming language. 2 3 /++ 4 Encode and decode UTF-8, UTF-16 and UTF-32 strings. 5 6 UTF character support is restricted to 7 $(D '\u0000' <= character <= '\U0010FFFF'). 8 9 $(SCRIPT inhibitQuickIndex = 1;) 10 $(DIVC quickindex, 11 $(BOOKTABLE, 12 $(TR $(TH Category) $(TH Functions)) 13 $(TR $(TD Decode) $(TD 14 $(LREF decode) 15 $(LREF decodeFront) 16 )) 17 $(TR $(TD Lazy decode) $(TD 18 $(LREF byCodeUnit) 19 $(LREF byChar) 20 $(LREF byWchar) 21 $(LREF byDchar) 22 $(LREF byUTF) 23 )) 24 $(TR $(TD Encode) $(TD 25 $(LREF encode) 26 $(LREF toUTF8) 27 $(LREF toUTF16) 28 $(LREF toUTF32) 29 $(LREF toUTFz) 30 $(LREF toUTF16z) 31 )) 32 $(TR $(TD Length) $(TD 33 $(LREF codeLength) 34 $(LREF count) 35 $(LREF stride) 36 $(LREF strideBack) 37 )) 38 $(TR $(TD Index) $(TD 39 $(LREF toUCSindex) 40 $(LREF toUTFindex) 41 )) 42 $(TR $(TD Validation) $(TD 43 $(LREF isValidDchar) 44 $(LREF isValidCodepoint) 45 $(LREF validate) 46 )) 47 $(TR $(TD Miscellaneous) $(TD 48 $(LREF replacementDchar) 49 $(LREF UseReplacementDchar) 50 $(LREF UTFException) 51 )) 52 )) 53 See_Also: 54 $(LINK2 http://en.wikipedia.org/wiki/Unicode, Wikipedia)<br> 55 $(LINK http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8)<br> 56 $(LINK https://web.archive.org/web/20100113043530/https://anubis.dkuug.dk/JTC1/SC2/WG2/docs/n1335) 57 Copyright: Copyright The D Language Foundation 2000 - 2012. 58 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 59 Authors: $(HTTP digitalmars.com, Walter Bright) and 60 $(HTTP jmdavisprog.com, Jonathan M Davis) 61 Source: $(PHOBOSSRC std/utf.d) 62 +/ 63 module std.utf; 64 65 import std.exception : basicExceptionCtors; 66 import core.exception : UnicodeException; 67 import std.meta : AliasSeq; 68 import std.range; 69 import std.traits : isAutodecodableString, isConvertibleToString, 70 isSomeChar, isSomeString, isStaticArray, Unqual; 71 import std.typecons : Flag, Yes, No; 72 73 74 /++ 75 Exception thrown on errors in std.utf functions. 76 +/ 77 class UTFException : UnicodeException 78 { 79 import core.internal.string : unsignedToTempString, UnsignedStringBuf; 80 81 uint[4] sequence; 82 size_t len; 83 84 @safe pure nothrow @nogc 85 UTFException setSequence(scope uint[] data...) return 86 { 87 assert(data.length <= 4); 88 89 len = data.length < 4 ? data.length : 4; 90 sequence[0 .. len] = data[0 .. len]; 91 92 return this; 93 } 94 95 // FIXME: Use std.exception.basicExceptionCtors here once 96 // https://issues.dlang.org/show_bug.cgi?id=11500 is fixed 97 98 /** 99 Standard exception constructors. 100 */ 101 this(string msg, string file = __FILE__, size_t line = __LINE__, 102 Throwable next = null) @nogc @safe pure nothrow 103 { 104 super(msg, 0, file, line, next); 105 } 106 /// ditto 107 this(string msg, size_t index, string file = __FILE__, 108 size_t line = __LINE__, Throwable next = null) @safe pure nothrow 109 { 110 UnsignedStringBuf buf = void; 111 msg ~= " (at index " ~ unsignedToTempString(index, buf) ~ ")"; 112 super(msg, index, file, line, next); 113 } 114 115 /** 116 Returns: 117 A `string` detailing the invalid UTF sequence. 118 */ 119 override string toString() const 120 { 121 if (len == 0) 122 { 123 /* Exception.toString() is not marked as const, although 124 * it is const-compatible. 125 */ 126 //return super.toString(); 127 auto e = () @trusted { return cast(Exception) super; } (); 128 return e.toString(); 129 } 130 131 string result = "Invalid UTF sequence:"; 132 133 foreach (i; sequence[0 .. len]) 134 { 135 UnsignedStringBuf buf = void; 136 result ~= ' '; 137 auto h = unsignedToTempString!16(i, buf); 138 if (h.length == 1) 139 result ~= '0'; 140 result ~= h; 141 result ~= 'x'; 142 } 143 144 if (super.msg.length > 0) 145 { 146 result ~= " - "; 147 result ~= super.msg; 148 } 149 150 return result; 151 } 152 } 153 154 /// 155 @safe unittest 156 { 157 import std.exception : assertThrown; 158 159 char[4] buf; 160 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 161 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 162 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 163 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 164 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 165 } 166 167 /* 168 Provide array of invalidly encoded UTF strings. Useful for testing. 169 170 Params: 171 Char = char, wchar, or dchar 172 173 Returns: 174 an array of invalidly encoded UTF strings 175 */ 176 177 package auto invalidUTFstrings(Char)() @safe pure @nogc nothrow 178 if (isSomeChar!Char) 179 { 180 static if (is(Char == char)) 181 { 182 enum x = 0xDC00; // invalid surrogate value 183 enum y = 0x110000; // out of range 184 185 static immutable string[8] result = 186 [ 187 "\x80", // not a start byte 188 "\xC0", // truncated 189 "\xC0\xC0", // invalid continuation 190 "\xF0\x82\x82\xAC", // overlong 191 [ 192 0xE0 | (x >> 12), 193 0x80 | ((x >> 6) & 0x3F), 194 0x80 | (x & 0x3F) 195 ], 196 [ 197 cast(char)(0xF0 | (y >> 18)), 198 cast(char)(0x80 | ((y >> 12) & 0x3F)), 199 cast(char)(0x80 | ((y >> 6) & 0x3F)), 200 cast(char)(0x80 | (y & 0x3F)) 201 ], 202 [ 203 cast(char)(0xF8 | 3), // 5 byte encoding 204 cast(char)(0x80 | 3), 205 cast(char)(0x80 | 3), 206 cast(char)(0x80 | 3), 207 cast(char)(0x80 | 3), 208 ], 209 [ 210 cast(char)(0xFC | 3), // 6 byte encoding 211 cast(char)(0x80 | 3), 212 cast(char)(0x80 | 3), 213 cast(char)(0x80 | 3), 214 cast(char)(0x80 | 3), 215 cast(char)(0x80 | 3), 216 ], 217 ]; 218 219 return result[]; 220 } 221 else static if (is(Char == wchar)) 222 { 223 static immutable wstring[5] result = 224 [ 225 [ 226 cast(wchar) 0xDC00, 227 ], 228 [ 229 cast(wchar) 0xDFFF, 230 ], 231 [ 232 cast(wchar) 0xDBFF, 233 cast(wchar) 0xDBFF, 234 ], 235 [ 236 cast(wchar) 0xDBFF, 237 cast(wchar) 0xE000, 238 ], 239 [ 240 cast(wchar) 0xD800, 241 ], 242 ]; 243 244 return result[]; 245 } 246 else static if (is(Char == dchar)) 247 { 248 static immutable dstring[3] result = 249 [ 250 [ cast(dchar) 0x110000 ], 251 [ cast(dchar) 0x00D800 ], 252 [ cast(dchar) 0x00DFFF ], 253 ]; 254 255 return result; 256 } 257 else 258 static assert(0); 259 } 260 261 /++ 262 Check whether the given Unicode code point is valid. 263 264 Params: 265 c = code point to check 266 267 Returns: 268 `true` if and only if `c` is a valid Unicode code point 269 270 Note: 271 `'\uFFFE'` and `'\uFFFF'` are considered valid by `isValidDchar`, 272 as they are permitted for internal use by an application, but they are 273 not allowed for interchange by the Unicode standard. 274 +/ 275 bool isValidDchar(dchar c) pure nothrow @safe @nogc 276 { 277 return c < 0xD800 || (c > 0xDFFF && c <= 0x10FFFF); 278 } 279 280 /// 281 @safe @nogc pure nothrow unittest 282 { 283 assert( isValidDchar(cast(dchar) 0x41)); 284 assert( isValidDchar(cast(dchar) 0x00)); 285 assert(!isValidDchar(cast(dchar) 0xD800)); 286 assert(!isValidDchar(cast(dchar) 0x11FFFF)); 287 } 288 289 pure nothrow @safe @nogc unittest 290 { 291 import std.exception; 292 293 assertCTFEable!( 294 { 295 assert( isValidDchar(cast(dchar)'a') == true); 296 assert( isValidDchar(cast(dchar) 0x1FFFFF) == false); 297 298 assert(!isValidDchar(cast(dchar) 0x00D800)); 299 assert(!isValidDchar(cast(dchar) 0x00DBFF)); 300 assert(!isValidDchar(cast(dchar) 0x00DC00)); 301 assert(!isValidDchar(cast(dchar) 0x00DFFF)); 302 assert( isValidDchar(cast(dchar) 0x00FFFE)); 303 assert( isValidDchar(cast(dchar) 0x00FFFF)); 304 assert( isValidDchar(cast(dchar) 0x01FFFF)); 305 assert( isValidDchar(cast(dchar) 0x10FFFF)); 306 assert(!isValidDchar(cast(dchar) 0x110000)); 307 }); 308 } 309 310 /** 311 Checks if a single character forms a valid code point. 312 313 When standing alone, some characters are invalid code points. For 314 example the `wchar` `0xD800` is a so called high surrogate, which can 315 only be interpreted together with a low surrogate following it. As a 316 standalone character it is considered invalid. 317 318 See $(LINK2 http://www.unicode.org/versions/Unicode13.0.0/, 319 Unicode Standard, D90, D91 and D92) for more details. 320 321 Params: 322 c = character to test 323 Char = character type of `c` 324 325 Returns: 326 `true`, if `c` forms a valid code point. 327 */ 328 bool isValidCodepoint(Char)(Char c) 329 if (isSomeChar!Char) 330 { 331 alias UChar = typeof(cast() c); 332 static if (is(UChar == char)) 333 { 334 return c <= 0x7F; 335 } 336 else static if (is(UChar == wchar)) 337 { 338 return c <= 0xD7FF || c >= 0xE000; 339 } 340 else static if (is(UChar == dchar)) 341 { 342 return isValidDchar(c); 343 } 344 else 345 static assert(false, "unknown character type: `" ~ Char.stringof ~ "`"); 346 } 347 348 /// 349 @safe pure nothrow unittest 350 { 351 assert( isValidCodepoint(cast(char) 0x40)); 352 assert(!isValidCodepoint(cast(char) 0x80)); 353 assert( isValidCodepoint(cast(wchar) 0x1234)); 354 assert(!isValidCodepoint(cast(wchar) 0xD800)); 355 assert( isValidCodepoint(cast(dchar) 0x0010FFFF)); 356 assert(!isValidCodepoint(cast(dchar) 0x12345678)); 357 } 358 359 /++ 360 Calculate the length of the UTF sequence starting at `index` 361 in `str`. 362 363 Params: 364 str = $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 365 of UTF code units. Must be random access if `index` is passed 366 index = starting index of UTF sequence (default: `0`) 367 368 Returns: 369 The number of code units in the UTF sequence. For UTF-8, this is a 370 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)). 371 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1. 372 373 Throws: 374 May throw a `UTFException` if `str[index]` is not the start of a 375 valid UTF sequence. 376 377 Note: 378 `stride` will only analyze the first `str[index]` element. It 379 will not fully verify the validity of the UTF sequence, nor even verify 380 the presence of the sequence: it will not actually guarantee that 381 $(D index + stride(str, index) <= str.length). 382 +/ 383 uint stride(S)(auto ref S str, size_t index) 384 if (is(S : const char[]) || 385 (isRandomAccessRange!S && is(immutable ElementType!S == immutable char))) 386 { 387 static if (is(typeof(str.length) : ulong)) 388 assert(index < str.length, "Past the end of the UTF-8 sequence"); 389 immutable c = str[index]; 390 391 if (c < 0x80) 392 return 1; 393 else 394 return strideImpl(c, index); 395 } 396 397 /// Ditto 398 uint stride(S)(auto ref S str) 399 if (is(S : const char[]) || 400 (isInputRange!S && is(immutable ElementType!S == immutable char))) 401 { 402 static if (is(S : const char[])) 403 immutable c = str[0]; 404 else 405 immutable c = str.front; 406 407 if (c < 0x80) 408 return 1; 409 else 410 return strideImpl(c, 0); 411 } 412 413 @system unittest 414 { 415 import core.exception : AssertError; 416 import std.conv : to; 417 import std.exception; 418 import std.string : format; 419 import std.traits : FunctionAttribute, functionAttributes, isSafe; 420 static void test(string s, dchar c, size_t i = 0, size_t line = __LINE__) 421 { 422 enforce(stride(s, i) == codeLength!char(c), 423 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 424 425 enforce(stride(RandomCU!char(s), i) == codeLength!char(c), 426 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 427 428 auto refRandom = new RefRandomCU!char(s); 429 immutable randLen = refRandom.length; 430 enforce(stride(refRandom, i) == codeLength!char(c), 431 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 432 enforce(refRandom.length == randLen, 433 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 434 435 if (i == 0) 436 { 437 enforce(stride(s) == codeLength!char(c), 438 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 439 440 enforce(stride(InputCU!char(s)) == codeLength!char(c), 441 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 442 443 auto refBidir = new RefBidirCU!char(s); 444 immutable bidirLen = refBidir.length; 445 enforce(stride(refBidir) == codeLength!char(c), 446 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 447 enforce(refBidir.length == bidirLen, 448 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 449 } 450 } 451 452 assertCTFEable!( 453 { 454 test("a", 'a'); 455 test(" ", ' '); 456 test("\u2029", '\u2029'); //paraSep 457 test("\u0100", '\u0100'); 458 test("\u0430", '\u0430'); 459 test("\U00010143", '\U00010143'); 460 test("abcdefcdef", 'a'); 461 test("hello\U00010143\u0100\U00010143", 'h', 0); 462 test("hello\U00010143\u0100\U00010143", 'e', 1); 463 test("hello\U00010143\u0100\U00010143", 'l', 2); 464 test("hello\U00010143\u0100\U00010143", 'l', 3); 465 test("hello\U00010143\u0100\U00010143", 'o', 4); 466 test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 467 test("hello\U00010143\u0100\U00010143", '\u0100', 9); 468 test("hello\U00010143\u0100\U00010143", '\U00010143', 11); 469 470 foreach (S; AliasSeq!(char[], const char[], string)) 471 { 472 enum str = to!S("hello world"); 473 static assert(isSafe!({ stride(str, 0); })); 474 static assert(isSafe!({ stride(str); })); 475 static assert((functionAttributes!({ stride(str, 0); }) & FunctionAttribute.pure_) != 0); 476 static assert((functionAttributes!({ stride(str); }) & FunctionAttribute.pure_) != 0); 477 } 478 }); 479 } 480 481 @safe unittest // invalid start bytes 482 { 483 import std.exception : assertThrown; 484 immutable char[] invalidStartBytes = [ 485 0b1111_1000, // indicating a sequence length of 5 486 0b1111_1100, // 6 487 0b1111_1110, // 7 488 0b1111_1111, // 8 489 0b1000_0000, // continuation byte 490 ]; 491 foreach (c; invalidStartBytes) 492 assertThrown!UTFException(stride([c])); 493 } 494 495 /// Ditto 496 uint stride(S)(auto ref S str, size_t index) 497 if (is(S : const wchar[]) || 498 (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar))) 499 { 500 static if (is(typeof(str.length) : ulong)) 501 assert(index < str.length, "Past the end of the UTF-16 sequence"); 502 immutable uint u = str[index]; 503 return 1 + (u >= 0xD800 && u <= 0xDBFF); 504 } 505 506 /// Ditto 507 uint stride(S)(auto ref S str) @safe pure 508 if (is(S : const wchar[])) 509 { 510 return stride(str, 0); 511 } 512 513 /// Ditto 514 uint stride(S)(auto ref S str) 515 if (isInputRange!S && is(immutable ElementType!S == immutable wchar) && 516 !is(S : const wchar[])) 517 { 518 assert(!str.empty, "UTF-16 sequence is empty"); 519 immutable uint u = str.front; 520 return 1 + (u >= 0xD800 && u <= 0xDBFF); 521 } 522 523 @system unittest 524 { 525 import core.exception : AssertError; 526 import std.conv : to; 527 import std.exception; 528 import std.string : format; 529 import std.traits : FunctionAttribute, functionAttributes, isSafe; 530 static void test(wstring s, dchar c, size_t i = 0, size_t line = __LINE__) 531 { 532 enforce(stride(s, i) == codeLength!wchar(c), 533 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 534 535 enforce(stride(RandomCU!wchar(s), i) == codeLength!wchar(c), 536 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 537 538 auto refRandom = new RefRandomCU!wchar(s); 539 immutable randLen = refRandom.length; 540 enforce(stride(refRandom, i) == codeLength!wchar(c), 541 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 542 enforce(refRandom.length == randLen, 543 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 544 545 if (i == 0) 546 { 547 enforce(stride(s) == codeLength!wchar(c), 548 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 549 550 enforce(stride(InputCU!wchar(s)) == codeLength!wchar(c), 551 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 552 553 auto refBidir = new RefBidirCU!wchar(s); 554 immutable bidirLen = refBidir.length; 555 enforce(stride(refBidir) == codeLength!wchar(c), 556 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 557 enforce(refBidir.length == bidirLen, 558 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 559 } 560 } 561 562 assertCTFEable!( 563 { 564 test("a", 'a'); 565 test(" ", ' '); 566 test("\u2029", '\u2029'); //paraSep 567 test("\u0100", '\u0100'); 568 test("\u0430", '\u0430'); 569 test("\U00010143", '\U00010143'); 570 test("abcdefcdef", 'a'); 571 test("hello\U00010143\u0100\U00010143", 'h', 0); 572 test("hello\U00010143\u0100\U00010143", 'e', 1); 573 test("hello\U00010143\u0100\U00010143", 'l', 2); 574 test("hello\U00010143\u0100\U00010143", 'l', 3); 575 test("hello\U00010143\u0100\U00010143", 'o', 4); 576 test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 577 test("hello\U00010143\u0100\U00010143", '\u0100', 7); 578 test("hello\U00010143\u0100\U00010143", '\U00010143', 8); 579 580 foreach (S; AliasSeq!(wchar[], const wchar[], wstring)) 581 { 582 enum str = to!S("hello world"); 583 static assert(isSafe!(() => stride(str, 0))); 584 static assert(isSafe!(() => stride(str) )); 585 static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0); 586 static assert((functionAttributes!(() => stride(str) ) & FunctionAttribute.pure_) != 0); 587 } 588 }); 589 } 590 591 /// Ditto 592 uint stride(S)(auto ref S str, size_t index = 0) 593 if (is(S : const dchar[]) || 594 (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar))) 595 { 596 static if (is(typeof(str.length) : ulong)) 597 assert(index < str.length, "Past the end of the UTF-32 sequence"); 598 else 599 assert(!str.empty, "UTF-32 sequence is empty."); 600 return 1; 601 } 602 603 /// 604 @safe unittest 605 { 606 assert("a".stride == 1); 607 assert("λ".stride == 2); 608 assert("aλ".stride == 1); 609 assert("aλ".stride(1) == 2); 610 assert("𐐷".stride == 4); 611 } 612 613 @system unittest 614 { 615 import core.exception : AssertError; 616 import std.conv : to; 617 import std.exception; 618 import std.string : format; 619 import std.traits : FunctionAttribute, functionAttributes, isSafe; 620 static void test(dstring s, dchar c, size_t i = 0, size_t line = __LINE__) 621 { 622 enforce(stride(s, i) == codeLength!dchar(c), 623 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 624 625 enforce(stride(RandomCU!dchar(s), i) == codeLength!dchar(c), 626 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 627 628 auto refRandom = new RefRandomCU!dchar(s); 629 immutable randLen = refRandom.length; 630 enforce(stride(refRandom, i) == codeLength!dchar(c), 631 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 632 enforce(refRandom.length == randLen, 633 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 634 635 if (i == 0) 636 { 637 enforce(stride(s) == codeLength!dchar(c), 638 new AssertError(format("Unit test failure string 0: %s", s), __FILE__, line)); 639 640 enforce(stride(InputCU!dchar(s)) == codeLength!dchar(c), 641 new AssertError(format("Unit test failure range 0: %s", s), __FILE__, line)); 642 643 auto refBidir = new RefBidirCU!dchar(s); 644 immutable bidirLen = refBidir.length; 645 enforce(stride(refBidir) == codeLength!dchar(c), 646 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 647 enforce(refBidir.length == bidirLen, 648 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 649 } 650 } 651 652 assertCTFEable!( 653 { 654 test("a", 'a'); 655 test(" ", ' '); 656 test("\u2029", '\u2029'); //paraSep 657 test("\u0100", '\u0100'); 658 test("\u0430", '\u0430'); 659 test("\U00010143", '\U00010143'); 660 test("abcdefcdef", 'a'); 661 test("hello\U00010143\u0100\U00010143", 'h', 0); 662 test("hello\U00010143\u0100\U00010143", 'e', 1); 663 test("hello\U00010143\u0100\U00010143", 'l', 2); 664 test("hello\U00010143\u0100\U00010143", 'l', 3); 665 test("hello\U00010143\u0100\U00010143", 'o', 4); 666 test("hello\U00010143\u0100\U00010143", '\U00010143', 5); 667 test("hello\U00010143\u0100\U00010143", '\u0100', 6); 668 test("hello\U00010143\u0100\U00010143", '\U00010143', 7); 669 670 foreach (S; AliasSeq!(dchar[], const dchar[], dstring)) 671 { 672 enum str = to!S("hello world"); 673 static assert(isSafe!(() => stride(str, 0))); 674 static assert(isSafe!(() => stride(str) )); 675 static assert((functionAttributes!(() => stride(str, 0)) & FunctionAttribute.pure_) != 0); 676 static assert((functionAttributes!(() => stride(str) ) & FunctionAttribute.pure_) != 0); 677 } 678 }); 679 } 680 681 private uint strideImpl(char c, size_t index) @trusted pure 682 in { assert(c & 0x80); } 683 do 684 { 685 import core.bitop : bsr; 686 immutable msbs = 7 - bsr((~uint(c)) & 0xFF); 687 if (c == 0xFF || msbs < 2 || msbs > 4) 688 throw new UTFException("Invalid UTF-8 sequence", index); 689 return msbs; 690 } 691 692 /++ 693 Calculate the length of the UTF sequence ending one code unit before 694 `index` in `str`. 695 696 Params: 697 str = bidirectional range of UTF code units. Must be random access if 698 `index` is passed 699 index = index one past end of UTF sequence (default: `str.length`) 700 701 Returns: 702 The number of code units in the UTF sequence. For UTF-8, this is a 703 value between 1 and 4 (as per $(HTTP tools.ietf.org/html/rfc3629#section-3, RFC 3629$(COMMA) section 3)). 704 For UTF-16, it is either 1 or 2. For UTF-32, it is always 1. 705 706 Throws: 707 May throw a `UTFException` if `str[index]` is not one past the 708 end of a valid UTF sequence. 709 710 Note: 711 `strideBack` will only analyze the element at $(D str[index - 1]) 712 element. It will not fully verify the validity of the UTF sequence, nor 713 even verify the presence of the sequence: it will not actually 714 guarantee that $(D strideBack(str, index) <= index). 715 +/ 716 uint strideBack(S)(auto ref S str, size_t index) 717 if (is(S : const char[]) || 718 (isRandomAccessRange!S && is(immutable ElementType!S == immutable char))) 719 { 720 static if (is(typeof(str.length) : ulong)) 721 assert(index <= str.length, "Past the end of the UTF-8 sequence"); 722 assert(index > 0, "Not the end of the UTF-8 sequence"); 723 724 if ((str[index-1] & 0b1100_0000) != 0b1000_0000) 725 return 1; 726 727 if (index >= 4) //single verification for most common case 728 { 729 static foreach (i; 2 .. 5) 730 { 731 if ((str[index-i] & 0b1100_0000) != 0b1000_0000) 732 return i; 733 } 734 } 735 else 736 { 737 static foreach (i; 2 .. 4) 738 { 739 if (index >= i && (str[index-i] & 0b1100_0000) != 0b1000_0000) 740 return i; 741 } 742 } 743 throw new UTFException("Not the end of the UTF sequence", index); 744 } 745 746 /// Ditto 747 uint strideBack(S)(auto ref S str) 748 if (is(S : const char[]) || 749 (isRandomAccessRange!S && hasLength!S && is(immutable ElementType!S == immutable char))) 750 { 751 return strideBack(str, str.length); 752 } 753 754 /// Ditto 755 uint strideBack(S)(auto ref S str) 756 if (isBidirectionalRange!S && is(immutable ElementType!S == immutable char) && !isRandomAccessRange!S) 757 { 758 assert(!str.empty, "Past the end of the UTF-8 sequence"); 759 auto temp = str.save; 760 foreach (i; AliasSeq!(1, 2, 3, 4)) 761 { 762 if ((temp.back & 0b1100_0000) != 0b1000_0000) 763 return i; 764 temp.popBack(); 765 if (temp.empty) 766 break; 767 } 768 throw new UTFException("The last code unit is not the end of the UTF-8 sequence"); 769 } 770 771 @system unittest 772 { 773 import core.exception : AssertError; 774 import std.conv : to; 775 import std.exception; 776 import std.string : format; 777 import std.traits : FunctionAttribute, functionAttributes, isSafe; 778 static void test(string s, dchar c, size_t i = size_t.max, size_t line = __LINE__) 779 { 780 enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!char(c), 781 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 782 783 enforce(strideBack(RandomCU!char(s), i == size_t.max ? s.length : i) == codeLength!char(c), 784 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 785 786 auto refRandom = new RefRandomCU!char(s); 787 immutable randLen = refRandom.length; 788 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!char(c), 789 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 790 enforce(refRandom.length == randLen, 791 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 792 793 if (i == size_t.max) 794 { 795 enforce(strideBack(s) == codeLength!char(c), 796 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 797 798 enforce(strideBack(BidirCU!char(s)) == codeLength!char(c), 799 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 800 801 auto refBidir = new RefBidirCU!char(s); 802 immutable bidirLen = refBidir.length; 803 enforce(strideBack(refBidir) == codeLength!char(c), 804 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 805 enforce(refBidir.length == bidirLen, 806 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 807 } 808 } 809 810 assertCTFEable!( 811 { 812 test("a", 'a'); 813 test(" ", ' '); 814 test("\u2029", '\u2029'); //paraSep 815 test("\u0100", '\u0100'); 816 test("\u0430", '\u0430'); 817 test("\U00010143", '\U00010143'); 818 test("abcdefcdef", 'f'); 819 test("\U00010143\u0100\U00010143hello", 'o', 15); 820 test("\U00010143\u0100\U00010143hello", 'l', 14); 821 test("\U00010143\u0100\U00010143hello", 'l', 13); 822 test("\U00010143\u0100\U00010143hello", 'e', 12); 823 test("\U00010143\u0100\U00010143hello", 'h', 11); 824 test("\U00010143\u0100\U00010143hello", '\U00010143', 10); 825 test("\U00010143\u0100\U00010143hello", '\u0100', 6); 826 test("\U00010143\u0100\U00010143hello", '\U00010143', 4); 827 828 foreach (S; AliasSeq!(char[], const char[], string)) 829 { 830 enum str = to!S("hello world"); 831 static assert(isSafe!({ strideBack(str, 0); })); 832 static assert(isSafe!({ strideBack(str); })); 833 static assert((functionAttributes!({ strideBack(str, 0); }) & FunctionAttribute.pure_) != 0); 834 static assert((functionAttributes!({ strideBack(str); }) & FunctionAttribute.pure_) != 0); 835 } 836 }); 837 } 838 839 //UTF-16 is self synchronizing: The length of strideBack can be found from 840 //the value of a single wchar 841 /// Ditto 842 uint strideBack(S)(auto ref S str, size_t index) 843 if (is(S : const wchar[]) || 844 (isRandomAccessRange!S && is(immutable ElementType!S == immutable wchar))) 845 { 846 static if (is(typeof(str.length) : ulong)) 847 assert(index <= str.length, "Past the end of the UTF-16 sequence"); 848 assert(index > 0, "Not the end of a UTF-16 sequence"); 849 850 immutable c2 = str[index-1]; 851 return 1 + (0xDC00 <= c2 && c2 < 0xE000); 852 } 853 854 /// Ditto 855 uint strideBack(S)(auto ref S str) 856 if (is(S : const wchar[]) || 857 (isBidirectionalRange!S && is(immutable ElementType!S == immutable wchar))) 858 { 859 assert(!str.empty, "UTF-16 sequence is empty"); 860 861 static if (is(S : const(wchar)[])) 862 immutable c2 = str[$ - 1]; 863 else 864 immutable c2 = str.back; 865 866 return 1 + (0xDC00 <= c2 && c2 <= 0xE000); 867 } 868 869 @system unittest 870 { 871 import core.exception : AssertError; 872 import std.conv : to; 873 import std.exception; 874 import std.string : format; 875 import std.traits : FunctionAttribute, functionAttributes, isSafe; 876 static void test(wstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__) 877 { 878 enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!wchar(c), 879 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 880 881 enforce(strideBack(RandomCU!wchar(s), i == size_t.max ? s.length : i) == codeLength!wchar(c), 882 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 883 884 auto refRandom = new RefRandomCU!wchar(s); 885 immutable randLen = refRandom.length; 886 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!wchar(c), 887 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 888 enforce(refRandom.length == randLen, 889 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 890 891 if (i == size_t.max) 892 { 893 enforce(strideBack(s) == codeLength!wchar(c), 894 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 895 896 enforce(strideBack(BidirCU!wchar(s)) == codeLength!wchar(c), 897 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 898 899 auto refBidir = new RefBidirCU!wchar(s); 900 immutable bidirLen = refBidir.length; 901 enforce(strideBack(refBidir) == codeLength!wchar(c), 902 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 903 enforce(refBidir.length == bidirLen, 904 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 905 } 906 } 907 908 assertCTFEable!( 909 { 910 test("a", 'a'); 911 test(" ", ' '); 912 test("\u2029", '\u2029'); //paraSep 913 test("\u0100", '\u0100'); 914 test("\u0430", '\u0430'); 915 test("\U00010143", '\U00010143'); 916 test("abcdefcdef", 'f'); 917 test("\U00010143\u0100\U00010143hello", 'o', 10); 918 test("\U00010143\u0100\U00010143hello", 'l', 9); 919 test("\U00010143\u0100\U00010143hello", 'l', 8); 920 test("\U00010143\u0100\U00010143hello", 'e', 7); 921 test("\U00010143\u0100\U00010143hello", 'h', 6); 922 test("\U00010143\u0100\U00010143hello", '\U00010143', 5); 923 test("\U00010143\u0100\U00010143hello", '\u0100', 3); 924 test("\U00010143\u0100\U00010143hello", '\U00010143', 2); 925 926 foreach (S; AliasSeq!(wchar[], const wchar[], wstring)) 927 { 928 enum str = to!S("hello world"); 929 static assert(isSafe!(() => strideBack(str, 0))); 930 static assert(isSafe!(() => strideBack(str) )); 931 static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0); 932 static assert((functionAttributes!(() => strideBack(str) ) & FunctionAttribute.pure_) != 0); 933 } 934 }); 935 } 936 937 /// Ditto 938 uint strideBack(S)(auto ref S str, size_t index) 939 if (isRandomAccessRange!S && is(immutable ElementEncodingType!S == immutable dchar)) 940 { 941 static if (is(typeof(str.length) : ulong)) 942 assert(index <= str.length, "Past the end of the UTF-32 sequence"); 943 assert(index > 0, "Not the end of the UTF-32 sequence"); 944 return 1; 945 } 946 947 /// Ditto 948 uint strideBack(S)(auto ref S str) 949 if (isBidirectionalRange!S && is(immutable ElementEncodingType!S == immutable dchar)) 950 { 951 assert(!str.empty, "Empty UTF-32 sequence"); 952 return 1; 953 } 954 955 /// 956 @safe unittest 957 { 958 assert("a".strideBack == 1); 959 assert("λ".strideBack == 2); 960 assert("aλ".strideBack == 2); 961 assert("aλ".strideBack(1) == 1); 962 assert("𐐷".strideBack == 4); 963 } 964 965 @system unittest 966 { 967 import core.exception : AssertError; 968 import std.conv : to; 969 import std.exception; 970 import std.string : format; 971 import std.traits : FunctionAttribute, functionAttributes, isSafe; 972 static void test(dstring s, dchar c, size_t i = size_t.max, size_t line = __LINE__) 973 { 974 enforce(strideBack(s, i == size_t.max ? s.length : i) == codeLength!dchar(c), 975 new AssertError(format("Unit test failure string: %s", s), __FILE__, line)); 976 977 enforce(strideBack(RandomCU!dchar(s), i == size_t.max ? s.length : i) == codeLength!dchar(c), 978 new AssertError(format("Unit test failure range: %s", s), __FILE__, line)); 979 980 auto refRandom = new RefRandomCU!dchar(s); 981 immutable randLen = refRandom.length; 982 enforce(strideBack(refRandom, i == size_t.max ? s.length : i) == codeLength!dchar(c), 983 new AssertError(format("Unit test failure rand ref range: %s", s), __FILE__, line)); 984 enforce(refRandom.length == randLen, 985 new AssertError(format("Unit test failure rand ref range length: %s", s), __FILE__, line)); 986 987 if (i == size_t.max) 988 { 989 enforce(strideBack(s) == codeLength!dchar(c), 990 new AssertError(format("Unit test failure string code length: %s", s), __FILE__, line)); 991 992 enforce(strideBack(BidirCU!dchar(s)) == codeLength!dchar(c), 993 new AssertError(format("Unit test failure range code length: %s", s), __FILE__, line)); 994 995 auto refBidir = new RefBidirCU!dchar(s); 996 immutable bidirLen = refBidir.length; 997 enforce(strideBack(refBidir) == codeLength!dchar(c), 998 new AssertError(format("Unit test failure bidir ref range code length: %s", s), __FILE__, line)); 999 enforce(refBidir.length == bidirLen, 1000 new AssertError(format("Unit test failure bidir ref range length: %s", s), __FILE__, line)); 1001 } 1002 } 1003 1004 assertCTFEable!( 1005 { 1006 test("a", 'a'); 1007 test(" ", ' '); 1008 test("\u2029", '\u2029'); //paraSep 1009 test("\u0100", '\u0100'); 1010 test("\u0430", '\u0430'); 1011 test("\U00010143", '\U00010143'); 1012 test("abcdefcdef", 'f'); 1013 test("\U00010143\u0100\U00010143hello", 'o', 8); 1014 test("\U00010143\u0100\U00010143hello", 'l', 7); 1015 test("\U00010143\u0100\U00010143hello", 'l', 6); 1016 test("\U00010143\u0100\U00010143hello", 'e', 5); 1017 test("\U00010143\u0100\U00010143hello", 'h', 4); 1018 test("\U00010143\u0100\U00010143hello", '\U00010143', 3); 1019 test("\U00010143\u0100\U00010143hello", '\u0100', 2); 1020 test("\U00010143\u0100\U00010143hello", '\U00010143', 1); 1021 1022 foreach (S; AliasSeq!(dchar[], const dchar[], dstring)) 1023 { 1024 enum str = to!S("hello world"); 1025 static assert(isSafe!(() => strideBack(str, 0))); 1026 static assert(isSafe!(() => strideBack(str) )); 1027 static assert((functionAttributes!(() => strideBack(str, 0)) & FunctionAttribute.pure_) != 0); 1028 static assert((functionAttributes!(() => strideBack(str) ) & FunctionAttribute.pure_) != 0); 1029 } 1030 }); 1031 } 1032 1033 1034 /++ 1035 Given `index` into `str` and assuming that `index` is at the start 1036 of a UTF sequence, `toUCSindex` determines the number of UCS characters 1037 up to `index`. So, `index` is the index of a code unit at the 1038 beginning of a code point, and the return value is how many code points into 1039 the string that that code point is. 1040 +/ 1041 size_t toUCSindex(C)(const(C)[] str, size_t index) @safe pure 1042 if (isSomeChar!C) 1043 { 1044 static if (is(immutable C == immutable dchar)) 1045 return index; 1046 else 1047 { 1048 size_t n = 0; 1049 size_t j = 0; 1050 1051 for (; j < index; ++n) 1052 j += stride(str, j); 1053 1054 if (j > index) 1055 { 1056 static if (is(immutable C == immutable char)) 1057 throw new UTFException("Invalid UTF-8 sequence", index); 1058 else 1059 throw new UTFException("Invalid UTF-16 sequence", index); 1060 } 1061 1062 return n; 1063 } 1064 } 1065 1066 /// 1067 @safe unittest 1068 { 1069 assert(toUCSindex(`hello world`, 7) == 7); 1070 assert(toUCSindex(`hello world`w, 7) == 7); 1071 assert(toUCSindex(`hello world`d, 7) == 7); 1072 1073 assert(toUCSindex(`Ma Chérie`, 7) == 6); 1074 assert(toUCSindex(`Ma Chérie`w, 7) == 7); 1075 assert(toUCSindex(`Ma Chérie`d, 7) == 7); 1076 1077 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`, 9) == 3); 1078 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9); 1079 assert(toUCSindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9); 1080 } 1081 1082 1083 /++ 1084 Given a UCS index `n` into `str`, returns the UTF index. 1085 So, `n` is how many code points into the string the code point is, and 1086 the array index of the code unit is returned. 1087 +/ 1088 size_t toUTFindex(C)(const(C)[] str, size_t n) @safe pure 1089 if (isSomeChar!C) 1090 { 1091 static if (is(immutable C == immutable dchar)) 1092 { 1093 return n; 1094 } 1095 else 1096 { 1097 size_t i; 1098 while (n--) 1099 { 1100 i += stride(str, i); 1101 } 1102 return i; 1103 } 1104 } 1105 1106 /// 1107 @safe unittest 1108 { 1109 assert(toUTFindex(`hello world`, 7) == 7); 1110 assert(toUTFindex(`hello world`w, 7) == 7); 1111 assert(toUTFindex(`hello world`d, 7) == 7); 1112 1113 assert(toUTFindex(`Ma Chérie`, 6) == 7); 1114 assert(toUTFindex(`Ma Chérie`w, 7) == 7); 1115 assert(toUTFindex(`Ma Chérie`d, 7) == 7); 1116 1117 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`, 3) == 9); 1118 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`w, 9) == 9); 1119 assert(toUTFindex(`さいごの果実 / ミツバチと科学者`d, 9) == 9); 1120 } 1121 1122 1123 /* =================== Decode ======================= */ 1124 1125 /// Whether or not to replace invalid UTF with $(LREF replacementDchar) 1126 alias UseReplacementDchar = Flag!"useReplacementDchar"; 1127 1128 /++ 1129 Decodes and returns the code point starting at `str[index]`. `index` 1130 is advanced to one past the decoded code point. If the code point is not 1131 well-formed, then a `UTFException` is thrown and `index` remains 1132 unchanged. 1133 1134 decode will only work with strings and random access ranges of code units 1135 with length and slicing, whereas $(LREF decodeFront) will work with any 1136 input range of code units. 1137 1138 Params: 1139 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1140 str = input string or indexable Range 1141 index = starting index into s[]; incremented by number of code units processed 1142 1143 Returns: 1144 decoded character 1145 1146 Throws: 1147 $(LREF UTFException) if `str[index]` is not the start of a valid UTF 1148 sequence and useReplacementDchar is `No.useReplacementDchar` 1149 +/ 1150 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(auto ref S str, ref size_t index) 1151 if (!isSomeString!S && 1152 isRandomAccessRange!S && hasSlicing!S && hasLength!S && isSomeChar!(ElementType!S)) 1153 in 1154 { 1155 assert(index < str.length, "Attempted to decode past the end of a string"); 1156 } 1157 out (result) 1158 { 1159 assert(isValidDchar(result)); 1160 } 1161 do 1162 { 1163 if (str[index] < codeUnitLimit!S) 1164 return str[index++]; 1165 else 1166 return decodeImpl!(true, useReplacementDchar)(str, index); 1167 } 1168 1169 /// ditto 1170 dchar decode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1171 auto ref scope S str, ref size_t index) @trusted pure 1172 if (isSomeString!S) 1173 in 1174 { 1175 assert(index < str.length, "Attempted to decode past the end of a string"); 1176 } 1177 out (result) 1178 { 1179 assert(isValidDchar(result)); 1180 } 1181 do 1182 { 1183 if (str[index] < codeUnitLimit!S) 1184 return str[index++]; 1185 else static if (is(immutable S == immutable C[], C)) 1186 return decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index); 1187 } 1188 1189 /// 1190 @safe pure unittest 1191 { 1192 size_t i; 1193 1194 assert("a".decode(i) == 'a' && i == 1); 1195 i = 0; 1196 assert("å".decode(i) == 'å' && i == 2); 1197 i = 1; 1198 assert("aå".decode(i) == 'å' && i == 3); 1199 i = 0; 1200 assert("å"w.decode(i) == 'å' && i == 1); 1201 1202 // ë as a multi-code point grapheme 1203 i = 0; 1204 assert("e\u0308".decode(i) == 'e' && i == 1); 1205 // ë as a single code point grapheme 1206 i = 0; 1207 assert("ë".decode(i) == 'ë' && i == 2); 1208 i = 0; 1209 assert("ë"w.decode(i) == 'ë' && i == 1); 1210 } 1211 1212 @safe pure unittest // https://issues.dlang.org/show_bug.cgi?id=22867 1213 { 1214 import std.conv : hexString; 1215 string data = hexString!"f787a598"; 1216 size_t offset = 0; 1217 try data.decode(offset); 1218 catch (UTFException ex) assert(offset == 0); 1219 } 1220 1221 /++ 1222 `decodeFront` is a variant of $(LREF decode) which specifically decodes 1223 the first code point. Unlike $(LREF decode), `decodeFront` accepts any 1224 $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 1225 of code units (rather than just a string or random access 1226 range). It also takes the range by `ref` and pops off the elements as it 1227 decodes them. If `numCodeUnits` is passed in, it gets set to the number 1228 of code units which were in the code point which was decoded. 1229 1230 Params: 1231 useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1232 str = input string or indexable Range 1233 numCodeUnits = set to number of code units processed 1234 1235 Returns: 1236 decoded character 1237 1238 Throws: 1239 $(LREF UTFException) if `str.front` is not the start of a valid UTF 1240 sequence. If an exception is thrown, then there is no guarantee as to 1241 the number of code units which were popped off, as it depends on the 1242 type of range being used and how many code units had to be popped off 1243 before the code point was determined to be invalid. 1244 +/ 1245 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1246 ref S str, out size_t numCodeUnits) 1247 if (!isSomeString!S && isInputRange!S && isSomeChar!(ElementType!S)) 1248 in 1249 { 1250 assert(!str.empty); 1251 } 1252 out (result) 1253 { 1254 assert(isValidDchar(result)); 1255 } 1256 do 1257 { 1258 immutable fst = str.front; 1259 1260 if (fst < codeUnitLimit!S) 1261 { 1262 str.popFront(); 1263 numCodeUnits = 1; 1264 return fst; 1265 } 1266 else 1267 { 1268 // https://issues.dlang.org/show_bug.cgi?id=14447 forces canIndex to be 1269 // done outside of decodeImpl, which is undesirable, since not all 1270 // overloads of decodeImpl need it. So, it should be moved back into 1271 // decodeImpl once https://issues.dlang.org/show_bug.cgi?id=8521 1272 // has been fixed. 1273 enum canIndex = is(S : const char[]) || isRandomAccessRange!S && hasSlicing!S && hasLength!S; 1274 immutable retval = decodeImpl!(canIndex, useReplacementDchar)(str, numCodeUnits); 1275 1276 // The other range types were already popped by decodeImpl. 1277 static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1278 str = str[numCodeUnits .. str.length]; 1279 1280 return retval; 1281 } 1282 } 1283 1284 /// ditto 1285 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1286 ref scope S str, out size_t numCodeUnits) @trusted pure 1287 if (isSomeString!S) 1288 in 1289 { 1290 assert(!str.empty); 1291 } 1292 out (result) 1293 { 1294 assert(isValidDchar(result)); 1295 } 1296 do 1297 { 1298 if (str[0] < codeUnitLimit!S) 1299 { 1300 numCodeUnits = 1; 1301 immutable retval = str[0]; 1302 str = str[1 .. $]; 1303 return retval; 1304 } 1305 else static if (is(immutable S == immutable C[], C)) 1306 { 1307 immutable retval = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, numCodeUnits); 1308 str = str[numCodeUnits .. $]; 1309 return retval; 1310 } 1311 } 1312 1313 /++ Ditto +/ 1314 dchar decodeFront(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str) 1315 if (isInputRange!S && isSomeChar!(ElementType!S)) 1316 { 1317 size_t numCodeUnits; 1318 return decodeFront!useReplacementDchar(str, numCodeUnits); 1319 } 1320 1321 /// 1322 @safe pure unittest 1323 { 1324 import std.range.primitives; 1325 string str = "Hello, World!"; 1326 1327 assert(str.decodeFront == 'H' && str == "ello, World!"); 1328 str = "å"; 1329 assert(str.decodeFront == 'å' && str.empty); 1330 str = "å"; 1331 size_t i; 1332 assert(str.decodeFront(i) == 'å' && i == 2 && str.empty); 1333 } 1334 1335 /++ 1336 `decodeBack` is a variant of $(LREF decode) which specifically decodes 1337 the last code point. Unlike $(LREF decode), `decodeBack` accepts any 1338 bidirectional range of code units (rather than just a string or random access 1339 range). It also takes the range by `ref` and pops off the elements as it 1340 decodes them. If `numCodeUnits` is passed in, it gets set to the number 1341 of code units which were in the code point which was decoded. 1342 1343 Params: 1344 useReplacementDchar = if invalid UTF, return `replacementDchar` rather than throwing 1345 str = input string or bidirectional Range 1346 numCodeUnits = gives the number of code units processed 1347 1348 Returns: 1349 A decoded UTF character. 1350 1351 Throws: 1352 $(LREF UTFException) if `str.back` is not the end of a valid UTF 1353 sequence. If an exception is thrown, the `str` itself remains unchanged, 1354 but there is no guarantee as to the value of `numCodeUnits` (when passed). 1355 +/ 1356 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1357 ref S str, out size_t numCodeUnits) 1358 if (isSomeString!S) 1359 in 1360 { 1361 assert(!str.empty); 1362 } 1363 out (result) 1364 { 1365 assert(isValidDchar(result)); 1366 } 1367 do 1368 { 1369 if (str[$ - 1] < codeUnitLimit!S) 1370 { 1371 numCodeUnits = 1; 1372 immutable retval = str[$ - 1]; 1373 str = str[0 .. $ - 1]; 1374 return retval; 1375 } 1376 else static if (is(immutable S == immutable C[], C)) 1377 { 1378 numCodeUnits = strideBack(str); 1379 immutable newLength = str.length - numCodeUnits; 1380 size_t index = newLength; 1381 immutable retval = decodeImpl!(true, useReplacementDchar)(cast(const(C)[]) str, index); 1382 str = str[0 .. newLength]; 1383 return retval; 1384 } 1385 } 1386 1387 /++ Ditto +/ 1388 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1389 ref S str, out size_t numCodeUnits) 1390 if (!isSomeString!S && isSomeChar!(ElementType!S) && isBidirectionalRange!S 1391 && ((isRandomAccessRange!S && hasLength!S) || !isRandomAccessRange!S)) 1392 in 1393 { 1394 assert(!str.empty); 1395 } 1396 out (result) 1397 { 1398 assert(isValidDchar(result)); 1399 } 1400 do 1401 { 1402 if (str.back < codeUnitLimit!S) 1403 { 1404 numCodeUnits = 1; 1405 immutable retval = str.back; 1406 str.popBack(); 1407 return retval; 1408 } 1409 else 1410 { 1411 numCodeUnits = strideBack(str); 1412 static if (isRandomAccessRange!S) 1413 { 1414 size_t index = str.length - numCodeUnits; 1415 immutable retval = decodeImpl!(true, useReplacementDchar)(str, index); 1416 str.popBackExactly(numCodeUnits); 1417 return retval; 1418 } 1419 else 1420 { 1421 alias Char = typeof(cast() ElementType!S.init); 1422 Char[4] codeUnits = void; 1423 S tmp = str.save; 1424 for (size_t i = numCodeUnits; i > 0; ) 1425 { 1426 codeUnits[--i] = tmp.back; 1427 tmp.popBack(); 1428 } 1429 const Char[] codePoint = codeUnits[0 .. numCodeUnits]; 1430 size_t index = 0; 1431 immutable retval = decodeImpl!(true, useReplacementDchar)(codePoint, index); 1432 str = tmp; 1433 return retval; 1434 } 1435 } 1436 } 1437 1438 /++ Ditto +/ 1439 dchar decodeBack(UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)(ref S str) 1440 if (isSomeString!S 1441 || (isRandomAccessRange!S && hasLength!S && isSomeChar!(ElementType!S)) 1442 || (!isRandomAccessRange!S && isBidirectionalRange!S && isSomeChar!(ElementType!S))) 1443 in 1444 { 1445 assert(!str.empty); 1446 } 1447 out (result) 1448 { 1449 assert(isValidDchar(result)); 1450 } 1451 do 1452 { 1453 size_t numCodeUnits; 1454 return decodeBack!useReplacementDchar(str, numCodeUnits); 1455 } 1456 1457 /// 1458 @system pure unittest 1459 { 1460 import std.range.primitives; 1461 string str = "Hello, World!"; 1462 1463 assert(str.decodeBack == '!' && str == "Hello, World"); 1464 str = "å"; 1465 assert(str.decodeBack == 'å' && str.empty); 1466 str = "å"; 1467 size_t i; 1468 assert(str.decodeBack(i) == 'å' && i == 2 && str.empty); 1469 } 1470 1471 // For the given range, code unit values less than this 1472 // are guaranteed to be valid single-codepoint encodings. 1473 package template codeUnitLimit(S) 1474 if (isSomeChar!(ElementEncodingType!S)) 1475 { 1476 static if (is(immutable ElementEncodingType!S == immutable char)) 1477 enum char codeUnitLimit = 0x80; 1478 else static if (is(immutable ElementEncodingType!S == immutable wchar)) 1479 enum wchar codeUnitLimit = 0xD800; 1480 else 1481 enum dchar codeUnitLimit = 0xD800; 1482 } 1483 1484 /* 1485 * For strings, this function does its own bounds checking to give a 1486 * more useful error message when attempting to decode past the end of a string. 1487 * Subsequently it uses a pointer instead of an array to avoid 1488 * redundant bounds checking. 1489 * 1490 * The three overloads of this operate on chars, wchars, and dchars. 1491 * 1492 * Params: 1493 * canIndex = if S is indexable 1494 * useReplacementDchar = if invalid UTF, return replacementDchar rather than throwing 1495 * str = input string or Range 1496 * index = starting index into s[]; incremented by number of code units processed 1497 * 1498 * Returns: 1499 * decoded character 1500 */ 1501 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1502 auto ref S str, ref size_t index) 1503 if ( 1504 is(S : const char[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable char))) 1505 { 1506 /* The following encodings are valid, except for the 5 and 6 byte 1507 * combinations: 1508 * 0xxxxxxx 1509 * 110xxxxx 10xxxxxx 1510 * 1110xxxx 10xxxxxx 10xxxxxx 1511 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 1512 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 1513 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 1514 */ 1515 1516 /* Dchar bitmask for different numbers of UTF-8 code units. 1517 */ 1518 alias bitMask = AliasSeq!((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1); 1519 1520 static if (is(S : const char[])) 1521 auto pstr = str.ptr + index; // this is what makes decodeImpl() @system code 1522 else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1523 auto pstr = str[index .. str.length]; 1524 else 1525 alias pstr = str; 1526 1527 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done 1528 // outside of decodeImpl 1529 //enum canIndex = is(S : const char[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S); 1530 1531 static if (canIndex) 1532 { 1533 immutable length = str.length - index; 1534 ubyte fst = pstr[0]; 1535 } 1536 else 1537 { 1538 ubyte fst = pstr.front; 1539 pstr.popFront(); 1540 } 1541 1542 static if (!useReplacementDchar) 1543 { 1544 static if (canIndex) 1545 { 1546 static UTFException exception(S)(S str, string msg) 1547 { 1548 uint[4] sequence = void; 1549 size_t i; 1550 1551 do 1552 { 1553 sequence[i] = str[i]; 1554 } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80); 1555 1556 return new UTFException(msg, i).setSequence(sequence[0 .. i]); 1557 } 1558 } 1559 1560 UTFException invalidUTF() 1561 { 1562 static if (canIndex) 1563 return exception(pstr[0 .. length], "Invalid UTF-8 sequence"); 1564 else 1565 { 1566 //We can't include the invalid sequence with input strings without 1567 //saving each of the code units along the way, and we can't do it with 1568 //forward ranges without saving the entire range. Both would incur a 1569 //cost for the decoding of every character just to provide a better 1570 //error message for the (hopefully) rare case when an invalid UTF-8 1571 //sequence is encountered, so we don't bother trying to include the 1572 //invalid sequence here, unlike with strings and sliceable ranges. 1573 return new UTFException("Invalid UTF-8 sequence"); 1574 } 1575 } 1576 1577 UTFException outOfBounds() 1578 { 1579 static if (canIndex) 1580 return exception(pstr[0 .. length], "Attempted to decode past the end of a string"); 1581 else 1582 return new UTFException("Attempted to decode past the end of a string"); 1583 } 1584 } 1585 1586 if ((fst & 0b1100_0000) != 0b1100_0000) 1587 { 1588 static if (useReplacementDchar) 1589 { 1590 ++index; // always consume bad input to avoid infinite loops 1591 return replacementDchar; 1592 } 1593 else 1594 throw invalidUTF(); // starter must have at least 2 first bits set 1595 } 1596 ubyte tmp = void; 1597 dchar d = fst; // upper control bits are masked out later 1598 fst <<= 1; 1599 1600 foreach (i; AliasSeq!(1, 2, 3)) 1601 { 1602 1603 static if (canIndex) 1604 { 1605 if (i == length) 1606 { 1607 static if (useReplacementDchar) 1608 { 1609 index += i; 1610 return replacementDchar; 1611 } 1612 else 1613 throw outOfBounds(); 1614 } 1615 } 1616 else 1617 { 1618 if (pstr.empty) 1619 { 1620 static if (useReplacementDchar) 1621 { 1622 index += i; 1623 return replacementDchar; 1624 } 1625 else 1626 throw outOfBounds(); 1627 } 1628 } 1629 1630 static if (canIndex) 1631 tmp = pstr[i]; 1632 else 1633 { 1634 tmp = pstr.front; 1635 pstr.popFront(); 1636 } 1637 1638 if ((tmp & 0xC0) != 0x80) 1639 { 1640 static if (useReplacementDchar) 1641 { 1642 index += i + 1; 1643 return replacementDchar; 1644 } 1645 else 1646 throw invalidUTF(); 1647 } 1648 1649 d = (d << 6) | (tmp & 0x3F); 1650 fst <<= 1; 1651 1652 if (!(fst & 0x80)) // no more bytes 1653 { 1654 d &= bitMask[i]; // mask out control bits 1655 1656 // overlong, could have been encoded with i bytes 1657 if ((d & ~bitMask[i - 1]) == 0) 1658 { 1659 static if (useReplacementDchar) 1660 { 1661 index += i + 1; 1662 return replacementDchar; 1663 } 1664 else 1665 throw invalidUTF(); 1666 } 1667 1668 // check for surrogates only needed for 3 bytes 1669 static if (i == 2) 1670 { 1671 if (!isValidDchar(d)) 1672 { 1673 static if (useReplacementDchar) 1674 { 1675 index += i + 1; 1676 return replacementDchar; 1677 } 1678 else 1679 throw invalidUTF(); 1680 } 1681 } 1682 1683 static if (i == 3) 1684 { 1685 if (d > dchar.max) 1686 { 1687 static if (useReplacementDchar) 1688 d = replacementDchar; 1689 else 1690 throw invalidUTF(); 1691 } 1692 } 1693 1694 index += i + 1; 1695 return d; 1696 } 1697 } 1698 1699 static if (useReplacementDchar) 1700 { 1701 index += 4; // read 4 chars by now 1702 return replacementDchar; 1703 } 1704 else 1705 throw invalidUTF(); 1706 } 1707 1708 @safe pure @nogc nothrow 1709 unittest 1710 { 1711 // Add tests for useReplacemendDchar == yes path 1712 1713 static struct R 1714 { 1715 @safe pure @nogc nothrow: 1716 this(string s) { this.s = s; } 1717 @property bool empty() { return idx == s.length; } 1718 @property char front() { return s[idx]; } 1719 void popFront() { ++idx; } 1720 size_t idx; 1721 string s; 1722 } 1723 1724 foreach (s; invalidUTFstrings!char()) 1725 { 1726 auto r = R(s); 1727 size_t index; 1728 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index); 1729 assert(dc == replacementDchar); 1730 assert(1 <= index && index <= s.length); 1731 } 1732 } 1733 1734 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S) 1735 (auto ref S str, ref size_t index) 1736 if (is(S : const wchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable wchar))) 1737 { 1738 static if (is(S : const wchar[])) 1739 auto pstr = str.ptr + index; 1740 else static if (isRandomAccessRange!S && hasSlicing!S && hasLength!S) 1741 auto pstr = str[index .. str.length]; 1742 else 1743 alias pstr = str; 1744 1745 // https://issues.dlang.org/show_bug.cgi?id=14447 forces this to be done 1746 // outside of decodeImpl 1747 //enum canIndex = is(S : const wchar[]) || (isRandomAccessRange!S && hasSlicing!S && hasLength!S); 1748 1749 static if (canIndex) 1750 { 1751 immutable length = str.length - index; 1752 uint u = pstr[0]; 1753 } 1754 else 1755 { 1756 uint u = pstr.front; 1757 pstr.popFront(); 1758 } 1759 1760 static if (!useReplacementDchar) 1761 { 1762 UTFException exception(string msg) 1763 { 1764 static if (canIndex) 1765 return new UTFException(msg).setSequence(pstr[0]); 1766 else 1767 return new UTFException(msg); 1768 } 1769 } 1770 1771 // The < case must be taken care of before decodeImpl is called. 1772 assert(u >= 0xD800); 1773 1774 if (u <= 0xDBFF) 1775 { 1776 static if (canIndex) 1777 immutable onlyOneCodeUnit = length == 1; 1778 else 1779 immutable onlyOneCodeUnit = pstr.empty; 1780 1781 if (onlyOneCodeUnit) 1782 { 1783 static if (useReplacementDchar) 1784 { 1785 ++index; 1786 return replacementDchar; 1787 } 1788 else 1789 throw exception("surrogate UTF-16 high value past end of string"); 1790 } 1791 1792 static if (canIndex) 1793 immutable uint u2 = pstr[1]; 1794 else 1795 { 1796 immutable uint u2 = pstr.front; 1797 pstr.popFront(); 1798 } 1799 1800 if (u2 < 0xDC00 || u2 > 0xDFFF) 1801 { 1802 static if (useReplacementDchar) 1803 u = replacementDchar; 1804 else 1805 throw exception("surrogate UTF-16 low value out of range"); 1806 } 1807 else 1808 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); 1809 ++index; 1810 } 1811 else if (u >= 0xDC00 && u <= 0xDFFF) 1812 { 1813 static if (useReplacementDchar) 1814 u = replacementDchar; 1815 else 1816 throw exception("unpaired surrogate UTF-16 value"); 1817 } 1818 ++index; 1819 1820 // Note: u+FFFE and u+FFFF are specifically permitted by the 1821 // Unicode standard for application internal use (see isValidDchar) 1822 1823 return cast(dchar) u; 1824 } 1825 1826 @safe pure @nogc nothrow 1827 unittest 1828 { 1829 // Add tests for useReplacemendDchar == true path 1830 1831 static struct R 1832 { 1833 @safe pure @nogc nothrow: 1834 this(wstring s) { this.s = s; } 1835 @property bool empty() { return idx == s.length; } 1836 @property wchar front() { return s[idx]; } 1837 void popFront() { ++idx; } 1838 size_t idx; 1839 wstring s; 1840 } 1841 1842 foreach (s; invalidUTFstrings!wchar()) 1843 { 1844 auto r = R(s); 1845 size_t index; 1846 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index); 1847 assert(dc == replacementDchar); 1848 assert(1 <= index && index <= s.length); 1849 } 1850 } 1851 1852 private dchar decodeImpl(bool canIndex, UseReplacementDchar useReplacementDchar = No.useReplacementDchar, S)( 1853 auto ref S str, ref size_t index) 1854 if (is(S : const dchar[]) || (isInputRange!S && is(immutable ElementEncodingType!S == immutable dchar))) 1855 { 1856 static if (is(S : const dchar[])) 1857 auto pstr = str.ptr; 1858 else 1859 alias pstr = str; 1860 1861 static if (is(S : const dchar[]) || isRandomAccessRange!S) 1862 { 1863 dchar dc = pstr[index]; 1864 if (!isValidDchar(dc)) 1865 { 1866 static if (useReplacementDchar) 1867 dc = replacementDchar; 1868 else 1869 throw new UTFException("Invalid UTF-32 value").setSequence(dc); 1870 } 1871 ++index; 1872 return dc; 1873 } 1874 else 1875 { 1876 dchar dc = pstr.front; 1877 if (!isValidDchar(dc)) 1878 { 1879 static if (useReplacementDchar) 1880 dc = replacementDchar; 1881 else 1882 throw new UTFException("Invalid UTF-32 value").setSequence(dc); 1883 } 1884 ++index; 1885 pstr.popFront(); 1886 return dc; 1887 } 1888 } 1889 1890 @safe pure @nogc nothrow 1891 unittest 1892 { 1893 // Add tests for useReplacemendDchar == true path 1894 1895 static struct R 1896 { 1897 @safe pure @nogc nothrow: 1898 this(dstring s) { this.s = s; } 1899 @property bool empty() { return idx == s.length; } 1900 @property dchar front() { return s[idx]; } 1901 void popFront() { ++idx; } 1902 size_t idx; 1903 dstring s; 1904 } 1905 1906 foreach (s; invalidUTFstrings!dchar()) 1907 { 1908 auto r = R(s); 1909 size_t index; 1910 dchar dc = decodeImpl!(false, Yes.useReplacementDchar)(r, index); 1911 assert(dc == replacementDchar); 1912 assert(1 <= index && index <= s.length); 1913 } 1914 } 1915 1916 1917 version (StdUnittest) private void testDecode(R)(R range, 1918 size_t index, 1919 dchar expectedChar, 1920 size_t expectedIndex, 1921 size_t line = __LINE__) 1922 { 1923 import core.exception : AssertError; 1924 import std.exception : enforce; 1925 import std.string : format; 1926 import std.traits : isNarrowString; 1927 1928 static if (hasLength!R) 1929 immutable lenBefore = range.length; 1930 1931 static if (isRandomAccessRange!R && !isNarrowString!R) 1932 { 1933 { 1934 immutable result = decode(range, index); 1935 enforce(result == expectedChar, 1936 new AssertError(format("decode: Wrong character: %s", result), __FILE__, line)); 1937 enforce(index == expectedIndex, 1938 new AssertError(format("decode: Wrong index: %s", index), __FILE__, line)); 1939 static if (hasLength!R) 1940 { 1941 enforce(range.length == lenBefore, 1942 new AssertError(format("decode: length changed: %s", range.length), __FILE__, line)); 1943 } 1944 } 1945 } 1946 } 1947 1948 version (StdUnittest) private void testDecodeFront(R)(ref R range, 1949 dchar expectedChar, 1950 size_t expectedNumCodeUnits, 1951 size_t line = __LINE__) 1952 { 1953 import core.exception : AssertError; 1954 import std.exception : enforce; 1955 import std.string : format; 1956 1957 static if (hasLength!R) 1958 immutable lenBefore = range.length; 1959 1960 size_t numCodeUnits; 1961 immutable result = decodeFront(range, numCodeUnits); 1962 enforce(result == expectedChar, 1963 new AssertError(format("decodeFront: Wrong character: %s", result), __FILE__, line)); 1964 enforce(numCodeUnits == expectedNumCodeUnits, 1965 new AssertError(format("decodeFront: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line)); 1966 1967 static if (hasLength!R) 1968 { 1969 enforce(range.length == lenBefore - numCodeUnits, 1970 new AssertError(format("decodeFront: wrong length: %s", range.length), __FILE__, line)); 1971 } 1972 } 1973 1974 version (StdUnittest) private void testDecodeBack(R)(ref R range, 1975 dchar expectedChar, 1976 size_t expectedNumCodeUnits, 1977 size_t line = __LINE__) 1978 { 1979 // This condition is to allow unit testing all `decode` functions together 1980 static if (!isBidirectionalRange!R) 1981 return; 1982 else 1983 { 1984 import core.exception : AssertError; 1985 import std.exception : enforce; 1986 import std.string : format; 1987 1988 static if (hasLength!R) 1989 immutable lenBefore = range.length; 1990 1991 size_t numCodeUnits; 1992 immutable result = decodeBack(range, numCodeUnits); 1993 enforce(result == expectedChar, 1994 new AssertError(format("decodeBack: Wrong character: %s", result), __FILE__, line)); 1995 enforce(numCodeUnits == expectedNumCodeUnits, 1996 new AssertError(format("decodeBack: Wrong numCodeUnits: %s", numCodeUnits), __FILE__, line)); 1997 1998 static if (hasLength!R) 1999 { 2000 enforce(range.length == lenBefore - numCodeUnits, 2001 new AssertError(format("decodeBack: wrong length: %s", range.length), __FILE__, line)); 2002 } 2003 } 2004 } 2005 2006 version (StdUnittest) private void testAllDecode(R)(R range, 2007 dchar expectedChar, 2008 size_t expectedIndex, 2009 size_t line = __LINE__) 2010 { 2011 testDecode(range, 0, expectedChar, expectedIndex, line); 2012 static if (isBidirectionalRange!R) 2013 { 2014 auto rangeCopy = range.save; 2015 testDecodeBack(rangeCopy, expectedChar, expectedIndex, line); 2016 } 2017 testDecodeFront(range, expectedChar, expectedIndex, line); 2018 } 2019 2020 version (StdUnittest) private void testBadDecode(R)(R range, size_t index, size_t line = __LINE__) 2021 { 2022 import core.exception : AssertError; 2023 import std.exception : assertThrown, enforce; 2024 import std.string : format; 2025 2026 immutable initialIndex = index; 2027 2028 static if (hasLength!R) 2029 immutable lenBefore = range.length; 2030 2031 static if (isRandomAccessRange!R) 2032 { 2033 assertThrown!UTFException(decode(range, index), null, __FILE__, line); 2034 enforce(index == initialIndex, 2035 new AssertError(format("decode: Wrong index: %s", index), __FILE__, line)); 2036 static if (hasLength!R) 2037 { 2038 enforce(range.length == lenBefore, 2039 new AssertError(format("decode: length changed:", range.length), __FILE__, line)); 2040 } 2041 } 2042 2043 if (initialIndex == 0) 2044 assertThrown!UTFException(decodeFront(range, index), null, __FILE__, line); 2045 } 2046 2047 version (StdUnittest) private void testBadDecodeBack(R)(R range, size_t line = __LINE__) 2048 { 2049 // This condition is to allow unit testing all `decode` functions together 2050 static if (!isBidirectionalRange!R) 2051 return; 2052 else 2053 { 2054 import core.exception : AssertError; 2055 import std.exception : assertThrown, enforce; 2056 import std.string : format; 2057 2058 static if (hasLength!R) 2059 immutable lenBefore = range.length; 2060 2061 static if (isRandomAccessRange!R) 2062 { 2063 assertThrown!UTFException(decodeBack(range), null, __FILE__, line); 2064 static if (hasLength!R) 2065 { 2066 enforce(range.length == lenBefore, 2067 new AssertError(format("decodeBack: length changed:", range.length), __FILE__, line)); 2068 } 2069 } 2070 } 2071 } 2072 2073 @system unittest 2074 { 2075 import std.conv : to; 2076 import std.exception; 2077 2078 assertCTFEable!( 2079 { 2080 foreach (S; AliasSeq!(to!string, InputCU!char, RandomCU!char, 2081 (string s) => new RefBidirCU!char(s), 2082 (string s) => new RefRandomCU!char(s))) 2083 { 2084 enum sHasLength = hasLength!(typeof(S("abcd"))); 2085 2086 { 2087 auto range = S("abcd"); 2088 testDecode(range, 0, 'a', 1); 2089 testDecode(range, 1, 'b', 2); 2090 testDecodeFront(range, 'a', 1); 2091 testDecodeFront(range, 'b', 1); 2092 assert(decodeFront(range) == 'c'); 2093 assert(decodeFront(range) == 'd'); 2094 } 2095 2096 { 2097 auto range = S("ウェブサイト"); 2098 testDecode(range, 0, 'ウ', 3); 2099 testDecode(range, 3, 'ェ', 6); 2100 testDecodeFront(range, 'ウ', 3); 2101 testDecodeFront(range, 'ェ', 3); 2102 assert(decodeFront(range) == 'ブ'); 2103 assert(decodeFront(range) == 'サ'); 2104 } 2105 2106 { 2107 auto range = S("abcd"); 2108 testDecodeBack(range, 'd', 1); 2109 testDecodeBack(range, 'c', 1); 2110 testDecodeBack(range, 'b', 1); 2111 testDecodeBack(range, 'a', 1); 2112 } 2113 2114 { 2115 auto range = S("ウェブサイト"); 2116 testDecodeBack(range, 'ト', 3); 2117 testDecodeBack(range, 'イ', 3); 2118 testDecodeBack(range, 'サ', 3); 2119 testDecodeBack(range, 'ブ', 3); 2120 } 2121 2122 testAllDecode(S("\xC2\xA9"), '\u00A9', 2); 2123 testAllDecode(S("\xE2\x89\xA0"), '\u2260', 3); 2124 2125 foreach (str; ["\xE2\x89", // too short 2126 "\xC0\x8A", 2127 "\xE0\x80\x8A", 2128 "\xF0\x80\x80\x8A", 2129 "\xF8\x80\x80\x80\x8A", 2130 "\xFC\x80\x80\x80\x80\x8A"]) 2131 { 2132 testBadDecode(S(str), 0); 2133 testBadDecode(S(str), 1); 2134 testBadDecodeBack(S(str)); 2135 } 2136 2137 //Invalid UTF-8 sequence where the first code unit is valid. 2138 testAllDecode(S("\xEF\xBF\xBE"), cast(dchar) 0xFFFE, 3); 2139 testAllDecode(S("\xEF\xBF\xBF"), cast(dchar) 0xFFFF, 3); 2140 2141 //Invalid UTF-8 sequence where the first code unit isn't valid. 2142 foreach (str; ["\xED\xA0\x80", 2143 "\xED\xAD\xBF", 2144 "\xED\xAE\x80", 2145 "\xED\xAF\xBF", 2146 "\xED\xB0\x80", 2147 "\xED\xBE\x80", 2148 "\xED\xBF\xBF"]) 2149 { 2150 testBadDecode(S(str), 0); 2151 testBadDecodeBack(S(str)); 2152 } 2153 } 2154 }); 2155 } 2156 2157 @system unittest 2158 { 2159 import std.exception; 2160 assertCTFEable!( 2161 { 2162 foreach (S; AliasSeq!((wstring s) => s, InputCU!wchar, RandomCU!wchar, 2163 (wstring s) => new RefBidirCU!wchar(s), 2164 (wstring s) => new RefRandomCU!wchar(s))) 2165 { 2166 testAllDecode(S([cast(wchar) 0x1111]), cast(dchar) 0x1111, 1); 2167 testAllDecode(S([cast(wchar) 0xD800, cast(wchar) 0xDC00]), cast(dchar) 0x10000, 2); 2168 testAllDecode(S([cast(wchar) 0xDBFF, cast(wchar) 0xDFFF]), cast(dchar) 0x10FFFF, 2); 2169 testAllDecode(S([cast(wchar) 0xFFFE]), cast(dchar) 0xFFFE, 1); 2170 testAllDecode(S([cast(wchar) 0xFFFF]), cast(dchar) 0xFFFF, 1); 2171 2172 testBadDecode(S([ cast(wchar) 0xD801 ]), 0); 2173 testBadDecode(S([ cast(wchar) 0xD800, cast(wchar) 0x1200 ]), 0); 2174 2175 testBadDecodeBack(S([ cast(wchar) 0xD801 ])); 2176 testBadDecodeBack(S([ cast(wchar) 0x0010, cast(wchar) 0xD800 ])); 2177 2178 { 2179 auto range = S("ウェブサイト"); 2180 testDecode(range, 0, 'ウ', 1); 2181 testDecode(range, 1, 'ェ', 2); 2182 testDecodeFront(range, 'ウ', 1); 2183 testDecodeFront(range, 'ェ', 1); 2184 assert(decodeFront(range) == 'ブ'); 2185 assert(decodeFront(range) == 'サ'); 2186 } 2187 2188 { 2189 auto range = S("ウェブサイト"); 2190 testDecodeBack(range, 'ト', 1); 2191 testDecodeBack(range, 'イ', 1); 2192 testDecodeBack(range, 'サ', 1); 2193 testDecodeBack(range, 'ブ', 1); 2194 } 2195 } 2196 2197 foreach (S; AliasSeq!((wchar[] s) => s.idup, RandomCU!wchar, (wstring s) => new RefRandomCU!wchar(s))) 2198 { 2199 auto str = S([cast(wchar) 0xD800, cast(wchar) 0xDC00, 2200 cast(wchar) 0x1400, 2201 cast(wchar) 0xDAA7, cast(wchar) 0xDDDE]); 2202 testDecode(str, 0, cast(dchar) 0x10000, 2); 2203 testDecode(str, 2, cast(dchar) 0x1400, 3); 2204 testDecode(str, 3, cast(dchar) 0xB9DDE, 5); 2205 testDecodeBack(str, cast(dchar) 0xB9DDE, 2); 2206 testDecodeBack(str, cast(dchar) 0x1400, 1); 2207 testDecodeBack(str, cast(dchar) 0x10000, 2); 2208 } 2209 }); 2210 } 2211 2212 @system unittest 2213 { 2214 import std.exception; 2215 assertCTFEable!( 2216 { 2217 foreach (S; AliasSeq!((dstring s) => s, RandomCU!dchar, InputCU!dchar, 2218 (dstring s) => new RefBidirCU!dchar(s), 2219 (dstring s) => new RefRandomCU!dchar(s))) 2220 { 2221 testAllDecode(S([cast(dchar) 0x1111]), cast(dchar) 0x1111, 1); 2222 testAllDecode(S([cast(dchar) 0x10000]), cast(dchar) 0x10000, 1); 2223 testAllDecode(S([cast(dchar) 0x10FFFF]), cast(dchar) 0x10FFFF, 1); 2224 testAllDecode(S([cast(dchar) 0xFFFE]), cast(dchar) 0xFFFE, 1); 2225 testAllDecode(S([cast(dchar) 0xFFFF]), cast(dchar) 0xFFFF, 1); 2226 2227 testBadDecode(S([cast(dchar) 0xD800]), 0); 2228 testBadDecode(S([cast(dchar) 0xDFFE]), 0); 2229 testBadDecode(S([cast(dchar) 0x110000]), 0); 2230 2231 testBadDecodeBack(S([cast(dchar) 0xD800])); 2232 testBadDecodeBack(S([cast(dchar) 0xDFFE])); 2233 testBadDecodeBack(S([cast(dchar) 0x110000])); 2234 2235 { 2236 auto range = S("ウェブサイト"); 2237 testDecode(range, 0, 'ウ', 1); 2238 testDecode(range, 1, 'ェ', 2); 2239 testDecodeFront(range, 'ウ', 1); 2240 testDecodeFront(range, 'ェ', 1); 2241 assert(decodeFront(range) == 'ブ'); 2242 assert(decodeFront(range) == 'サ'); 2243 } 2244 2245 { 2246 auto range = S("ウェブサイト"); 2247 testDecodeBack(range, 'ト', 1); 2248 testDecodeBack(range, 'イ', 1); 2249 testDecodeBack(range, 'サ', 1); 2250 testDecodeBack(range, 'ブ', 1); 2251 } 2252 } 2253 2254 foreach (S; AliasSeq!((dchar[] s) => s.idup, RandomCU!dchar, (dstring s) => new RefRandomCU!dchar(s))) 2255 { 2256 auto str = S([cast(dchar) 0x10000, cast(dchar) 0x1400, cast(dchar) 0xB9DDE]); 2257 testDecode(str, 0, 0x10000, 1); 2258 testDecode(str, 1, 0x1400, 2); 2259 testDecode(str, 2, 0xB9DDE, 3); 2260 testDecodeBack(str, cast(dchar) 0xB9DDE, 1); 2261 testDecodeBack(str, cast(dchar) 0x1400, 1); 2262 testDecodeBack(str, cast(dchar) 0x10000, 1); 2263 } 2264 }); 2265 } 2266 2267 @safe unittest 2268 { 2269 import std.exception; 2270 import std.traits : FunctionAttribute, functionAttributes, isSafe; 2271 assertCTFEable!( 2272 { 2273 foreach (S; AliasSeq!( char[], const( char)[], string, 2274 wchar[], const(wchar)[], wstring, 2275 dchar[], const(dchar)[], dstring)) 2276 { 2277 static assert(isSafe!({ S str; size_t i = 0; decode(str, i); })); 2278 static assert(isSafe!({ S str; size_t i = 0; decodeFront(str, i); })); 2279 static assert(isSafe!({ S str; decodeFront(str); })); 2280 static assert((functionAttributes!({ S str; size_t i = 0; decode(str, i); }) & FunctionAttribute.pure_) != 0); 2281 static assert((functionAttributes!({ 2282 S str; size_t i = 0; decodeFront(str, i); 2283 }) & FunctionAttribute.pure_) != 0); 2284 static assert((functionAttributes!({ S str; decodeFront(str); }) & FunctionAttribute.pure_) != 0); 2285 static assert((functionAttributes!({ 2286 S str; size_t i = 0; decodeBack(str, i); 2287 }) & FunctionAttribute.pure_) != 0); 2288 static assert((functionAttributes!({ S str; decodeBack(str); }) & FunctionAttribute.pure_) != 0); 2289 } 2290 }); 2291 } 2292 2293 @safe unittest 2294 { 2295 import std.exception; 2296 char[4] val; 2297 val[0] = 0b1111_0111; 2298 val[1] = 0b1011_1111; 2299 val[2] = 0b1011_1111; 2300 val[3] = 0b1011_1111; 2301 size_t i = 0; 2302 assertThrown!UTFException((){ dchar ch = decode(val[], i); }()); 2303 } 2304 /* =================== Encode ======================= */ 2305 2306 private dchar _utfException(UseReplacementDchar useReplacementDchar)(string msg, dchar c) 2307 { 2308 static if (useReplacementDchar) 2309 return replacementDchar; 2310 else 2311 throw new UTFException(msg).setSequence(c); 2312 } 2313 2314 /++ 2315 Encodes `c` into the static array, `buf`, and returns the actual 2316 length of the encoded character (a number between `1` and `4` for 2317 `char[4]` buffers and a number between `1` and `2` for 2318 `wchar[2]` buffers). 2319 2320 Throws: 2321 `UTFException` if `c` is not a valid UTF code point. 2322 +/ 2323 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2324 out char[4] buf, dchar c) @safe pure 2325 { 2326 if (c <= 0x7F) 2327 { 2328 assert(isValidDchar(c)); 2329 buf[0] = cast(char) c; 2330 return 1; 2331 } 2332 if (c <= 0x7FF) 2333 { 2334 assert(isValidDchar(c)); 2335 buf[0] = cast(char)(0xC0 | (c >> 6)); 2336 buf[1] = cast(char)(0x80 | (c & 0x3F)); 2337 return 2; 2338 } 2339 if (c <= 0xFFFF) 2340 { 2341 if (0xD800 <= c && c <= 0xDFFF) 2342 c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c); 2343 2344 assert(isValidDchar(c)); 2345 L3: 2346 buf[0] = cast(char)(0xE0 | (c >> 12)); 2347 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 2348 buf[2] = cast(char)(0x80 | (c & 0x3F)); 2349 return 3; 2350 } 2351 if (c <= 0x10FFFF) 2352 { 2353 assert(isValidDchar(c)); 2354 buf[0] = cast(char)(0xF0 | (c >> 18)); 2355 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 2356 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 2357 buf[3] = cast(char)(0x80 | (c & 0x3F)); 2358 return 4; 2359 } 2360 2361 assert(!isValidDchar(c)); 2362 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c); 2363 goto L3; 2364 } 2365 2366 /// 2367 @safe unittest 2368 { 2369 import std.exception : assertThrown; 2370 import std.typecons : Yes; 2371 2372 char[4] buf; 2373 2374 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000"); 2375 assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F"); 2376 assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080"); 2377 assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000"); 2378 assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE"); 2379 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2380 2381 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2382 auto slice = buf[]; 2383 assert(slice.decodeFront == replacementDchar); 2384 } 2385 2386 /// 2387 @safe unittest 2388 { 2389 import std.exception : assertThrown; 2390 import std.typecons : Yes; 2391 2392 wchar[2] buf; 2393 2394 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000"); 2395 assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF"); 2396 assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000"); 2397 assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000"); 2398 assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF"); 2399 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2400 2401 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2402 auto slice = buf[]; 2403 assert(slice.decodeFront == replacementDchar); 2404 } 2405 2406 /// 2407 @safe unittest 2408 { 2409 import std.exception : assertThrown; 2410 import std.typecons : Yes; 2411 2412 dchar[1] buf; 2413 2414 assert(encode(buf, '\u0000') == 1 && buf[0] == '\u0000'); 2415 assert(encode(buf, '\uD7FF') == 1 && buf[0] == '\uD7FF'); 2416 assert(encode(buf, '\uE000') == 1 && buf[0] == '\uE000'); 2417 assert(encode(buf, '\U0010FFFF') == 1 && buf[0] == '\U0010FFFF'); 2418 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2419 2420 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2421 assert(buf[0] == replacementDchar); 2422 } 2423 2424 @safe unittest 2425 { 2426 import std.exception; 2427 assertCTFEable!( 2428 { 2429 char[4] buf; 2430 2431 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000"); 2432 assert(encode(buf, '\u007F') == 1 && buf[0 .. 1] == "\u007F"); 2433 assert(encode(buf, '\u0080') == 2 && buf[0 .. 2] == "\u0080"); 2434 assert(encode(buf, '\u07FF') == 2 && buf[0 .. 2] == "\u07FF"); 2435 assert(encode(buf, '\u0800') == 3 && buf[0 .. 3] == "\u0800"); 2436 assert(encode(buf, '\uD7FF') == 3 && buf[0 .. 3] == "\uD7FF"); 2437 assert(encode(buf, '\uE000') == 3 && buf[0 .. 3] == "\uE000"); 2438 assert(encode(buf, 0xFFFE) == 3 && buf[0 .. 3] == "\xEF\xBF\xBE"); 2439 assert(encode(buf, 0xFFFF) == 3 && buf[0 .. 3] == "\xEF\xBF\xBF"); 2440 assert(encode(buf, '\U00010000') == 4 && buf[0 .. 4] == "\U00010000"); 2441 assert(encode(buf, '\U0010FFFF') == 4 && buf[0 .. 4] == "\U0010FFFF"); 2442 2443 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2444 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2445 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2446 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2447 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2448 2449 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride); 2450 enum replacementDcharString = "\uFFFD"; 2451 assert(buf[0 .. replacementDcharString.length] == replacementDcharString); 2452 }); 2453 } 2454 2455 2456 /// Ditto 2457 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2458 out wchar[2] buf, dchar c) @safe pure 2459 { 2460 if (c <= 0xFFFF) 2461 { 2462 if (0xD800 <= c && c <= 0xDFFF) 2463 c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c); 2464 2465 assert(isValidDchar(c)); 2466 L1: 2467 buf[0] = cast(wchar) c; 2468 return 1; 2469 } 2470 if (c <= 0x10FFFF) 2471 { 2472 assert(isValidDchar(c)); 2473 buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 2474 buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); 2475 return 2; 2476 } 2477 2478 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c); 2479 goto L1; 2480 } 2481 2482 @safe unittest 2483 { 2484 import std.exception; 2485 assertCTFEable!( 2486 { 2487 wchar[2] buf; 2488 2489 assert(encode(buf, '\u0000') == 1 && buf[0 .. 1] == "\u0000"); 2490 assert(encode(buf, '\uD7FF') == 1 && buf[0 .. 1] == "\uD7FF"); 2491 assert(encode(buf, '\uE000') == 1 && buf[0 .. 1] == "\uE000"); 2492 assert(encode(buf, 0xFFFE) == 1 && buf[0] == 0xFFFE); 2493 assert(encode(buf, 0xFFFF) == 1 && buf[0] == 0xFFFF); 2494 assert(encode(buf, '\U00010000') == 2 && buf[0 .. 2] == "\U00010000"); 2495 assert(encode(buf, '\U0010FFFF') == 2 && buf[0 .. 2] == "\U0010FFFF"); 2496 2497 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2498 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2499 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2500 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2501 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2502 2503 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride); 2504 assert(buf.front == replacementDchar); 2505 }); 2506 } 2507 2508 2509 /// Ditto 2510 size_t encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2511 out dchar[1] buf, dchar c) @safe pure 2512 { 2513 if ((0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c) 2514 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c); 2515 else 2516 assert(isValidDchar(c)); 2517 buf[0] = c; 2518 return 1; 2519 } 2520 2521 @safe unittest 2522 { 2523 import std.exception; 2524 assertCTFEable!( 2525 { 2526 dchar[1] buf; 2527 2528 encode(buf, '\u0000'); assert(buf[0] == '\u0000'); 2529 encode(buf, '\uD7FF'); assert(buf[0] == '\uD7FF'); 2530 encode(buf, '\uE000'); assert(buf[0] == '\uE000'); 2531 encode(buf, 0xFFFE); assert(buf[0] == 0xFFFE); 2532 encode(buf, 0xFFFF); assert(buf[0] == 0xFFFF); 2533 encode(buf, '\U0010FFFF'); assert(buf[0] == '\U0010FFFF'); 2534 2535 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2536 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2537 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2538 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2539 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2540 2541 assert(encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000) == buf.stride); 2542 assert(buf.front == replacementDchar); 2543 }); 2544 } 2545 2546 2547 /++ 2548 Encodes `c` in `str`'s encoding and appends it to `str`. 2549 2550 Throws: 2551 `UTFException` if `c` is not a valid UTF code point. 2552 +/ 2553 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2554 ref scope char[] str, dchar c) @safe pure 2555 { 2556 if (c <= 0x7F) 2557 { 2558 assert(isValidDchar(c)); 2559 str ~= cast(char) c; 2560 } 2561 else 2562 { 2563 char[4] buf; 2564 uint L; 2565 2566 if (c <= 0x7FF) 2567 { 2568 assert(isValidDchar(c)); 2569 buf[0] = cast(char)(0xC0 | (c >> 6)); 2570 buf[1] = cast(char)(0x80 | (c & 0x3F)); 2571 L = 2; 2572 } 2573 else if (c <= 0xFFFF) 2574 { 2575 if (0xD800 <= c && c <= 0xDFFF) 2576 c = _utfException!useReplacementDchar("Encoding a surrogate code point in UTF-8", c); 2577 2578 assert(isValidDchar(c)); 2579 L3: 2580 buf[0] = cast(char)(0xE0 | (c >> 12)); 2581 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 2582 buf[2] = cast(char)(0x80 | (c & 0x3F)); 2583 L = 3; 2584 } 2585 else if (c <= 0x10FFFF) 2586 { 2587 assert(isValidDchar(c)); 2588 buf[0] = cast(char)(0xF0 | (c >> 18)); 2589 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 2590 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 2591 buf[3] = cast(char)(0x80 | (c & 0x3F)); 2592 L = 4; 2593 } 2594 else 2595 { 2596 assert(!isValidDchar(c)); 2597 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-8", c); 2598 goto L3; 2599 } 2600 str ~= buf[0 .. L]; 2601 } 2602 } 2603 2604 /// 2605 @safe unittest 2606 { 2607 char[] s = "abcd".dup; 2608 dchar d1 = 'a'; 2609 dchar d2 = 'ø'; 2610 2611 encode(s, d1); 2612 assert(s.length == 5); 2613 assert(s == "abcda"); 2614 encode(s, d2); 2615 assert(s.length == 7); 2616 assert(s == "abcdaø"); 2617 } 2618 2619 @safe unittest 2620 { 2621 import std.exception; 2622 2623 assertCTFEable!( 2624 { 2625 char[] s = "abcd".dup; 2626 encode(s, cast(dchar)'a'); 2627 assert(s.length == 5); 2628 assert(s == "abcda"); 2629 2630 encode(s, cast(dchar)'\u00A9'); 2631 assert(s.length == 7); 2632 assert(s == "abcda\xC2\xA9"); 2633 //assert(s == "abcda\u00A9"); // BUG: fix compiler 2634 2635 encode(s, cast(dchar)'\u2260'); 2636 assert(s.length == 10); 2637 assert(s == "abcda\xC2\xA9\xE2\x89\xA0"); 2638 }); 2639 } 2640 2641 @safe unittest 2642 { 2643 import std.exception; 2644 assertCTFEable!( 2645 { 2646 char[] buf; 2647 2648 encode(buf, '\u0000'); assert(buf[0 .. $] == "\u0000"); 2649 encode(buf, '\u007F'); assert(buf[1 .. $] == "\u007F"); 2650 encode(buf, '\u0080'); assert(buf[2 .. $] == "\u0080"); 2651 encode(buf, '\u07FF'); assert(buf[4 .. $] == "\u07FF"); 2652 encode(buf, '\u0800'); assert(buf[6 .. $] == "\u0800"); 2653 encode(buf, '\uD7FF'); assert(buf[9 .. $] == "\uD7FF"); 2654 encode(buf, '\uE000'); assert(buf[12 .. $] == "\uE000"); 2655 encode(buf, 0xFFFE); assert(buf[15 .. $] == "\xEF\xBF\xBE"); 2656 encode(buf, 0xFFFF); assert(buf[18 .. $] == "\xEF\xBF\xBF"); 2657 encode(buf, '\U00010000'); assert(buf[21 .. $] == "\U00010000"); 2658 encode(buf, '\U0010FFFF'); assert(buf[25 .. $] == "\U0010FFFF"); 2659 2660 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2661 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2662 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2663 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2664 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2665 2666 enum replacementDcharString = "\uFFFD"; 2667 enum rdcslen = replacementDcharString.length; 2668 assert(buf[$ - rdcslen .. $] != replacementDcharString); 2669 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2670 assert(buf[$ - rdcslen .. $] == replacementDcharString); 2671 }); 2672 } 2673 2674 /// ditto 2675 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2676 ref scope wchar[] str, dchar c) @safe pure 2677 { 2678 if (c <= 0xFFFF) 2679 { 2680 if (0xD800 <= c && c <= 0xDFFF) 2681 c = _utfException!useReplacementDchar("Encoding an isolated surrogate code point in UTF-16", c); 2682 2683 assert(isValidDchar(c)); 2684 L1: 2685 str ~= cast(wchar) c; 2686 } 2687 else if (c <= 0x10FFFF) 2688 { 2689 wchar[2] buf; 2690 2691 assert(isValidDchar(c)); 2692 buf[0] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 2693 buf[1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); 2694 str ~= buf; 2695 } 2696 else 2697 { 2698 assert(!isValidDchar(c)); 2699 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-16", c); 2700 goto L1; 2701 } 2702 } 2703 2704 @safe unittest 2705 { 2706 import std.exception; 2707 assertCTFEable!( 2708 { 2709 wchar[] buf; 2710 2711 encode(buf, '\u0000'); assert(buf[0] == '\u0000'); 2712 encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF'); 2713 encode(buf, '\uE000'); assert(buf[2] == '\uE000'); 2714 encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE); 2715 encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF); 2716 encode(buf, '\U00010000'); assert(buf[5 .. $] == "\U00010000"); 2717 encode(buf, '\U0010FFFF'); assert(buf[7 .. $] == "\U0010FFFF"); 2718 2719 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2720 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2721 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2722 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2723 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2724 2725 assert(buf.back != replacementDchar); 2726 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2727 assert(buf.back == replacementDchar); 2728 }); 2729 } 2730 2731 /// ditto 2732 void encode(UseReplacementDchar useReplacementDchar = No.useReplacementDchar)( 2733 ref scope dchar[] str, dchar c) @safe pure 2734 { 2735 if ((0xD800 <= c && c <= 0xDFFF) || 0x10FFFF < c) 2736 c = _utfException!useReplacementDchar("Encoding an invalid code point in UTF-32", c); 2737 else 2738 assert(isValidDchar(c)); 2739 str ~= c; 2740 } 2741 2742 @safe unittest 2743 { 2744 import std.exception; 2745 assertCTFEable!( 2746 { 2747 dchar[] buf; 2748 2749 encode(buf, '\u0000'); assert(buf[0] == '\u0000'); 2750 encode(buf, '\uD7FF'); assert(buf[1] == '\uD7FF'); 2751 encode(buf, '\uE000'); assert(buf[2] == '\uE000'); 2752 encode(buf, 0xFFFE); assert(buf[3] == 0xFFFE); 2753 encode(buf, 0xFFFF); assert(buf[4] == 0xFFFF); 2754 encode(buf, '\U0010FFFF'); assert(buf[5] == '\U0010FFFF'); 2755 2756 assertThrown!UTFException(encode(buf, cast(dchar) 0xD800)); 2757 assertThrown!UTFException(encode(buf, cast(dchar) 0xDBFF)); 2758 assertThrown!UTFException(encode(buf, cast(dchar) 0xDC00)); 2759 assertThrown!UTFException(encode(buf, cast(dchar) 0xDFFF)); 2760 assertThrown!UTFException(encode(buf, cast(dchar) 0x110000)); 2761 2762 assert(buf.back != replacementDchar); 2763 encode!(Yes.useReplacementDchar)(buf, cast(dchar) 0x110000); 2764 assert(buf.back == replacementDchar); 2765 }); 2766 } 2767 2768 2769 /++ 2770 Returns the number of code units that are required to encode the code point 2771 `c` when `C` is the character type used to encode it. 2772 +/ 2773 ubyte codeLength(C)(dchar c) @safe pure nothrow @nogc 2774 if (isSomeChar!C) 2775 { 2776 static if (C.sizeof == 1) 2777 { 2778 if (c <= 0x7F) return 1; 2779 if (c <= 0x7FF) return 2; 2780 if (c <= 0xFFFF) return 3; 2781 if (c <= 0x10FFFF) return 4; 2782 assert(false); 2783 } 2784 else static if (C.sizeof == 2) 2785 { 2786 return c <= 0xFFFF ? 1 : 2; 2787 } 2788 else 2789 { 2790 static assert(C.sizeof == 4); 2791 return 1; 2792 } 2793 } 2794 2795 /// 2796 @safe pure nothrow @nogc unittest 2797 { 2798 assert(codeLength!char('a') == 1); 2799 assert(codeLength!wchar('a') == 1); 2800 assert(codeLength!dchar('a') == 1); 2801 2802 assert(codeLength!char('\U0010FFFF') == 4); 2803 assert(codeLength!wchar('\U0010FFFF') == 2); 2804 assert(codeLength!dchar('\U0010FFFF') == 1); 2805 } 2806 2807 2808 /++ 2809 Returns the number of code units that are required to encode `str` 2810 in a string whose character type is `C`. This is particularly useful 2811 when slicing one string with the length of another and the two string 2812 types use different character types. 2813 2814 Params: 2815 C = the character type to get the encoding length for 2816 input = the $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 2817 to calculate the encoding length from 2818 Returns: 2819 The number of code units in `input` when encoded to `C` 2820 +/ 2821 size_t codeLength(C, InputRange)(InputRange input) 2822 if (isSomeFiniteCharInputRange!InputRange) 2823 { 2824 alias EncType = typeof(cast() ElementEncodingType!InputRange.init); 2825 static if (isSomeString!InputRange && is(EncType == C) && is(typeof(input.length))) 2826 return input.length; 2827 else 2828 { 2829 size_t total = 0; 2830 2831 foreach (c; input.byDchar) 2832 total += codeLength!C(c); 2833 2834 return total; 2835 } 2836 } 2837 2838 /// 2839 @safe unittest 2840 { 2841 assert(codeLength!char("hello world") == 2842 "hello world".length); 2843 assert(codeLength!wchar("hello world") == 2844 "hello world"w.length); 2845 assert(codeLength!dchar("hello world") == 2846 "hello world"d.length); 2847 2848 assert(codeLength!char(`プログラミング`) == 2849 `プログラミング`.length); 2850 assert(codeLength!wchar(`プログラミング`) == 2851 `プログラミング`w.length); 2852 assert(codeLength!dchar(`プログラミング`) == 2853 `プログラミング`d.length); 2854 2855 string haystack = `Être sans la verité, ça, ce ne serait pas bien.`; 2856 wstring needle = `Être sans la verité`; 2857 assert(haystack[codeLength!char(needle) .. $] == 2858 `, ça, ce ne serait pas bien.`); 2859 } 2860 2861 @safe unittest 2862 { 2863 import std.algorithm.iteration : filter; 2864 import std.conv : to; 2865 import std.exception; 2866 2867 assertCTFEable!( 2868 { 2869 foreach (S; AliasSeq!( char[], const char[], string, 2870 wchar[], const wchar[], wstring, 2871 dchar[], const dchar[], dstring)) 2872 { 2873 foreach (C; AliasSeq!(char, wchar, dchar)) 2874 { 2875 assert(codeLength!C(to!S("Walter Bright")) == to!(C[])("Walter Bright").length); 2876 assert(codeLength!C(to!S(`言語`)) == to!(C[])(`言語`).length); 2877 assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`)) == 2878 to!(C[])(`ウェブサイト@La_Verité.com`).length); 2879 assert(codeLength!C(to!S(`ウェブサイト@La_Verité.com`).filter!(x => true)()) == 2880 to!(C[])(`ウェブサイト@La_Verité.com`).length); 2881 } 2882 } 2883 }); 2884 } 2885 2886 /+ 2887 Internal helper function: 2888 2889 Returns true if it is safe to search for the Codepoint `c` inside 2890 code units, without decoding. 2891 2892 This is a runtime check that is used an optimization in various functions, 2893 particularly, in `std.string`. 2894 +/ 2895 package bool canSearchInCodeUnits(C)(dchar c) 2896 if (isSomeChar!C) 2897 { 2898 static if (C.sizeof == 1) 2899 return c <= 0x7F; 2900 else static if (C.sizeof == 2) 2901 return c <= 0xD7FF || (0xE000 <= c && c <= 0xFFFF); 2902 else static if (C.sizeof == 4) 2903 return true; 2904 else 2905 static assert(0); 2906 } 2907 @safe unittest 2908 { 2909 assert( canSearchInCodeUnits! char('a')); 2910 assert( canSearchInCodeUnits!wchar('a')); 2911 assert( canSearchInCodeUnits!dchar('a')); 2912 assert(!canSearchInCodeUnits! char('ö')); //Important test: ö <= 0xFF 2913 assert(!canSearchInCodeUnits! char(cast(char)'ö')); //Important test: ö <= 0xFF 2914 assert( canSearchInCodeUnits!wchar('ö')); 2915 assert( canSearchInCodeUnits!dchar('ö')); 2916 assert(!canSearchInCodeUnits! char('日')); 2917 assert( canSearchInCodeUnits!wchar('日')); 2918 assert( canSearchInCodeUnits!dchar('日')); 2919 assert(!canSearchInCodeUnits!wchar(cast(wchar) 0xDA00)); 2920 assert( canSearchInCodeUnits!dchar(cast(dchar) 0xDA00)); 2921 assert(!canSearchInCodeUnits! char('\U00010001')); 2922 assert(!canSearchInCodeUnits!wchar('\U00010001')); 2923 assert( canSearchInCodeUnits!dchar('\U00010001')); 2924 } 2925 2926 /* =================== Validation ======================= */ 2927 2928 /++ 2929 Checks to see if `str` is well-formed unicode or not. 2930 2931 Throws: 2932 `UTFException` if `str` is not well-formed. 2933 +/ 2934 void validate(S)(in S str) @safe pure 2935 if (isSomeString!S) 2936 { 2937 immutable len = str.length; 2938 for (size_t i = 0; i < len; ) 2939 { 2940 decode(str, i); 2941 } 2942 } 2943 2944 /// 2945 @safe unittest 2946 { 2947 import std.exception : assertThrown; 2948 char[] a = [167, 133, 175]; 2949 assertThrown!UTFException(validate(a)); 2950 } 2951 2952 // https://issues.dlang.org/show_bug.cgi?id=12923 2953 @safe unittest 2954 { 2955 import std.exception; 2956 assertThrown((){ 2957 char[3]a=[167, 133, 175]; 2958 validate(a[]); 2959 }()); 2960 } 2961 2962 /** 2963 * Encodes the elements of `s` to UTF-8 and returns a newly allocated 2964 * string of the elements. 2965 * 2966 * Params: 2967 * s = the string to encode 2968 * Returns: 2969 * A UTF-8 string 2970 * See_Also: 2971 * For a lazy, non-allocating version of these functions, see $(LREF byUTF). 2972 */ 2973 string toUTF8(S)(S s) 2974 if (isSomeFiniteCharInputRange!S) 2975 { 2976 return toUTFImpl!string(s); 2977 } 2978 2979 /// 2980 @safe pure unittest 2981 { 2982 import std.algorithm.comparison : equal; 2983 2984 // The ö is represented by two UTF-8 code units 2985 assert("Hellø"w.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8])); 2986 2987 // 𐐷 is four code units in UTF-8 2988 assert("𐐷"d.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7])); 2989 } 2990 2991 @system pure unittest 2992 { 2993 import std.algorithm.comparison : equal; 2994 import std.internal.test.dummyrange : ReferenceInputRange; 2995 2996 alias RT = ReferenceInputRange!(ElementType!(string)); 2997 auto r1 = new RT("Hellø"); 2998 auto r2 = new RT("𐐷"); 2999 3000 assert(r1.toUTF8.equal(['H', 'e', 'l', 'l', 0xC3, 0xB8])); 3001 assert(r2.toUTF8.equal([0xF0, 0x90, 0x90, 0xB7])); 3002 } 3003 3004 /** 3005 * Encodes the elements of `s` to UTF-16 and returns a newly GC allocated 3006 * `wstring` of the elements. 3007 * 3008 * Params: 3009 * s = the range to encode 3010 * Returns: 3011 * A UTF-16 string 3012 * See_Also: 3013 * For a lazy, non-allocating version of these functions, see $(LREF byUTF). 3014 */ 3015 wstring toUTF16(S)(S s) 3016 if (isSomeFiniteCharInputRange!S) 3017 { 3018 return toUTFImpl!wstring(s); 3019 } 3020 3021 /// 3022 @safe pure unittest 3023 { 3024 import std.algorithm.comparison : equal; 3025 3026 // these graphemes are two code units in UTF-16 and one in UTF-32 3027 assert("𤭢"d.length == 1); 3028 assert("𐐷"d.length == 1); 3029 3030 assert("𤭢"d.toUTF16.equal([0xD852, 0xDF62])); 3031 assert("𐐷"d.toUTF16.equal([0xD801, 0xDC37])); 3032 } 3033 3034 @system pure unittest 3035 { 3036 import std.algorithm.comparison : equal; 3037 import std.internal.test.dummyrange : ReferenceInputRange; 3038 3039 alias RT = ReferenceInputRange!(ElementType!(string)); 3040 auto r1 = new RT("𤭢"); 3041 auto r2 = new RT("𐐷"); 3042 3043 assert(r1.toUTF16.equal([0xD852, 0xDF62])); 3044 assert(r2.toUTF16.equal([0xD801, 0xDC37])); 3045 } 3046 3047 3048 /** 3049 * Encodes the elements of `s` to UTF-32 and returns a newly GC allocated 3050 * `dstring` of the elements. 3051 * 3052 * Params: 3053 * s = the range to encode 3054 * Returns: 3055 * A UTF-32 string 3056 * See_Also: 3057 * For a lazy, non-allocating version of these functions, see $(LREF byUTF). 3058 */ 3059 dstring toUTF32(S)(scope S s) 3060 if (isSomeFiniteCharInputRange!S) 3061 { 3062 return toUTFImpl!dstring(s); 3063 } 3064 3065 /// 3066 @safe pure unittest 3067 { 3068 import std.algorithm.comparison : equal; 3069 3070 // these graphemes are two code units in UTF-16 and one in UTF-32 3071 assert("𤭢"w.length == 2); 3072 assert("𐐷"w.length == 2); 3073 3074 assert("𤭢"w.toUTF32.equal([0x00024B62])); 3075 assert("𐐷"w.toUTF32.equal([0x00010437])); 3076 } 3077 3078 private T toUTFImpl(T, S)(scope S s) 3079 { 3080 static if (is(S : T)) 3081 { 3082 return s.idup; 3083 } 3084 else 3085 { 3086 import std.array : appender; 3087 auto app = appender!T(); 3088 3089 static if (is(S == C[], C) || hasLength!S) 3090 app.reserve(s.length); 3091 3092 ElementEncodingType!T e = void; 3093 foreach (c; s.byUTF!(typeof(cast() ElementEncodingType!T.init))) 3094 app.put(c); 3095 3096 return app.data; 3097 } 3098 } 3099 3100 /* =================== toUTFz ======================= */ 3101 3102 /++ 3103 Returns a C-style zero-terminated string equivalent to `str`. `str` 3104 must not contain embedded `'\0'`'s as any C function will treat the first 3105 `'\0'` that it sees as the end of the string. If `str.empty` is 3106 `true`, then a string containing only `'\0'` is returned. 3107 3108 `toUTFz` accepts any type of string and is templated on the type of 3109 character pointer that you wish to convert to. It will avoid allocating a 3110 new string if it can, but there's a decent chance that it will end up having 3111 to allocate a new string - particularly when dealing with character types 3112 other than `char`. 3113 3114 $(RED Warning 1:) If the result of `toUTFz` equals `str.ptr`, then if 3115 anything alters the character one past the end of `str` (which is the 3116 `'\0'` character terminating the string), then the string won't be 3117 zero-terminated anymore. The most likely scenarios for that are if you 3118 append to `str` and no reallocation takes place or when `str` is a 3119 slice of a larger array, and you alter the character in the larger array 3120 which is one character past the end of `str`. Another case where it could 3121 occur would be if you had a mutable character array immediately after 3122 `str` in memory (for example, if they're member variables in a 3123 user-defined type with one declared right after the other) and that 3124 character array happened to start with `'\0'`. Such scenarios will never 3125 occur if you immediately use the zero-terminated string after calling 3126 `toUTFz` and the C function using it doesn't keep a reference to it. 3127 Also, they are unlikely to occur even if you save the zero-terminated string 3128 (the cases above would be among the few examples of where it could happen). 3129 However, if you save the zero-terminate string and want to be absolutely 3130 certain that the string stays zero-terminated, then simply append a 3131 `'\0'` to the string and use its `ptr` property rather than calling 3132 `toUTFz`. 3133 3134 $(RED Warning 2:) When passing a character pointer to a C function, and the 3135 C function keeps it around for any reason, make sure that you keep a 3136 reference to it in your D code. Otherwise, it may go away during a garbage 3137 collection cycle and cause a nasty bug when the C code tries to use it. 3138 +/ 3139 template toUTFz(P) 3140 if (is(P == C*, C) && isSomeChar!C) 3141 { 3142 P toUTFz(S)(S str) @safe pure 3143 if (isSomeString!S) 3144 { 3145 return toUTFzImpl!(P, S)(str); 3146 } 3147 } 3148 3149 /// 3150 @safe pure unittest 3151 { 3152 auto p1 = toUTFz!(char*)("hello world"); 3153 auto p2 = toUTFz!(const(char)*)("hello world"); 3154 auto p3 = toUTFz!(immutable(char)*)("hello world"); 3155 auto p4 = toUTFz!(char*)("hello world"d); 3156 auto p5 = toUTFz!(const(wchar)*)("hello world"); 3157 auto p6 = toUTFz!(immutable(dchar)*)("hello world"w); 3158 } 3159 3160 private P toUTFzImpl(P, S)(return scope S str) @safe pure 3161 if (is(immutable typeof(*P.init) == typeof(str[0]))) 3162 //immutable(C)[] -> C*, const(C)*, or immutable(C)* 3163 { 3164 if (str.empty) 3165 { 3166 typeof(*P.init)[] retval = ['\0']; 3167 3168 auto trustedPtr() @trusted { return retval.ptr; } 3169 return trustedPtr(); 3170 } 3171 3172 alias C = typeof(cast() ElementEncodingType!S.init); 3173 3174 //If the P is mutable, then we have to make a copy. 3175 static if (is(typeof(cast() *P.init) == typeof(*P.init))) 3176 { 3177 return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str); 3178 } 3179 else 3180 { 3181 if (!__ctfe) 3182 { 3183 auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; } 3184 immutable p = trustedPtrAdd(str); 3185 3186 // Peek past end of str, if it's 0, no conversion necessary. 3187 // Note that the compiler will put a 0 past the end of static 3188 // strings, and the storage allocator will put a 0 past the end 3189 // of newly allocated char[]'s. 3190 // Is p dereferenceable? A simple test: if the p points to an 3191 // address multiple of 4, then conservatively assume the pointer 3192 // might be pointing to a new block of memory, which might be 3193 // unreadable. Otherwise, it's definitely pointing to valid 3194 // memory. 3195 if ((cast(size_t) p & 3) && *p == '\0') 3196 return &str[0]; 3197 } 3198 3199 return toUTFzImpl!(P, const(C)[])(cast(const(C)[])str); 3200 } 3201 } 3202 3203 private P toUTFzImpl(P, S)(return scope S str) @safe pure 3204 if (is(typeof(str[0]) C) && is(immutable typeof(*P.init) == immutable C) && !is(C == immutable)) 3205 //C[] or const(C)[] -> C*, const(C)*, or immutable(C)* 3206 { 3207 alias InChar = typeof(str[0]); 3208 alias UInChar = typeof(cast() str[0]); // unqualified version of InChar 3209 alias OutChar = typeof(*P.init); 3210 alias UOutChar = typeof(cast() *P.init); // unqualified version 3211 3212 //const(C)[] -> const(C)* or 3213 //C[] -> C* or const(C)* 3214 static if (( is(const(UInChar) == InChar) && is( const(UOutChar) == OutChar)) || 3215 (!is(const(UInChar) == InChar) && !is(immutable(UOutChar) == OutChar))) 3216 { 3217 if (!__ctfe) 3218 { 3219 auto trustedPtrAdd(S s) @trusted { return s.ptr + s.length; } 3220 auto p = trustedPtrAdd(str); 3221 3222 if ((cast(size_t) p & 3) && *p == '\0') 3223 return &str[0]; 3224 } 3225 3226 str ~= '\0'; 3227 return &str[0]; 3228 } 3229 //const(C)[] -> C* or immutable(C)* or 3230 //C[] -> immutable(C)* 3231 else 3232 { 3233 import std.array : uninitializedArray; 3234 auto copy = uninitializedArray!(UOutChar[])(str.length + 1); 3235 copy[0 .. $ - 1] = str[]; 3236 copy[$ - 1] = '\0'; 3237 3238 auto trustedCast(typeof(copy) c) @trusted { return cast(P) c.ptr; } 3239 return trustedCast(copy); 3240 } 3241 } 3242 3243 private P toUTFzImpl(P, S)(S str) @safe pure 3244 if (!is(immutable typeof(*P.init) == immutable typeof(str[0]))) 3245 //C1[], const(C1)[], or immutable(C1)[] -> C2*, const(C2)*, or immutable(C2)* 3246 { 3247 import std.array : appender; 3248 auto retval = appender!(typeof(*P.init)[])(); 3249 3250 foreach (dchar c; str) 3251 retval.put(c); 3252 retval.put('\0'); 3253 3254 return () @trusted { return cast(P) retval.data.ptr; } (); 3255 } 3256 3257 @safe pure unittest 3258 { 3259 import core.exception : AssertError; 3260 import std.algorithm; 3261 import std.conv : to; 3262 import std.exception; 3263 import std.string : format; 3264 3265 assertCTFEable!( 3266 { 3267 foreach (S; AliasSeq!(string, wstring, dstring)) 3268 { 3269 alias C = Unqual!(ElementEncodingType!S); 3270 3271 auto s1 = to!S("hello\U00010143\u0100\U00010143"); 3272 auto temp = new C[](s1.length + 1); 3273 temp[0 .. $ - 1] = s1[0 .. $]; 3274 temp[$ - 1] = '\n'; 3275 --temp.length; 3276 auto trustedAssumeUnique(T)(T t) @trusted { return assumeUnique(t); } 3277 auto s2 = trustedAssumeUnique(temp); 3278 assert(s1 == s2); 3279 3280 void trustedCStringAssert(P, S)(S s) @trusted 3281 { 3282 auto p = toUTFz!P(s); 3283 assert(p[0 .. s.length] == s); 3284 assert(p[s.length] == '\0'); 3285 } 3286 3287 foreach (P; AliasSeq!(C*, const(C)*, immutable(C)*)) 3288 { 3289 trustedCStringAssert!P(s1); 3290 trustedCStringAssert!P(s2); 3291 } 3292 } 3293 }); 3294 3295 static void test(P, S)(S s, size_t line = __LINE__) @trusted 3296 { 3297 static size_t zeroLen(C)(const(C)* ptr) @trusted 3298 { 3299 size_t len = 0; 3300 while (*ptr != '\0') { ++ptr; ++len; } 3301 return len; 3302 } 3303 3304 auto p = toUTFz!P(s); 3305 immutable len = zeroLen(p); 3306 enforce(cmp(s, p[0 .. len]) == 0, 3307 new AssertError(format("Unit test failed: %s %s", P.stringof, S.stringof), 3308 __FILE__, line)); 3309 } 3310 3311 assertCTFEable!( 3312 { 3313 foreach (P; AliasSeq!(wchar*, const(wchar)*, immutable(wchar)*, 3314 dchar*, const(dchar)*, immutable(dchar)*)) 3315 { 3316 test!P("hello\U00010143\u0100\U00010143"); 3317 } 3318 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*, 3319 dchar*, const(dchar)*, immutable(dchar)*)) 3320 { 3321 test!P("hello\U00010143\u0100\U00010143"w); 3322 } 3323 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*, 3324 wchar*, const(wchar)*, immutable(wchar)*)) 3325 { 3326 test!P("hello\U00010143\u0100\U00010143"d); 3327 } 3328 foreach (S; AliasSeq!( char[], const( char)[], 3329 wchar[], const(wchar)[], 3330 dchar[], const(dchar)[])) 3331 { 3332 auto s = to!S("hello\U00010143\u0100\U00010143"); 3333 3334 foreach (P; AliasSeq!( char*, const( char)*, immutable( char)*, 3335 wchar*, const(wchar)*, immutable(wchar)*, 3336 dchar*, const(dchar)*, immutable(dchar)*)) 3337 { 3338 test!P(s); 3339 } 3340 } 3341 }); 3342 } 3343 3344 3345 /++ 3346 `toUTF16z` is a convenience function for `toUTFz!(const(wchar)*)`. 3347 3348 Encodes string `s` into UTF-16 and returns the encoded string. 3349 `toUTF16z` is suitable for calling the 'W' functions in the Win32 API 3350 that take an `LPCWSTR` argument. 3351 +/ 3352 const(wchar)* toUTF16z(C)(const(C)[] str) @safe pure 3353 if (isSomeChar!C) 3354 { 3355 return toUTFz!(const(wchar)*)(str); 3356 } 3357 3358 /// 3359 @system unittest 3360 { 3361 string str = "Hello, World!"; 3362 const(wchar)* p = str.toUTF16z; 3363 assert(p[str.length] == '\0'); 3364 } 3365 3366 @safe pure unittest 3367 { 3368 import std.conv : to; 3369 //toUTFz is already thoroughly tested, so this will just verify that 3370 //toUTF16z compiles properly for the various string types. 3371 foreach (S; AliasSeq!(string, wstring, dstring)) 3372 assert(toUTF16z(to!S("hello world")) !is null); 3373 } 3374 3375 3376 /* ================================ tests ================================== */ 3377 3378 @safe pure unittest 3379 { 3380 import std.exception; 3381 3382 assertCTFEable!( 3383 { 3384 assert(toUTF16("hello"c) == "hello"); 3385 assert(toUTF32("hello"c) == "hello"); 3386 assert(toUTF8 ("hello"w) == "hello"); 3387 assert(toUTF32("hello"w) == "hello"); 3388 assert(toUTF8 ("hello"d) == "hello"); 3389 assert(toUTF16("hello"d) == "hello"); 3390 3391 assert(toUTF16("hel\u1234o"c) == "hel\u1234o"); 3392 assert(toUTF32("hel\u1234o"c) == "hel\u1234o"); 3393 assert(toUTF8 ("hel\u1234o"w) == "hel\u1234o"); 3394 assert(toUTF32("hel\u1234o"w) == "hel\u1234o"); 3395 assert(toUTF8 ("hel\u1234o"d) == "hel\u1234o"); 3396 assert(toUTF16("hel\u1234o"d) == "hel\u1234o"); 3397 3398 assert(toUTF16("he\U0010AAAAllo"c) == "he\U0010AAAAllo"); 3399 assert(toUTF32("he\U0010AAAAllo"c) == "he\U0010AAAAllo"); 3400 assert(toUTF8 ("he\U0010AAAAllo"w) == "he\U0010AAAAllo"); 3401 assert(toUTF32("he\U0010AAAAllo"w) == "he\U0010AAAAllo"); 3402 assert(toUTF8 ("he\U0010AAAAllo"d) == "he\U0010AAAAllo"); 3403 assert(toUTF16("he\U0010AAAAllo"d) == "he\U0010AAAAllo"); 3404 }); 3405 } 3406 3407 3408 /++ 3409 Returns the total number of code points encoded in `str`. 3410 3411 Supercedes: This function supercedes $(LREF toUCSindex). 3412 3413 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 3414 3415 Throws: 3416 `UTFException` if `str` is not well-formed. 3417 +/ 3418 size_t count(C)(const(C)[] str) @safe pure nothrow @nogc 3419 if (isSomeChar!C) 3420 { 3421 return walkLength(str.byDchar); 3422 } 3423 3424 /// 3425 @safe pure nothrow @nogc unittest 3426 { 3427 assert(count("") == 0); 3428 assert(count("a") == 1); 3429 assert(count("abc") == 3); 3430 assert(count("\u20AC100") == 4); 3431 } 3432 3433 @safe pure nothrow @nogc unittest 3434 { 3435 import std.exception; 3436 assertCTFEable!( 3437 { 3438 assert(count("") == 0); 3439 assert(count("a") == 1); 3440 assert(count("abc") == 3); 3441 assert(count("\u20AC100") == 4); 3442 }); 3443 } 3444 3445 3446 // Ranges of code units for testing. 3447 version (StdUnittest) 3448 { 3449 private: 3450 struct InputCU(C) 3451 { 3452 import std.conv : to; 3453 @property bool empty() { return _str.empty; } 3454 @property C front() { return _str[0]; } 3455 void popFront() { _str = _str[1 .. $]; } 3456 3457 this(inout(C)[] str) 3458 { 3459 _str = to!(C[])(str); 3460 } 3461 3462 C[] _str; 3463 } 3464 3465 struct BidirCU(C) 3466 { 3467 import std.conv : to; 3468 @property bool empty() { return _str.empty; } 3469 @property C front() { return _str[0]; } 3470 void popFront() { _str = _str[1 .. $]; } 3471 @property C back() { return _str[$ - 1]; } 3472 void popBack() { _str = _str[0 .. $ - 1]; } 3473 @property auto save() { return BidirCU(_str); } 3474 @property size_t length() { return _str.length; } 3475 3476 this(inout(C)[] str) 3477 { 3478 _str = to!(C[])(str); 3479 } 3480 3481 C[] _str; 3482 } 3483 3484 struct RandomCU(C) 3485 { 3486 import std.conv : to; 3487 @property bool empty() { return _str.empty; } 3488 @property C front() { return _str[0]; } 3489 void popFront() { _str = _str[1 .. $]; } 3490 @property C back() { return _str[$ - 1]; } 3491 void popBack() { _str = _str[0 .. $ - 1]; } 3492 @property auto save() { return RandomCU(_str); } 3493 @property size_t length() { return _str.length; } 3494 C opIndex(size_t i) { return _str[i]; } 3495 auto opSlice(size_t i, size_t j) { return RandomCU(_str[i .. j]); } 3496 3497 this(inout(C)[] str) 3498 { 3499 _str = to!(C[])(str); 3500 } 3501 3502 C[] _str; 3503 } 3504 3505 class RefBidirCU(C) 3506 { 3507 import std.conv : to; 3508 @property bool empty() { return _str.empty; } 3509 @property C front() { return _str[0]; } 3510 void popFront() { _str = _str[1 .. $]; } 3511 @property C back() { return _str[$ - 1]; } 3512 void popBack() { _str = _str[0 .. $ - 1]; } 3513 @property auto save() { return new RefBidirCU(_str); } 3514 @property size_t length() { return _str.length; } 3515 3516 this(inout(C)[] str) 3517 { 3518 _str = to!(C[])(str); 3519 } 3520 3521 C[] _str; 3522 } 3523 3524 class RefRandomCU(C) 3525 { 3526 import std.conv : to; 3527 @property bool empty() { return _str.empty; } 3528 @property C front() { return _str[0]; } 3529 void popFront() { _str = _str[1 .. $]; } 3530 @property C back() { return _str[$ - 1]; } 3531 void popBack() { _str = _str[0 .. $ - 1]; } 3532 @property auto save() { return new RefRandomCU(_str); } 3533 @property size_t length() { return _str.length; } 3534 C opIndex(size_t i) { return _str[i]; } 3535 auto opSlice(size_t i, size_t j) { return new RefRandomCU(_str[i .. j]); } 3536 3537 this(inout(C)[] str) 3538 { 3539 _str = to!(C[])(str); 3540 } 3541 3542 C[] _str; 3543 } 3544 } 3545 3546 3547 /** 3548 * Inserted in place of invalid UTF sequences. 3549 * 3550 * References: 3551 * $(LINK http://en.wikipedia.org/wiki/Replacement_character#Replacement_character) 3552 */ 3553 enum dchar replacementDchar = '\uFFFD'; 3554 3555 /******************************************** 3556 * Iterate a range of char, wchar, or dchars by code unit. 3557 * 3558 * The purpose is to bypass the special case decoding that 3559 * $(REF front, std,range,primitives) does to character arrays. As a result, 3560 * using ranges with `byCodeUnit` can be `nothrow` while 3561 * $(REF front, std,range,primitives) throws when it encounters invalid Unicode 3562 * sequences. 3563 * 3564 * A code unit is a building block of the UTF encodings. Generally, an 3565 * individual code unit does not represent what's perceived as a full 3566 * character (a.k.a. a grapheme cluster in Unicode terminology). Many characters 3567 * are encoded with multiple code units. For example, the UTF-8 code units for 3568 * `ø` are `0xC3 0xB8`. That means, an individual element of `byCodeUnit` 3569 * often does not form a character on its own. Attempting to treat it as 3570 * one while iterating over the resulting range will give nonsensical results. 3571 * 3572 * Params: 3573 * r = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 3574 * of characters (including strings) or a type that implicitly converts to a string type. 3575 * Returns: 3576 * If `r` is not an auto-decodable string (i.e. a narrow string or a 3577 * user-defined type that implicitly converts to a string type), then `r` 3578 * is returned. 3579 * 3580 * Otherwise, `r` is converted to its corresponding string type (if it's 3581 * not already a string) and wrapped in a random-access range where the 3582 * element encoding type of the string (its code unit) is the element type 3583 * of the range, and that range returned. The range has slicing. 3584 * 3585 * If `r` is quirky enough to be a struct or class which is an input range 3586 * of characters on its own (i.e. it has the input range API as member 3587 * functions), $(I and) it's implicitly convertible to a string type, then 3588 * `r` is returned, and no implicit conversion takes place. 3589 * 3590 * If `r` is wrapped in a new range, then that range has a `source` 3591 * property for returning the string that's currently contained within that 3592 * range. 3593 * 3594 * See_Also: 3595 * Refer to the $(MREF std, uni) docs for a reference on Unicode 3596 * terminology. 3597 * 3598 * For a range that iterates by grapheme cluster (written character) see 3599 * $(REF byGrapheme, std,uni). 3600 */ 3601 auto byCodeUnit(R)(R r) 3602 if ((isConvertibleToString!R && !isStaticArray!R) || 3603 (isInputRange!R && isSomeChar!(ElementEncodingType!R))) 3604 { 3605 import std.traits : StringTypeOf; 3606 static if (// This would be cleaner if we had a way to check whether a type 3607 // was a range without any implicit conversions. 3608 (isAutodecodableString!R && !__traits(hasMember, R, "empty") && 3609 !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront"))) 3610 { 3611 static struct ByCodeUnitImpl 3612 { 3613 @safe pure nothrow @nogc: 3614 3615 @property bool empty() const { return source.length == 0; } 3616 @property auto ref front() inout { return source[0]; } 3617 void popFront() { source = source[1 .. $]; } 3618 3619 @property auto save() { return ByCodeUnitImpl(source.save); } 3620 3621 @property auto ref back() inout { return source[$ - 1]; } 3622 void popBack() { source = source[0 .. $-1]; } 3623 3624 auto ref opIndex(size_t index) inout { return source[index]; } 3625 auto opSlice(size_t lower, size_t upper) { return ByCodeUnitImpl(source[lower .. upper]); } 3626 3627 @property size_t length() const { return source.length; } 3628 alias opDollar = length; 3629 3630 StringTypeOf!R source; 3631 } 3632 3633 static assert(isRandomAccessRange!ByCodeUnitImpl); 3634 3635 return ByCodeUnitImpl(r); 3636 } 3637 else static if (!isInputRange!R || 3638 (is(R : const dchar[]) && !__traits(hasMember, R, "empty") && 3639 !__traits(hasMember, R, "front") && !__traits(hasMember, R, "popFront"))) 3640 { 3641 return cast(StringTypeOf!R) r; 3642 } 3643 else 3644 { 3645 // byCodeUnit for ranges and dchar[] is a no-op 3646 return r; 3647 } 3648 } 3649 3650 /// 3651 @safe unittest 3652 { 3653 import std.range.primitives; 3654 import std.traits : isAutodecodableString; 3655 3656 auto r = "Hello, World!".byCodeUnit(); 3657 static assert(hasLength!(typeof(r))); 3658 static assert(hasSlicing!(typeof(r))); 3659 static assert(isRandomAccessRange!(typeof(r))); 3660 static assert(is(ElementType!(typeof(r)) == immutable char)); 3661 3662 // contrast with the range capabilities of standard strings (with or 3663 // without autodecoding enabled). 3664 auto s = "Hello, World!"; 3665 static assert(isBidirectionalRange!(typeof(r))); 3666 static if (isAutodecodableString!(typeof(s))) 3667 { 3668 // with autodecoding enabled, strings are non-random-access ranges of 3669 // dchar. 3670 static assert(is(ElementType!(typeof(s)) == dchar)); 3671 static assert(!isRandomAccessRange!(typeof(s))); 3672 static assert(!hasSlicing!(typeof(s))); 3673 static assert(!hasLength!(typeof(s))); 3674 } 3675 else 3676 { 3677 // without autodecoding, strings are normal arrays. 3678 static assert(is(ElementType!(typeof(s)) == immutable char)); 3679 static assert(isRandomAccessRange!(typeof(s))); 3680 static assert(hasSlicing!(typeof(s))); 3681 static assert(hasLength!(typeof(s))); 3682 } 3683 } 3684 3685 /// `byCodeUnit` does no Unicode decoding 3686 @safe unittest 3687 { 3688 string noel1 = "noe\u0308l"; // noël using e + combining diaeresis 3689 assert(noel1.byCodeUnit[2] != 'ë'); 3690 assert(noel1.byCodeUnit[2] == 'e'); 3691 3692 string noel2 = "no\u00EBl"; // noël using a precomposed ë character 3693 // Because string is UTF-8, the code unit at index 2 is just 3694 // the first of a sequence that encodes 'ë' 3695 assert(noel2.byCodeUnit[2] != 'ë'); 3696 } 3697 3698 /// `byCodeUnit` exposes a `source` property when wrapping narrow strings. 3699 @safe unittest 3700 { 3701 import std.algorithm.comparison : equal; 3702 import std.range : popFrontN; 3703 import std.traits : isAutodecodableString; 3704 { 3705 auto range = byCodeUnit("hello world"); 3706 range.popFrontN(3); 3707 assert(equal(range.save, "lo world")); 3708 static if (isAutodecodableString!string) // only enabled with autodecoding 3709 { 3710 string str = range.source; 3711 assert(str == "lo world"); 3712 } 3713 } 3714 // source only exists if the range was wrapped 3715 { 3716 auto range = byCodeUnit("hello world"d); 3717 static assert(!__traits(compiles, range.source)); 3718 } 3719 } 3720 3721 @safe pure nothrow @nogc unittest 3722 { 3723 import std.range; 3724 { 3725 enum testStr = "𐁄𐂌𐃯 hello ディラン"; 3726 char[testStr.length] s; 3727 int i; 3728 foreach (c; testStr.byCodeUnit().byCodeUnit()) 3729 { 3730 s[i++] = c; 3731 } 3732 assert(s == testStr); 3733 } 3734 { 3735 enum testStr = "𐁄𐂌𐃯 hello ディラン"w; 3736 wchar[testStr.length] s; 3737 int i; 3738 foreach (c; testStr.byCodeUnit().byCodeUnit()) 3739 { 3740 s[i++] = c; 3741 } 3742 assert(s == testStr); 3743 } 3744 { 3745 enum testStr = "𐁄𐂌𐃯 hello ディラン"d; 3746 dchar[testStr.length] s; 3747 int i; 3748 foreach (c; testStr.byCodeUnit().byCodeUnit()) 3749 { 3750 s[i++] = c; 3751 } 3752 assert(s == testStr); 3753 } 3754 { 3755 auto bcu = "hello".byCodeUnit(); 3756 assert(bcu.length == 5); 3757 assert(bcu[3] == 'l'); 3758 assert(bcu[2 .. 4][1] == 'l'); 3759 } 3760 { 3761 char[5] orig = "hello"; 3762 auto bcu = orig[].byCodeUnit(); 3763 bcu.front = 'H'; 3764 assert(bcu.front == 'H'); 3765 bcu[1] = 'E'; 3766 assert(bcu[1] == 'E'); 3767 } 3768 { 3769 auto bcu = "hello".byCodeUnit().byCodeUnit(); 3770 static assert(isForwardRange!(typeof(bcu))); 3771 static assert(is(typeof(bcu) == struct) == isAutodecodableString!string); 3772 auto s = bcu.save; 3773 bcu.popFront(); 3774 assert(s.front == 'h'); 3775 } 3776 { 3777 auto bcu = "hello".byCodeUnit(); 3778 static assert(hasSlicing!(typeof(bcu))); 3779 static assert(isBidirectionalRange!(typeof(bcu))); 3780 static assert(is(typeof(bcu) == struct) == isAutodecodableString!string); 3781 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3782 auto ret = bcu.retro; 3783 assert(ret.front == 'o'); 3784 ret.popFront(); 3785 assert(ret.front == 'l'); 3786 } 3787 { 3788 auto bcu = "κόσμε"w.byCodeUnit(); 3789 static assert(hasSlicing!(typeof(bcu))); 3790 static assert(isBidirectionalRange!(typeof(bcu))); 3791 static assert(is(typeof(bcu) == struct) == isAutodecodableString!wstring); 3792 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3793 auto ret = bcu.retro; 3794 assert(ret.front == 'ε'); 3795 ret.popFront(); 3796 assert(ret.front == 'μ'); 3797 } 3798 { 3799 static struct Stringish 3800 { 3801 string s; 3802 alias s this; 3803 } 3804 3805 auto orig = Stringish("\U0010fff8 𐁊 foo 𐂓"); 3806 auto bcu = orig.byCodeUnit(); 3807 static assert(is(typeof(bcu) == struct)); 3808 static assert(!is(typeof(bcu) == Stringish) == isAutodecodableString!Stringish); 3809 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3810 static assert(is(ElementType!(typeof(bcu)) == immutable char)); 3811 assert(bcu.front == cast(char) 244); 3812 } 3813 { 3814 static struct WStringish 3815 { 3816 wstring s; 3817 alias s this; 3818 } 3819 3820 auto orig = WStringish("\U0010fff8 𐁊 foo 𐂓"w); 3821 auto bcu = orig.byCodeUnit(); 3822 static assert(is(typeof(bcu) == struct)); 3823 static assert(!is(typeof(bcu) == WStringish) == isAutodecodableString!WStringish); 3824 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3825 static assert(is(ElementType!(typeof(bcu)) == immutable wchar)); 3826 assert(bcu.front == cast(wchar) 56319); 3827 } 3828 { 3829 static struct DStringish 3830 { 3831 dstring s; 3832 alias s this; 3833 } 3834 3835 auto orig = DStringish("\U0010fff8 𐁊 foo 𐂓"d); 3836 auto bcu = orig.byCodeUnit(); 3837 static assert(is(typeof(bcu) == dstring)); 3838 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3839 static assert(is(ElementType!(typeof(bcu)) == immutable dchar)); 3840 assert(bcu.front == cast(dchar) 1114104); 3841 } 3842 { 3843 static struct FuncStringish 3844 { 3845 string str; 3846 string s() pure nothrow @nogc { return str; } 3847 alias s this; 3848 } 3849 3850 auto orig = FuncStringish("\U0010fff8 𐁊 foo 𐂓"); 3851 auto bcu = orig.byCodeUnit(); 3852 static if (isAutodecodableString!FuncStringish) 3853 static assert(is(typeof(bcu) == struct)); 3854 else 3855 static assert(is(typeof(bcu) == string)); 3856 static assert(!is(typeof(bcu) == FuncStringish)); 3857 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3858 static assert(is(ElementType!(typeof(bcu)) == immutable char)); 3859 assert(bcu.front == cast(char) 244); 3860 } 3861 { 3862 static struct Range 3863 { 3864 string data; 3865 bool empty() pure nothrow @nogc { return data.empty; } 3866 char front() pure nothrow @nogc { return data[0]; } 3867 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3868 } 3869 3870 auto orig = Range("\U0010fff8 𐁊 foo 𐂓"); 3871 auto bcu = orig.byCodeUnit(); 3872 static assert(is(typeof(bcu) == Range)); 3873 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3874 static assert(is(ElementType!(typeof(bcu)) == char)); 3875 assert(bcu.front == cast(char) 244); 3876 } 3877 { 3878 static struct WRange 3879 { 3880 wstring data; 3881 bool empty() pure nothrow @nogc { return data.empty; } 3882 wchar front() pure nothrow @nogc { return data[0]; } 3883 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3884 } 3885 3886 auto orig = WRange("\U0010fff8 𐁊 foo 𐂓"w); 3887 auto bcu = orig.byCodeUnit(); 3888 static assert(is(typeof(bcu) == WRange)); 3889 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3890 static assert(is(ElementType!(typeof(bcu)) == wchar)); 3891 assert(bcu.front == 56319); 3892 } 3893 { 3894 static struct DRange 3895 { 3896 dstring data; 3897 bool empty() pure nothrow @nogc { return data.empty; } 3898 dchar front() pure nothrow @nogc { return data[0]; } 3899 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3900 } 3901 3902 auto orig = DRange("\U0010fff8 𐁊 foo 𐂓"d); 3903 auto bcu = orig.byCodeUnit(); 3904 static assert(is(typeof(bcu) == DRange)); 3905 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3906 static assert(is(ElementType!(typeof(bcu)) == dchar)); 3907 assert(bcu.front == 1114104); 3908 } 3909 { 3910 static struct RangeAndStringish 3911 { 3912 bool empty() pure nothrow @nogc { return data.empty; } 3913 char front() pure nothrow @nogc { return data[0]; } 3914 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3915 3916 string data; 3917 string s; 3918 alias s this; 3919 } 3920 3921 auto orig = RangeAndStringish("test.d", "other"); 3922 auto bcu = orig.byCodeUnit(); 3923 static assert(is(typeof(bcu) == RangeAndStringish)); 3924 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3925 static assert(is(ElementType!(typeof(bcu)) == char)); 3926 assert(bcu.front == 't'); 3927 } 3928 { 3929 static struct WRangeAndStringish 3930 { 3931 bool empty() pure nothrow @nogc { return data.empty; } 3932 wchar front() pure nothrow @nogc { return data[0]; } 3933 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3934 3935 wstring data; 3936 wstring s; 3937 alias s this; 3938 } 3939 3940 auto orig = WRangeAndStringish("test.d"w, "other"w); 3941 auto bcu = orig.byCodeUnit(); 3942 static assert(is(typeof(bcu) == WRangeAndStringish)); 3943 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3944 static assert(is(ElementType!(typeof(bcu)) == wchar)); 3945 assert(bcu.front == 't'); 3946 } 3947 { 3948 static struct DRangeAndStringish 3949 { 3950 bool empty() pure nothrow @nogc { return data.empty; } 3951 dchar front() pure nothrow @nogc { return data[0]; } 3952 void popFront() pure nothrow @nogc { data = data[1 .. $]; } 3953 3954 dstring data; 3955 dstring s; 3956 alias s this; 3957 } 3958 3959 auto orig = DRangeAndStringish("test.d"d, "other"d); 3960 auto bcu = orig.byCodeUnit(); 3961 static assert(is(typeof(bcu) == DRangeAndStringish)); 3962 static assert(is(typeof(bcu) == typeof(bcu.byCodeUnit()))); 3963 static assert(is(ElementType!(typeof(bcu)) == dchar)); 3964 assert(bcu.front == 't'); 3965 } 3966 { 3967 enum Enum : string { a = "test.d" } 3968 3969 auto orig = Enum.a; 3970 auto bcu = orig.byCodeUnit(); 3971 static assert(!is(typeof(bcu) == Enum)); 3972 static if (isAutodecodableString!Enum) 3973 static assert(is(typeof(bcu) == struct)); 3974 else 3975 static assert(is(typeof(bcu) == string)); 3976 static assert(is(ElementType!(typeof(bcu)) == immutable char)); 3977 assert(bcu.front == 't'); 3978 } 3979 { 3980 enum WEnum : wstring { a = "test.d"w } 3981 3982 auto orig = WEnum.a; 3983 auto bcu = orig.byCodeUnit(); 3984 static assert(!is(typeof(bcu) == WEnum)); 3985 static if (isAutodecodableString!WEnum) 3986 static assert(is(typeof(bcu) == struct)); 3987 else 3988 static assert(is(typeof(bcu) == wstring)); 3989 static assert(is(ElementType!(typeof(bcu)) == immutable wchar)); 3990 assert(bcu.front == 't'); 3991 } 3992 { 3993 enum DEnum : dstring { a = "test.d"d } 3994 3995 auto orig = DEnum.a; 3996 auto bcu = orig.byCodeUnit(); 3997 static assert(is(typeof(bcu) == dstring)); 3998 static assert(is(ElementType!(typeof(bcu)) == immutable dchar)); 3999 assert(bcu.front == 't'); 4000 } 4001 4002 static if (autodecodeStrings) 4003 { 4004 static assert(!is(typeof(byCodeUnit("hello")) == string)); 4005 static assert(!is(typeof(byCodeUnit("hello"w)) == wstring)); 4006 } 4007 else 4008 { 4009 static assert(is(typeof(byCodeUnit("hello")) == string)); 4010 static assert(is(typeof(byCodeUnit("hello"w)) == wstring)); 4011 } 4012 static assert(is(typeof(byCodeUnit("hello"d)) == dstring)); 4013 4014 static assert(!__traits(compiles, byCodeUnit((char[5]).init))); 4015 static assert(!__traits(compiles, byCodeUnit((wchar[5]).init))); 4016 static assert(!__traits(compiles, byCodeUnit((dchar[5]).init))); 4017 4018 enum SEnum : char[5] { a = "hello" } 4019 enum WSEnum : wchar[5] { a = "hello"w } 4020 enum DSEnum : dchar[5] { a = "hello"d } 4021 4022 static assert(!__traits(compiles, byCodeUnit(SEnum.a))); 4023 static assert(!__traits(compiles, byCodeUnit(WSEnum.a))); 4024 static assert(!__traits(compiles, byCodeUnit(DSEnum.a))); 4025 } 4026 4027 /**************************** 4028 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 4029 * of characters by char, wchar, or dchar. 4030 * These aliases simply forward to $(LREF byUTF) with the 4031 * corresponding C argument. 4032 * 4033 * Params: 4034 * r = input range of characters, or array of characters 4035 */ 4036 alias byChar = byUTF!char; 4037 4038 /// Ditto 4039 alias byWchar = byUTF!wchar; 4040 4041 /// Ditto 4042 alias byDchar = byUTF!dchar; 4043 4044 @safe pure nothrow @nogc unittest 4045 { 4046 { 4047 char[5] s; 4048 int i; 4049 foreach (c; "hello".byChar.byChar()) 4050 { 4051 //writefln("[%d] '%c'", i, c); 4052 s[i++] = c; 4053 } 4054 assert(s == "hello"); 4055 } 4056 { 4057 char[5+2+3+4+3+3] s; 4058 int i; 4059 dchar[10] a; 4060 a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d; 4061 a[8] = 0xD800; // invalid 4062 a[9] = cast(dchar) 0x110000; // invalid 4063 foreach (c; a[].byChar()) 4064 { 4065 //writefln("[%d] '%c'", i, c); 4066 s[i++] = c; 4067 } 4068 assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"); 4069 } 4070 { 4071 auto r = "hello"w.byChar(); 4072 r.popFront(); 4073 r.popFront(); 4074 assert(r.front == 'l'); 4075 } 4076 { 4077 auto r = "hello"d.byChar(); 4078 r.popFront(); 4079 r.popFront(); 4080 assert(r.front == 'l'); 4081 } 4082 { 4083 auto r = "hello"d.byChar(); 4084 assert(isForwardRange!(typeof(r))); 4085 auto s = r.save; 4086 r.popFront(); 4087 assert(s.front == 'h'); 4088 } 4089 } 4090 4091 @safe pure nothrow @nogc unittest 4092 { 4093 { 4094 wchar[11] s; 4095 int i; 4096 dchar[10] a; 4097 a[0 .. 8] = "hello\u07FF\uD7FF\U0010FFFF"d; 4098 a[8] = 0xD800; // invalid 4099 a[9] = cast(dchar) 0x110000; // invalid 4100 foreach (c; a[].byWchar()) 4101 { 4102 //writefln("[%d] '%c' x%x", i, c, c); 4103 s[i++] = c; 4104 } 4105 foreach (j, wchar c; "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w) 4106 { 4107 //writefln("[%d] '%c' x%x", j, c, c); 4108 } 4109 assert(s == "hello\u07FF\uD7FF\U0010FFFF\uFFFD\uFFFD"w); 4110 } 4111 4112 { 4113 auto r = "hello".byWchar(); 4114 r.popFront(); 4115 r.popFront(); 4116 assert(r.front == 'l'); 4117 } 4118 { 4119 auto r = "hello"d.byWchar(); 4120 r.popFront(); 4121 r.popFront(); 4122 assert(r.front == 'l'); 4123 } 4124 { 4125 auto r = "hello"d.byWchar(); 4126 assert(isForwardRange!(typeof(r))); 4127 auto s = r.save; 4128 r.popFront(); 4129 assert(s.front == 'h'); 4130 } 4131 } 4132 4133 @safe pure nothrow @nogc unittest 4134 { 4135 { 4136 dchar[9] s; 4137 int i; 4138 string a = "hello\u07FF\uD7FF\U00010000\U0010FFFF"; // 1,2,3,4 byte sequences 4139 foreach (c; a.byDchar()) 4140 { 4141 s[i++] = c; 4142 } 4143 assert(s == "hello\u07FF\uD7FF\U00010000\U0010FFFF"d); 4144 } 4145 { 4146 foreach (s; invalidUTFstrings!char()) 4147 { 4148 auto r = s.byDchar(); 4149 assert(!r.empty); 4150 assert(r.front == r.front); 4151 dchar c = r.front; 4152 assert(c == replacementDchar); 4153 } 4154 } 4155 { 4156 auto r = "hello".byDchar(); 4157 r.popFront(); 4158 r.popFront(); 4159 assert(r.front == 'l'); 4160 } 4161 4162 { 4163 dchar[8] s; 4164 int i; 4165 wstring a = "hello\u07FF\uD7FF\U0010FFFF"w; 4166 foreach (c; a.byDchar()) 4167 { 4168 //writefln("[%d] '%c' x%x", i, c, c); 4169 s[i++] = c; 4170 } 4171 assert(s == "hello\u07FF\uD7FF\U0010FFFF"d); 4172 } 4173 { 4174 foreach (s; invalidUTFstrings!wchar()) 4175 { 4176 auto r = s.byDchar(); 4177 assert(!r.empty); 4178 assert(r.front == r.front); 4179 dchar c = r.front; 4180 assert(c == replacementDchar); 4181 } 4182 } 4183 { 4184 wchar[2] ws; 4185 ws[0] = 0xD800; 4186 ws[1] = 0xDD00; // correct surrogate pair 4187 auto r = ws[].byDchar(); 4188 assert(!r.empty); 4189 assert(r.front == r.front); 4190 dchar c = r.front; 4191 assert(c == '\U00010100'); 4192 } 4193 { 4194 auto r = "hello"w.byDchar(); 4195 r.popFront(); 4196 r.popFront(); 4197 assert(r.front == 'l'); 4198 } 4199 4200 { 4201 dchar[5] s; 4202 int i; 4203 dstring a = "hello"d; 4204 foreach (c; a.byDchar.byDchar()) 4205 { 4206 //writefln("[%d] '%c' x%x", i, c, c); 4207 s[i++] = c; 4208 } 4209 assert(s == "hello"d); 4210 } 4211 { 4212 auto r = "hello".byDchar(); 4213 assert(isForwardRange!(typeof(r))); 4214 auto s = r.save; 4215 r.popFront(); 4216 assert(s.front == 'h'); 4217 } 4218 { 4219 auto r = "hello"w.byDchar(); 4220 assert(isForwardRange!(typeof(r))); 4221 auto s = r.save; 4222 r.popFront(); 4223 assert(s.front == 'h'); 4224 } 4225 } 4226 4227 // test pure, @safe, nothrow, @nogc correctness of byChar/byWchar/byDchar, 4228 // which needs to support ranges with and without those attributes 4229 4230 pure @safe nothrow @nogc unittest 4231 { 4232 dchar[5] s = "hello"d; 4233 foreach (c; s[].byChar()) { } 4234 foreach (c; s[].byWchar()) { } 4235 foreach (c; s[].byDchar()) { } 4236 } 4237 4238 version (StdUnittest) 4239 private int impureVariable; 4240 4241 @system unittest 4242 { 4243 static struct ImpureThrowingSystemRange(Char) 4244 { 4245 @property bool empty() const { return true; } 4246 @property Char front() const { return Char.init; } 4247 void popFront() 4248 { 4249 impureVariable++; 4250 throw new Exception("only for testing nothrow"); 4251 } 4252 } 4253 4254 foreach (Char; AliasSeq!(char, wchar, dchar)) 4255 { 4256 ImpureThrowingSystemRange!Char range; 4257 foreach (c; range.byChar()) { } 4258 foreach (c; range.byWchar()) { } 4259 foreach (c; range.byDchar()) { } 4260 } 4261 } 4262 4263 /**************************** 4264 * Iterate an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 4265 * of characters by char type `C` by encoding the elements of the range. 4266 * 4267 * UTF sequences that cannot be converted to the specified encoding are either 4268 * replaced by U+FFFD per "5.22 Best Practice for U+FFFD Substitution" 4269 * of the Unicode Standard 6.2 or result in a thrown UTFException. 4270 * Hence byUTF is not symmetric. 4271 * This algorithm is lazy, and does not allocate memory. 4272 * `@nogc`, `pure`-ity, `nothrow`, and `@safe`-ty are inferred from the 4273 * `r` parameter. 4274 * 4275 * Params: 4276 * C = `char`, `wchar`, or `dchar` 4277 * useReplacementDchar = UseReplacementDchar.yes means replace invalid UTF with `replacementDchar`, 4278 * UseReplacementDchar.no means throw `UTFException` for invalid UTF 4279 * 4280 * Throws: 4281 * `UTFException` if invalid UTF sequence and `useReplacementDchar` is set to `UseReplacementDchar.no` 4282 * 4283 * GC: 4284 * Does not use GC if `useReplacementDchar` is set to `UseReplacementDchar.yes` 4285 * 4286 * Returns: 4287 * A bidirectional range if `R` is a bidirectional range and not auto-decodable, 4288 * as defined by $(REF isAutodecodableString, std, traits). 4289 * 4290 * A forward range if `R` is a forward range and not auto-decodable. 4291 * 4292 * Or, if `R` is a range and it is auto-decodable and 4293 * `is(ElementEncodingType!typeof(r) == C)`, then the range is passed 4294 * to $(LREF byCodeUnit). 4295 * 4296 * Otherwise, an input range of characters. 4297 */ 4298 template byUTF(C, UseReplacementDchar useReplacementDchar = Yes.useReplacementDchar) 4299 if (isSomeChar!C) 4300 { 4301 static if (is(immutable C == immutable UC, UC) && !is(C == UC)) 4302 alias byUTF = byUTF!UC; 4303 else: 4304 4305 auto ref byUTF(R)(R r) 4306 if (isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R)) 4307 { 4308 return byUTF(r.byCodeUnit()); 4309 } 4310 4311 auto ref byUTF(R)(R r) 4312 if (!isAutodecodableString!R && isInputRange!R && isSomeChar!(ElementEncodingType!R)) 4313 { 4314 static if (is(immutable ElementEncodingType!R == immutable RC, RC) && is(RC == C)) 4315 { 4316 return r.byCodeUnit(); 4317 } 4318 else static if (is(C == dchar)) 4319 { 4320 static struct Result 4321 { 4322 enum Empty = uint.max; // range is empty or just constructed 4323 4324 this(return scope R r) 4325 { 4326 this.r = r; 4327 } 4328 4329 this(return scope R r, uint buff) 4330 { 4331 this.r = r; 4332 this.buff = buff; 4333 } 4334 4335 static if (isBidirectionalRange!R) 4336 { 4337 this(return scope R r, uint frontBuff, uint backBuff) 4338 { 4339 this.r = r; 4340 this.buff = frontBuff; 4341 this.backBuff = backBuff; 4342 } 4343 } 4344 4345 @property bool empty() 4346 { 4347 static if (isBidirectionalRange!R) 4348 return buff == Empty && backBuff == Empty && r.empty; 4349 else 4350 return buff == Empty && r.empty; 4351 } 4352 4353 @property dchar front() scope // 'scope' required by call to decodeFront() below 4354 { 4355 if (buff == Empty) 4356 { 4357 auto c = r.front; 4358 4359 static if (is(RC == wchar)) 4360 enum firstMulti = 0xD800; // First high surrogate. 4361 else 4362 enum firstMulti = 0x80; // First non-ASCII. 4363 if (c < firstMulti) 4364 { 4365 r.popFront; 4366 buff = cast(dchar) c; 4367 } 4368 else 4369 { 4370 buff = () @trusted { return decodeFront!(useReplacementDchar)(r); }(); 4371 } 4372 } 4373 return cast(dchar) buff; 4374 } 4375 4376 void popFront() 4377 { 4378 if (buff == Empty) 4379 front(); 4380 buff = Empty; 4381 } 4382 4383 static if (isForwardRange!R) 4384 { 4385 @property auto save() 4386 { 4387 static if (isBidirectionalRange!R) 4388 { 4389 return Result(r.save, buff, backBuff); 4390 } 4391 else 4392 { 4393 return Result(r.save, buff); 4394 } 4395 } 4396 } 4397 4398 static if (isBidirectionalRange!R) 4399 { 4400 @property dchar back() scope // 'scope' required by call to decodeBack() below 4401 { 4402 if (backBuff != Empty) 4403 return cast(dchar) backBuff; 4404 4405 auto c = r.back; 4406 static if (is(RC == wchar)) 4407 enum firstMulti = 0xD800; // First high surrogate. 4408 else 4409 enum firstMulti = 0x80; // First non-ASCII. 4410 if (c < firstMulti) 4411 { 4412 r.popBack; 4413 backBuff = cast(dchar) c; 4414 } 4415 else 4416 { 4417 backBuff = () @trusted { return decodeBack!useReplacementDchar(r); }(); 4418 } 4419 return cast(dchar) backBuff; 4420 4421 } 4422 4423 void popBack() 4424 { 4425 if (backBuff == Empty) 4426 back(); 4427 backBuff = Empty; 4428 } 4429 } 4430 4431 private: 4432 4433 R r; 4434 uint buff = Empty; // one character lookahead buffer 4435 static if (isBidirectionalRange!R) 4436 uint backBuff = Empty; 4437 } 4438 4439 return Result(r); 4440 } 4441 else 4442 { 4443 static struct Result 4444 { 4445 this(return scope R r) 4446 { 4447 this.r = r; 4448 } 4449 4450 this(return scope R r, ushort pos, ushort fill, C[4 / C.sizeof] buf) 4451 { 4452 this.r = r; 4453 this.pos = pos; 4454 this.fill = fill; 4455 this.buf = buf; 4456 } 4457 4458 static if (isBidirectionalRange!R) 4459 { 4460 this(return scope R r, ushort frontPos, ushort frontFill, 4461 ushort backPos, ushort backFill, C[4 / C.sizeof] buf) 4462 { 4463 this.r = r; 4464 this.pos = frontPos; 4465 this.fill = frontFill; 4466 this.backPos = backPos; 4467 this.backFill = backFill; 4468 this.buf = buf; 4469 } 4470 } 4471 4472 @property bool empty() 4473 { 4474 static if (isBidirectionalRange!R) 4475 return pos == fill && backPos == backFill && r.empty; 4476 else 4477 return pos == fill && r.empty; 4478 } 4479 4480 @property auto front() scope // 'scope' required by call to decodeFront() below 4481 { 4482 if (pos == fill) 4483 { 4484 pos = 0; 4485 auto c = r.front; 4486 4487 static if (C.sizeof >= 2 && RC.sizeof >= 2) 4488 enum firstMulti = 0xD800; // First high surrogate. 4489 else 4490 enum firstMulti = 0x80; // First non-ASCII. 4491 if (c < firstMulti) 4492 { 4493 fill = 1; 4494 r.popFront; 4495 buf[pos] = cast(C) c; 4496 } 4497 else 4498 { 4499 static if (is(RC == dchar)) 4500 { 4501 r.popFront; 4502 dchar dc = c; 4503 } 4504 else 4505 dchar dc = () @trusted { return decodeFront!(useReplacementDchar)(r); }(); 4506 fill = cast(ushort) encode!(useReplacementDchar)(buf, dc); 4507 } 4508 } 4509 return buf[pos]; 4510 } 4511 4512 void popFront() 4513 { 4514 if (pos == fill) 4515 front; 4516 ++pos; 4517 } 4518 4519 static if (isForwardRange!R) 4520 { 4521 @property auto save() 4522 { 4523 static if (isBidirectionalRange!R) 4524 { 4525 return Result(r.save, pos, fill, backPos, backFill, buf); 4526 } 4527 else 4528 { 4529 return Result(r.save, pos, fill, buf); 4530 } 4531 } 4532 } 4533 4534 static if (isBidirectionalRange!R) 4535 { 4536 @property auto back() scope // 'scope' required by call to decodeBack() below 4537 { 4538 if (backPos != backFill) 4539 return buf[cast(ushort) (backFill - backPos - 1)]; 4540 4541 backPos = 0; 4542 auto c = r.back; 4543 static if (C.sizeof >= 2 && RC.sizeof >= 2) 4544 enum firstMulti = 0xD800; // First high surrogate. 4545 else 4546 enum firstMulti = 0x80; // First non-ASCII. 4547 if (c < firstMulti) 4548 { 4549 backFill = 1; 4550 r.popBack; 4551 buf[cast(ushort) (backFill - backPos - 1)] = cast(C) c; 4552 } 4553 else 4554 { 4555 static if (is(RC == dchar)) 4556 { 4557 r.popBack; 4558 dchar dc = c; 4559 } 4560 else 4561 dchar dc = () @trusted { return decodeBack!(useReplacementDchar)(r); }(); 4562 backFill = cast(ushort) encode!(useReplacementDchar)(buf, dc); 4563 } 4564 return buf[cast(ushort) (backFill - backPos - 1)]; 4565 } 4566 4567 void popBack() 4568 { 4569 if (backPos == backFill) 4570 back; 4571 ++backPos; 4572 } 4573 } 4574 4575 private: 4576 4577 R r; 4578 ushort pos, fill; 4579 static if (isBidirectionalRange!R) 4580 ushort backPos, backFill; 4581 C[4 / C.sizeof] buf = void; 4582 } 4583 4584 return Result(r); 4585 } 4586 } 4587 } 4588 4589 /// 4590 @safe pure nothrow unittest 4591 { 4592 import std.algorithm.comparison : equal; 4593 4594 // hellö as a range of `char`s, which are UTF-8 4595 assert("hell\u00F6".byUTF!char().equal(['h', 'e', 'l', 'l', 0xC3, 0xB6])); 4596 4597 // `wchar`s are able to hold the ö in a single element (UTF-16 code unit) 4598 assert("hell\u00F6".byUTF!wchar().equal(['h', 'e', 'l', 'l', 'ö'])); 4599 4600 // 𐐷 is four code units in UTF-8, two in UTF-16, and one in UTF-32 4601 assert("𐐷".byUTF!char().equal([0xF0, 0x90, 0x90, 0xB7])); 4602 assert("𐐷".byUTF!wchar().equal([0xD801, 0xDC37])); 4603 assert("𐐷".byUTF!dchar().equal([0x00010437])); 4604 } 4605 4606 /// 4607 @safe unittest 4608 { 4609 import std.algorithm.comparison : equal; 4610 import std.exception : assertThrown; 4611 4612 assert("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.yes).equal("hello\uFFFDetty")); 4613 assertThrown!UTFException("hello\xF0betty".byChar.byUTF!(dchar, UseReplacementDchar.no).equal("hello betty")); 4614 } 4615 4616 @safe unittest 4617 { 4618 { 4619 wchar[] s = ['a', 'b', 0x219]; 4620 auto r = s.byUTF!char; 4621 assert(isBidirectionalRange!(typeof(r))); 4622 assert(r.back == 0x99); 4623 r.popBack; 4624 assert(r.back == 0xc8); 4625 r.popBack; 4626 assert(r.back == 'b'); 4627 4628 } 4629 4630 { 4631 wchar[] s = ['a', 'b', 0x219]; 4632 auto r = s.byUTF!wchar; 4633 uint i; 4634 assert(isBidirectionalRange!(typeof(r))); 4635 assert(r.back == 0x219); 4636 r.popBack; 4637 assert(r.back == 'b'); 4638 } 4639 4640 { 4641 wchar[] s = ['a', 'b', 0x219]; 4642 auto r = s.byUTF!dchar; 4643 assert(isBidirectionalRange!(typeof(r))); 4644 assert(r.back == 0x219); 4645 r.popBack; 4646 assert(r.back == 'b'); 4647 } 4648 4649 { 4650 dchar[] s = ['𐐷', '😁']; 4651 auto r = s.byUTF!wchar; 4652 assert(r.back == 0xde01); 4653 r.popBack; 4654 assert(r.back == 0xd83d); 4655 r.popBack; 4656 assert(r.back == 0xdc37); 4657 r.popBack; 4658 assert(r.back == 0xd801); 4659 } 4660 4661 { 4662 dchar[] s = ['𐐷', '😁']; 4663 auto r = s.byUTF!char; 4664 char[] res; 4665 while (!r.empty) 4666 { 4667 res ~= r.back; 4668 r.popBack; 4669 } 4670 import std.algorithm.comparison : equal; 4671 assert(res.equal([0x81, 0x98, 0x9f, 0xf0, 0xb7, 0x90, 0x90, 0xf0])); 4672 } 4673 4674 { 4675 dchar[] res; 4676 auto r = ['a', 'b', 'c', 'd', 'e'].byUTF!dchar; 4677 while (!r.empty) 4678 { 4679 res ~= r.back; 4680 r.popBack; 4681 } 4682 import std.algorithm.comparison : equal; 4683 assert(res.equal(['e', 'd', 'c', 'b', 'a'])); 4684 } 4685 4686 { 4687 //testing the save() function 4688 wchar[] s = ['Ă','ț']; 4689 4690 auto rc = s.byUTF!char; 4691 rc.popBack; 4692 auto rcCopy = rc.save; 4693 assert(rc.back == rcCopy.back); 4694 assert(rcCopy.back == 0xc8); 4695 4696 auto rd = s.byUTF!dchar; 4697 rd.popBack; 4698 auto rdCopy = rd.save; 4699 assert(rd.back == rdCopy.back); 4700 assert(rdCopy.back == 'Ă'); 4701 } 4702 } 4703 4704 /// 4705 @safe pure nothrow unittest 4706 { 4707 import std.range.primitives; 4708 wchar[] s = ['ă', 'î']; 4709 4710 auto rc = s.byUTF!char; 4711 static assert(isBidirectionalRange!(typeof(rc))); 4712 assert(rc.back == 0xae); 4713 rc.popBack; 4714 assert(rc.back == 0xc3); 4715 rc.popBack; 4716 assert(rc.back == 0x83); 4717 rc.popBack; 4718 assert(rc.back == 0xc4); 4719 4720 auto rw = s.byUTF!wchar; 4721 static assert(isBidirectionalRange!(typeof(rw))); 4722 assert(rw.back == 'î'); 4723 rw.popBack; 4724 assert(rw.back == 'ă'); 4725 4726 auto rd = s.byUTF!dchar; 4727 static assert(isBidirectionalRange!(typeof(rd))); 4728 assert(rd.back == 'î'); 4729 rd.popBack; 4730 assert(rd.back == 'ă'); 4731 }