1 // Written in the D programming language. 2 3 /++ 4 $(P The `std.uni` module provides an implementation 5 of fundamental Unicode algorithms and data structures. 6 This doesn't include UTF encoding and decoding primitives, 7 see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) 8 for this functionality. ) 9 10 $(SCRIPT inhibitQuickIndex = 1;) 11 $(DIVC quickindex, 12 $(BOOKTABLE, 13 $(TR $(TH Category) $(TH Functions)) 14 $(TR $(TD Decode) $(TD 15 $(LREF byCodePoint) 16 $(LREF byGrapheme) 17 $(LREF decodeGrapheme) 18 $(LREF graphemeStride) 19 )) 20 $(TR $(TD Comparison) $(TD 21 $(LREF icmp) 22 $(LREF sicmp) 23 )) 24 $(TR $(TD Classification) $(TD 25 $(LREF isAlpha) 26 $(LREF isAlphaNum) 27 $(LREF isCodepointSet) 28 $(LREF isControl) 29 $(LREF isFormat) 30 $(LREF isGraphical) 31 $(LREF isIntegralPair) 32 $(LREF isMark) 33 $(LREF isNonCharacter) 34 $(LREF isNumber) 35 $(LREF isPrivateUse) 36 $(LREF isPunctuation) 37 $(LREF isSpace) 38 $(LREF isSurrogate) 39 $(LREF isSurrogateHi) 40 $(LREF isSurrogateLo) 41 $(LREF isSymbol) 42 $(LREF isWhite) 43 )) 44 $(TR $(TD Normalization) $(TD 45 $(LREF NFC) 46 $(LREF NFD) 47 $(LREF NFKD) 48 $(LREF NormalizationForm) 49 $(LREF normalize) 50 )) 51 $(TR $(TD Decompose) $(TD 52 $(LREF decompose) 53 $(LREF decomposeHangul) 54 $(LREF UnicodeDecomposition) 55 )) 56 $(TR $(TD Compose) $(TD 57 $(LREF compose) 58 $(LREF composeJamo) 59 )) 60 $(TR $(TD Sets) $(TD 61 $(LREF CodepointInterval) 62 $(LREF CodepointSet) 63 $(LREF InversionList) 64 $(LREF unicode) 65 )) 66 $(TR $(TD Trie) $(TD 67 $(LREF codepointSetTrie) 68 $(LREF CodepointSetTrie) 69 $(LREF codepointTrie) 70 $(LREF CodepointTrie) 71 $(LREF toTrie) 72 $(LREF toDelegate) 73 )) 74 $(TR $(TD Casing) $(TD 75 $(LREF asCapitalized) 76 $(LREF asLowerCase) 77 $(LREF asUpperCase) 78 $(LREF isLower) 79 $(LREF isUpper) 80 $(LREF toLower) 81 $(LREF toLowerInPlace) 82 $(LREF toUpper) 83 $(LREF toUpperInPlace) 84 )) 85 $(TR $(TD Utf8Matcher) $(TD 86 $(LREF isUtfMatcher) 
87 $(LREF MatcherConcept) 88 $(LREF utfMatcher) 89 )) 90 $(TR $(TD Separators) $(TD 91 $(LREF lineSep) 92 $(LREF nelSep) 93 $(LREF paraSep) 94 )) 95 $(TR $(TD Building blocks) $(TD 96 $(LREF allowedIn) 97 $(LREF combiningClass) 98 $(LREF Grapheme) 99 )) 100 )) 101 102 $(P All primitives listed operate on Unicode characters and 103 sets of characters. For functions which operate on ASCII characters 104 and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). 105 For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms 106 used throughout this module see the $(S_LINK Terminology, terminology) section 107 below. 108 ) 109 $(P The focus of this module is the core needs of developing Unicode-aware 110 applications. To that effect it provides the following optimized primitives: 111 ) 112 $(UL 113 $(LI Character classification by category and common properties: 114 $(LREF isAlpha), $(LREF isWhite) and others. 115 ) 116 $(LI 117 Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). 118 ) 119 $(LI 120 Converting text to any of the four normalization forms via $(LREF normalize). 121 ) 122 $(LI 123 Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) 124 by user-perceived characters, that is by $(LREF Grapheme) clusters. 125 ) 126 $(LI 127 Decomposing and composing of individual character(s) according to canonical 128 or compatibility rules, see $(LREF compose) and $(LREF decompose), 129 including the specific version for Hangul syllables $(LREF composeJamo) 130 and $(LREF decomposeHangul). 131 ) 132 ) 133 $(P It's recognized that an application may need further enhancements 134 and extensions, such as less commonly known algorithms, 135 or tailoring existing ones for region specific needs. To help users 136 with building any extra functionality beyond the core primitives, 137 the module provides: 138 ) 139 $(UL 140 $(LI 141 $(LREF CodepointSet), a type for easy manipulation of sets of characters. 
142 Besides the typical set algebra it provides an unusual feature: 143 a D source code generator for detection of $(CODEPOINTS) in this set. 144 This is a boon for meta-programming parser frameworks, 145 and is used internally to power classification in small 146 sets like $(LREF isWhite). 147 ) 148 $(LI 149 A way to construct optimal packed multi-stage tables also known as a 150 special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). 151 The functions $(LREF codepointTrie), $(LREF codepointSetTrie) 152 construct custom tries that map dchar to value. 153 The end result is a fast and predictable $(BIGOH 1) lookup that powers 154 functions like $(LREF isAlpha) and $(LREF combiningClass), 155 but for user-defined data sets. 156 ) 157 $(LI 158 A useful technique for Unicode-aware parsers that perform 159 character classification of encoded $(CODEPOINTS) 160 is to avoid unnecessary decoding at all costs. 161 $(LREF utfMatcher) provides an improvement over the usual workflow 162 of decode-classify-process, combining the decoding and classification 163 steps. By extracting necessary bits directly from encoded 164 $(S_LINK Code unit, code units) matchers achieve 165 significant performance improvements. See $(LREF MatcherConcept) for 166 the common interface of UTF matchers. 167 ) 168 $(LI 169 Generally useful building blocks for customized normalization: 170 $(LREF combiningClass) for querying combining class 171 and $(LREF allowedIn) for testing the Quick_Check 172 property of a given normalization form. 173 ) 174 $(LI 175 Access to a large selection of commonly used sets of $(CODEPOINTS). 176 $(S_LINK Unicode properties, Supported sets) include Script, 177 Block and General Category. The exact contents of a set can be 178 observed in the CLDR utility, on the 179 $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page 180 of the Unicode website. 181 See $(LREF unicode) for easy and (optionally) compile-time checked set 182 queries. 
183 ) 184 ) 185 $(SECTION Synopsis) 186 --- 187 import std.uni; 188 void main() 189 { 190 // initialize code point sets using script/block or property name 191 // now 'set' contains code points from both scripts. 192 auto set = unicode("Cyrillic") | unicode("Armenian"); 193 // same thing but simpler and checked at compile-time 194 auto ascii = unicode.ASCII; 195 auto currency = unicode.Currency_Symbol; 196 197 // easy set ops 198 auto a = set & ascii; 199 assert(a.empty); // as it has no intersection with ascii 200 a = set | ascii; 201 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 202 203 // some properties of code point sets 204 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 205 // testing presence of a code point in a set 206 // is just fine, it is O(logN) 207 assert(!b['$']); 208 assert(!b['\u058F']); // Armenian dram sign 209 assert(b['¥']); 210 211 // building fast lookup tables, these guarantee O(1) complexity 212 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 213 auto oneTrie = toTrie!1(b); 214 // 2-level far more compact but typically slightly slower 215 auto twoTrie = toTrie!2(b); 216 // 3-level even smaller, and a bit slower yet 217 auto threeTrie = toTrie!3(b); 218 assert(oneTrie['£']); 219 assert(twoTrie['£']); 220 assert(threeTrie['£']); 221 222 // build the trie with the most sensible trie level 223 // and bind it as a functor 224 auto cyrillicOrArmenian = toDelegate(set); 225 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 226 assert(balance == "ընկեր!"); 227 // compatible with bool delegate(dchar) 228 bool delegate(dchar) bindIt = cyrillicOrArmenian; 229 230 // Normalization 231 string s = "Plain ascii (and not only), is always normalized!"; 232 assert(s is normalize(s));// is the same string 233 234 string nonS = "A\u0308ffin"; // A ligature 235 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard 236 assert(nS == "Äffin"); 237 assert(nS != nonS); 238 string composed = 
"Äffin"; 239 240 assert(normalize!NFD(composed) == "A\u0308ffin"); 241 // to NFKD, compatibility decomposition useful for fuzzy matching/searching 242 assert(normalize!NFKD("2¹⁰") == "210"); 243 } 244 --- 245 $(SECTION Terminology) 246 $(P The following is a list of important Unicode notions 247 and definitions. Any conventions used specifically in this 248 module alone are marked as such. The descriptions are based on the formal 249 definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, 250 chapter three of The Unicode Standard Core Specification.) 251 ) 252 $(P $(DEF Abstract character) A unit of information used for the organization, 253 control, or representation of textual data. 254 Note that: 255 $(UL 256 $(LI When representing data, the nature of that data 257 is generally symbolic as opposed to some other 258 kind of data (for example, visual). 259 ) 260 $(LI An abstract character has no concrete form 261 and should not be confused with a $(S_LINK Glyph, glyph). 262 ) 263 $(LI An abstract character does not necessarily 264 correspond to what a user thinks of as a “character” 265 and should not be confused with a $(LREF Grapheme). 266 ) 267 $(LI The abstract characters encoded (see Encoded character) 268 are known as Unicode abstract characters. 269 ) 270 $(LI Abstract characters not directly 271 encoded by the Unicode Standard can often be 272 represented by the use of combining character sequences. 273 ) 274 ) 275 ) 276 $(P $(DEF Canonical decomposition) 277 The decomposition of a character or character sequence 278 that results from recursively applying the canonical 279 mappings found in the Unicode Character Database 280 and these described in Conjoining Jamo Behavior 281 (section 12 of 282 $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). 
283 ) 284 $(P $(DEF Canonical composition) 285 The precise definition of the Canonical composition 286 is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, 287 Unicode Conformance) section 11. 288 Informally it's the process that does the reverse of the canonical 289 decomposition with the addition of certain rules 290 that e.g. prevent legacy characters from appearing in the composed result. 291 ) 292 $(P $(DEF Canonical equivalent) 293 Two character sequences are said to be canonical equivalents if 294 their full canonical decompositions are identical. 295 ) 296 $(P $(DEF Character) Typically differs by context. 297 For the purpose of this documentation the term $(I character) 298 implies $(I encoded character), that is, a code point having 299 an assigned abstract character (a symbolic meaning). 300 ) 301 $(P $(DEF Code point) Any value in the Unicode codespace; 302 that is, the range of integers from 0 to 10FFFF (hex). 303 Not all code points are assigned to encoded characters. 304 ) 305 $(P $(DEF Code unit) The minimal bit combination that can represent 306 a unit of encoded text for processing or interchange. 307 Depending on the encoding this could be: 308 8-bit code units in the UTF-8 (`char`), 309 16-bit code units in the UTF-16 (`wchar`), 310 and 32-bit code units in the UTF-32 (`dchar`). 311 $(I Note that in UTF-32, a code unit is a code point 312 and is represented by the D `dchar` type.) 313 ) 314 $(P $(DEF Combining character) A character with the General Category 315 of Combining Mark(M). 316 $(UL 317 $(LI All characters with non-zero canonical combining class 318 are combining characters, but the reverse is not the case: 319 there are combining characters with a zero combining class. 320 ) 321 $(LI These characters are not normally used in isolation 322 unless they are being described. They include such characters 323 as accents, diacritics, Hebrew points, Arabic vowel signs, 324 and Indic matras. 
325 ) 326 ) 327 ) 328 $(P $(DEF Combining class) 329 A numerical value used by the Unicode Canonical Ordering Algorithm 330 to determine which sequences of combining marks are to be 331 considered canonically equivalent and which are not. 332 ) 333 $(P $(DEF Compatibility decomposition) 334 The decomposition of a character or character sequence that results 335 from recursively applying both the compatibility mappings and 336 the canonical mappings found in the Unicode Character Database, and those 337 described in Conjoining Jamo Behavior, until no characters 338 can be further decomposed. 339 ) 340 $(P $(DEF Compatibility equivalent) 341 Two character sequences are said to be compatibility 342 equivalents if their full compatibility decompositions are identical. 343 ) 344 $(P $(DEF Encoded character) An association (or mapping) 345 between an abstract character and a code point. 346 ) 347 $(P $(DEF Glyph) The actual, concrete image of a glyph representation 348 having been rasterized or otherwise imaged onto some display surface. 349 ) 350 $(P $(DEF Grapheme base) A character with the property 351 Grapheme_Base, or any standard Korean syllable block. 352 ) 353 $(P $(DEF Grapheme cluster) Defined as the text between 354 grapheme boundaries as specified by Unicode Standard Annex #29, 355 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). 356 Important general properties of a grapheme: 357 $(UL 358 $(LI The grapheme cluster represents a horizontally segmentable 359 unit of text, consisting of some grapheme base (which may 360 consist of a Korean syllable) together with any number of 361 nonspacing marks applied to it. 362 ) 363 $(LI A grapheme cluster typically starts with a grapheme base 364 and then extends across any subsequent sequence of nonspacing marks. 365 A grapheme cluster is most directly relevant to text rendering and 366 processes such as cursor placement and text selection in editing, 367 but may also be relevant to comparison and searching. 
368 ) 369 $(LI For many processes, a grapheme cluster behaves as if it was a 370 single character with the same properties as its grapheme base. 371 Effectively, nonspacing marks apply $(I graphically) to the base, 372 but do not change its properties. 373 ) 374 ) 375 $(P This module defines a number of primitives that work with graphemes: 376 $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). 377 All of them are using $(I extended grapheme) boundaries 378 as defined in the aforementioned standard annex. 379 ) 380 ) 381 $(P $(DEF Nonspacing mark) A combining character with the 382 General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). 383 ) 384 $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. 385 ) 386 $(SECTION Normalization) 387 $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) 388 or $(S_LINK Compatibility equivalent, compatibility equivalent) 389 characters in the Unicode Standard make it necessary to have a full, formal 390 definition of equivalence for Unicode strings. 391 String equivalence is determined by a process called normalization, 392 whereby strings are converted into forms which are compared 393 directly for identity. This is the primary goal of the normalization process, 394 see the function $(LREF normalize) to convert into any of 395 the four defined forms. 396 ) 397 $(P A very important attribute of the Unicode Normalization Forms 398 is that they must remain stable between versions of the Unicode Standard. 399 A Unicode string normalized to a particular Unicode Normalization Form 400 in one version of the standard is guaranteed to remain in that Normalization 401 Form for implementations of future versions of the standard. 402 ) 403 $(P The Unicode Standard specifies four normalization forms. 
404 Informally, two of these forms are defined by maximal decomposition 405 of equivalent sequences, and two of these forms are defined 406 by maximal $(I composition) of equivalent sequences. 407 $(UL 408 $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, 409 canonical decomposition) of a character sequence.) 410 $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, 411 compatibility decomposition) of a character sequence.) 412 $(LI Normalization Form C (NFC): The canonical composition of the 413 $(S_LINK Canonical decomposition, canonical decomposition) 414 of a coded character sequence.) 415 $(LI Normalization Form KC (NFKC): The canonical composition 416 of the $(S_LINK Compatibility decomposition, 417 compatibility decomposition) of a character sequence) 418 ) 419 ) 420 $(P The choice of the normalization form depends on the particular use case. 421 NFC is the best form for general text, since it's more compatible with 422 strings converted from legacy encodings. NFKC is the preferred form for 423 identifiers, especially where there are security concerns. NFD and NFKD 424 are the most useful for internal processing. 425 ) 426 $(SECTION Construction of lookup tables) 427 $(P The Unicode standard describes a set of algorithms that 428 depend on having the ability to quickly look up various properties 429 of a code point. Given the codespace of about 1 million $(CODEPOINTS), 430 it is not a trivial task to provide a space-efficient solution for 431 the multitude of properties. 432 ) 433 $(P Common approaches such as hash-tables or binary search over 434 sorted code point intervals (as in $(LREF InversionList)) are insufficient. 435 Hash-tables have enormous memory footprint and binary search 436 over intervals is not fast enough for some heavy-duty algorithms. 
437 ) 438 $(P The recommended solution (see Unicode Implementation Guidelines) 439 is using multi-stage tables that are an implementation of the 440 $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer 441 keys and a fixed number of stages. For the remainder of the section 442 this will be called a fixed trie. The following describes a particular 443 implementation that is aimed for the speed of access at the expense 444 of ideal size savings. 445 ) 446 $(P Taking a 2-level Trie as an example the principle of operation is as follows. 447 Split the number of bits in a key (code point, 21 bits) into 2 components 448 (e.g. 13 and 8). The first is the number of bits in the index of the trie 449 and the other is the number of bits in each page of the trie. 450 The layout of the trie is then an array of size 2^^bits-of-index followed 451 by an array of memory chunks of size 2^^bits-of-page/bits-per-element. 452 ) 453 $(P The number of pages is variable (but not less than 1) 454 unlike the number of entries in the index. The slots of the index 455 all have to contain a number of a page that is present. The lookup is then 456 just a couple of operations - slice the upper bits, 457 lookup an index for these, take a page at this index and use 458 the lower bits as an offset within this page. 459 460 Assuming that pages are laid out consecutively 461 in one array at `pages`, the pseudo-code is: 462 ) 463 --- 464 auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; 465 pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; 466 --- 467 $(P Where if `elemsPerPage` is a power of 2 the whole process is 468 a handful of simple instructions and 2 array reads. Subsequent levels 469 of the trie are introduced by recursing on this notion - the index array 470 is treated as values. The number of bits in index is then again 471 split into 2 parts, with pages over 'current-index' and the new 'upper-index'. 
472 ) 473 474 $(P For completeness a level 1 trie is simply an array. 475 The current implementation takes advantage of bit-packing values 476 when the range is known to be limited in advance (such as `bool`). 477 See also $(LREF BitPacked) for enforcing it manually. 478 The major size advantage however comes from the fact 479 that multiple $(B identical pages on every level are merged) by construction. 480 ) 481 $(P The process of constructing a trie is more involved and is hidden from 482 the user in a form of the convenience functions $(LREF codepointTrie), 483 $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). 484 In general a set or built-in AA with `dchar` type 485 can be turned into a trie. The trie object in this module 486 is read-only (immutable); it's effectively frozen after construction. 487 ) 488 $(SECTION Unicode properties) 489 $(P This is a full list of Unicode properties accessible through $(LREF unicode) 490 with specific helpers per category nested within. Consult the 491 $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) 492 when in doubt about the contents of a particular set. 493 ) 494 $(P General category sets listed below are only accessible with the 495 $(LREF unicode) shorthand accessor.) 496 $(BOOKTABLE $(B General category ), 497 $(TR $(TH Abb.) $(TH Long form) 498 $(TH Abb.) $(TH Long form)$(TH Abb.) 
$(TH Long form)) 499 $(TR $(TD L) $(TD Letter) 500 $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) 501 $(TR $(TD Ll) $(TD Lowercase_Letter) 502 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) 503 $(TR $(TD Lm) $(TD Modifier_Letter) 504 $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) 505 $(TR $(TD Lo) $(TD Other_Letter) 506 $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) 507 $(TR $(TD Lt) $(TD Titlecase_Letter) 508 $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) 509 $(TR $(TD Lu) $(TD Uppercase_Letter) 510 $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) 511 $(TR $(TD M) $(TD Mark) 512 $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) 513 $(TR $(TD Mc) $(TD Spacing_Mark) 514 $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) 515 $(TR $(TD Me) $(TD Enclosing_Mark) 516 $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) 517 $(TR $(TD Mn) $(TD Nonspacing_Mark) 518 $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) 519 $(TR $(TD C) $(TD Other) 520 $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) 521 $(TR $(TD Cc) $(TD Control) $(TD Pf) 522 $(TD Final_Punctuation) $(TD -) $(TD Any)) 523 $(TR $(TD Cf) $(TD Format) 524 $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) 525 ) 526 $(P Sets for other commonly useful properties that are 527 accessible with $(LREF unicode):) 528 $(BOOKTABLE $(B Common binary properties), 529 $(TR $(TH Name) $(TH Name) $(TH Name)) 530 $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) 531 $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) 532 $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) 533 $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) 534 $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) 535 $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) 536 $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm)) 537 $(TR $(TD 
Deprecated) $(TD Math) $(TD Terminal_Punctuation)) 538 $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) 539 $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) 540 $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) 541 $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) 542 $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) 543 $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) 544 $(TR $(TD Hyphen) $(TD Other_Lowercase) ) 545 $(TR $(TD ID_Continue) $(TD Other_Math) ) 546 ) 547 $(P Below is the table with block names accepted by $(LREF unicode.block). 548 Note that the shorthand version $(LREF unicode) requires "In" 549 to be prepended to the names of blocks so as to disambiguate 550 scripts and blocks. 551 ) 552 $(BOOKTABLE $(B Blocks), 553 $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) 554 $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) 555 $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) 556 $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) 557 $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) 558 $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) 559 $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) 560 $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) 561 $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) 562 $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) 563 $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) 564 $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) 565 $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) 566 $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) 567 $(TR $(TD Avestan) $(TD Hangul 
Compatibility Jamo) $(TD Oriya)) 568 $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) 569 $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) 570 $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) 571 $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) 572 $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) 573 $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) 574 $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) 575 $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) 576 $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) 577 $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) 578 $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) 579 $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) 580 $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) 581 $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) 582 $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) 583 $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) 584 $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) 585 $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) 586 $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) 587 $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) 588 $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) 589 $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) 590 $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) 591 $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) 592 $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) 593 $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) 594 $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD 
Supplemental Punctuation)) 595 $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) 596 $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) 597 $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) 598 $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) 599 $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) 600 $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa)) 601 $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) 602 $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) 603 $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) 604 $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) 605 $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) 606 $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) 607 $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) 608 $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) 609 $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) 610 $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) 611 $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) 612 $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) 613 $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) 614 $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) 615 $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) 616 $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) 617 $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) 618 $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) 619 $(TR $(TD Domino Tiles) $(TD 
Miao) $(TD Variation Selectors Supplement)) 620 $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) 621 $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) 622 $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) 623 $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) 624 $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) 625 $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) 626 $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) 627 ) 628 $(P Below is the table with script names accepted by $(LREF unicode.script) 629 and by the shorthand version $(LREF unicode):) 630 $(BOOKTABLE $(B Scripts), 631 $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) 632 $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) 633 $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) 634 $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) 635 $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) 636 $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) 637 $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) 638 $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) 639 $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) 640 $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) 641 $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) 642 $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) 643 $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) 644 $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) 645 $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) 646 $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) 647 $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) 648 $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) 649 $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) 650 $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) 651 $(TR $(TD Cypriot) $(TD Lycian) 
$(TD Tagbanwa)) 652 $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) 653 $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) 654 $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) 655 $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) 656 $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) 657 $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) 658 $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana)) 659 $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) 660 $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) 661 $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) 662 $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) 663 $(TR $(TD Han) $(TD Ogham) $(TD Vai)) 664 $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) 665 ) 666 $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) 667 $(BOOKTABLE $(B Hangul syllable type), 668 $(TR $(TH Abb.) $(TH Long form)) 669 $(TR $(TD L) $(TD Leading_Jamo)) 670 $(TR $(TD LV) $(TD LV_Syllable)) 671 $(TR $(TD LVT) $(TD LVT_Syllable) ) 672 $(TR $(TD T) $(TD Trailing_Jamo)) 673 $(TR $(TD V) $(TD Vowel_Jamo)) 674 ) 675 References: 676 $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), 677 $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), 678 $(HTTP www.unicode.org, The Unicode Consortium), 679 $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), 680 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) 681 $(HTTP www.unicode.org/uni2book/ch05.pdf, 682 Unicode Implementation Guidelines) 683 $(HTTP www.unicode.org/uni2book/ch03.pdf, 684 Unicode Conformance) 685 Trademarks: 686 Unicode(tm) is a trademark of Unicode, Inc. 687 688 Copyright: Copyright 2013 - 689 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 
690 Authors: Dmitry Olshansky 691 Source: $(PHOBOSSRC std/uni/package.d) 692 Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2) 693 694 Macros: 695 696 SECTION = <h3><a id="$1">$0</a></h3> 697 DEF = <div><a id="$1"><i>$0</i></a></div> 698 S_LINK = <a href="#$1">$+</a> 699 CODEPOINT = $(S_LINK Code point, code point) 700 CODEPOINTS = $(S_LINK Code point, code points) 701 CHARACTER = $(S_LINK Character, character) 702 CHARACTERS = $(S_LINK Character, characters) 703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster) 704 +/ 705 module std.uni; 706 707 import std.meta : AliasSeq; 708 import std.range.primitives : back, ElementEncodingType, ElementType, empty, 709 front, hasLength, hasSlicing, isForwardRange, isInputRange, 710 isRandomAccessRange, popFront, put, save; 711 import std.traits : isConvertibleToString, isIntegral, isSomeChar, 712 isSomeString, Unqual, isDynamicArray; 713 // debug = std_uni; 714 715 import std.internal.unicode_tables; // generated file 716 717 debug(std_uni) import std.stdio; // writefln, writeln 718 719 private: 720 721 722 void copyBackwards(T,U)(T[] src, U[] dest) 723 { 724 assert(src.length == dest.length); 725 for (size_t i=src.length; i-- > 0; ) 726 dest[i] = src[i]; 727 } 728 729 void copyForward(T,U)(T[] src, U[] dest) 730 { 731 assert(src.length == dest.length); 732 for (size_t i=0; i<src.length; i++) 733 dest[i] = src[i]; 734 } 735 736 // TODO: update to reflect all major CPUs supporting unaligned reads 737 version (X86) 738 enum hasUnalignedReads = true; 739 else version (X86_64) 740 enum hasUnalignedReads = true; 741 else version (SystemZ) 742 enum hasUnalignedReads = true; 743 else 744 enum hasUnalignedReads = false; // better be safe then sorry 745 746 public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator. 747 public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator. 
public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.

// test the intro example
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // 'A' followed by U+0308 (combining diaeresis)
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}

enum lastDchar = 0x10FFFF;

// Checked conversion to an integral type T: asserts that `from` lies within
// [T.min, T.max] and then casts.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(from <= T.max && from >= T.min);
    return cast(T) from;
}

// Checked conversion to a bit-packed type T: asserts that `from` fits into
// bitSizeOf!T bits and wraps it in T.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    assert(from <= 2^^bitSizeOf!T-1);
    return T(cast(TypeOfBitPacked!T) from);
}

// Identity overload: source and target types already match.
auto force(T, F)(F from)
if (is(T == F))
{
    return from;
}

// Repeat `times` times the bit-pattern in `val`, assuming its length is `bits`.
// Both counts are template parameters, so the recursion is resolved at compile time.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
        {
            // Shift a size_t rather than the int literal `1`: an int shift
            // cannot build masks of 32 or more bits on 64-bit targets.
            return val ? (cast(size_t) 1 << times) - 1 : 0;
        }
    }
    else static if (times % 2)
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}

@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
    // single-bit patterns, including a mask wider than an int
    assert(replicateBits!(8, 1)(1) == 0xFF);
    assert(replicateBits!(8, 1)(0) == 0);
    static if (size_t.sizeof == 8)
        assert(replicateBits!(40, 1)(1) == 0xFF_FFFF_FFFF);
}

// multiple arrays squashed into one memory block
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;
    this(size_t[] sizes...)
@safe pure nothrow 867 { 868 assert(dim == sizes.length); 869 size_t full_size; 870 foreach (i, v; Types) 871 { 872 full_size += spaceFor!(bitSizeOf!v)(sizes[i]); 873 sz[i] = sizes[i]; 874 static if (i >= 1) 875 offsets[i] = offsets[i-1] + 876 spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]); 877 } 878 879 storage = new size_t[full_size]; 880 } 881 882 this(const(size_t)[] raw_offsets, 883 const(size_t)[] raw_sizes, 884 return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc 885 { 886 offsets[] = raw_offsets[]; 887 sz[] = raw_sizes[]; 888 storage = data; 889 } 890 891 @property auto slice(size_t n)()inout pure nothrow @nogc 892 { 893 auto ptr = raw_ptr!n; 894 return packedArrayView!(Types[n])(ptr, sz[n]); 895 } 896 897 @property auto ptr(size_t n)()inout pure nothrow @nogc 898 { 899 auto ptr = raw_ptr!n; 900 return inout(PackedPtr!(Types[n]))(ptr); 901 } 902 903 template length(size_t n) 904 { 905 @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; } 906 907 @property void length(size_t new_size) 908 { 909 if (new_size > sz[n]) 910 {// extend 911 size_t delta = (new_size - sz[n]); 912 sz[n] += delta; 913 delta = spaceFor!(bitSizeOf!(Types[n]))(delta); 914 storage.length += delta;// extend space at end 915 // raw_slice!x must follow resize as it could be moved! 916 // next stmts move all data past this array, last-one-goes-first 917 static if (n != dim-1) 918 { 919 auto start = raw_ptr!(n+1); 920 // len includes delta 921 size_t len = (storage.ptr+storage.length-start); 922 923 copyBackwards(start[0 .. len-delta], start[delta .. len]); 924 925 start[0 .. delta] = 0; 926 // offsets are used for raw_slice, ptr etc. 927 foreach (i; n+1 .. 
dim) 928 offsets[i] += delta; 929 } 930 } 931 else if (new_size < sz[n]) 932 {// shrink 933 size_t delta = (sz[n] - new_size); 934 sz[n] -= delta; 935 delta = spaceFor!(bitSizeOf!(Types[n]))(delta); 936 // move all data past this array, forward direction 937 static if (n != dim-1) 938 { 939 auto start = raw_ptr!(n+1); 940 size_t len = (storage.ptr+storage.length-start); 941 copyForward(start[0 .. len-delta], start[delta .. len]); 942 943 // adjust offsets last, they affect raw_slice 944 foreach (i; n+1 .. dim) 945 offsets[i] -= delta; 946 } 947 storage.length -= delta; 948 } 949 // else - NOP 950 } 951 } 952 953 @property size_t bytes(size_t n=size_t.max)() const @safe 954 { 955 static if (n == size_t.max) 956 return storage.length*size_t.sizeof; 957 else static if (n != Types.length-1) 958 return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof; 959 else 960 return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof; 961 } 962 963 void store(OutRange)(scope OutRange sink) const 964 if (isOutputRange!(OutRange, char)) 965 { 966 import std.format.write : formattedWrite; 967 formattedWrite(sink, "[%( 0x%x, %)]", offsets[]); 968 formattedWrite(sink, ", [%( 0x%x, %)]", sz[]); 969 formattedWrite(sink, ", [%( 0x%x, %)]", storage); 970 } 971 972 private: 973 import std.meta : staticMap; 974 @property auto raw_ptr(size_t n)()inout pure nothrow @nogc 975 { 976 static if (n == 0) 977 return storage.ptr; 978 else 979 { 980 return storage.ptr+offsets[n]; 981 } 982 } 983 enum dim = Types.length; 984 size_t[dim] offsets;// offset for level x 985 size_t[dim] sz;// size of level x 986 alias bitWidth = staticMap!(bitSizeOf, Types); 987 size_t[] storage; 988 } 989 990 @system unittest 991 { 992 import std.conv : text; 993 enum dg = (){ 994 // sizes are: 995 // lvl0: 3, lvl1 : 2, lvl2: 1 996 auto m = MultiArray!(int, ubyte, int)(3,2,1); 997 998 static void check(size_t k, T)(ref T m, int n) 999 { 1000 foreach (i; 0 .. 
n) 1001 assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n])); 1002 } 1003 1004 static void checkB(size_t k, T)(ref T m, int n) 1005 { 1006 foreach (i; 0 .. n) 1007 assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n])); 1008 } 1009 1010 static void fill(size_t k, T)(ref T m, int n) 1011 { 1012 foreach (i; 0 .. n) 1013 m.slice!(k)[i] = force!ubyte(i+1); 1014 } 1015 1016 static void fillB(size_t k, T)(ref T m, int n) 1017 { 1018 foreach (i; 0 .. n) 1019 m.slice!(k)[i] = force!ubyte(n-i); 1020 } 1021 1022 m.length!1 = 100; 1023 fill!1(m, 100); 1024 check!1(m, 100); 1025 1026 m.length!0 = 220; 1027 fill!0(m, 220); 1028 check!1(m, 100); 1029 check!0(m, 220); 1030 1031 m.length!2 = 17; 1032 fillB!2(m, 17); 1033 checkB!2(m, 17); 1034 check!0(m, 220); 1035 check!1(m, 100); 1036 1037 m.length!2 = 33; 1038 checkB!2(m, 17); 1039 fillB!2(m, 33); 1040 checkB!2(m, 33); 1041 check!0(m, 220); 1042 check!1(m, 100); 1043 1044 m.length!1 = 195; 1045 fillB!1(m, 195); 1046 checkB!1(m, 195); 1047 checkB!2(m, 33); 1048 check!0(m, 220); 1049 1050 auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10); 1051 marr.length!0 = 15; 1052 marr.length!1 = 30; 1053 fill!1(marr, 30); 1054 fill!0(marr, 15); 1055 check!1(marr, 30); 1056 check!0(marr, 15); 1057 return 0; 1058 }; 1059 enum ct = dg(); 1060 auto rt = dg(); 1061 } 1062 1063 @system unittest 1064 {// more bitpacking tests 1065 import std.conv : text; 1066 1067 alias Bitty = 1068 MultiArray!(BitPacked!(size_t, 3) 1069 , BitPacked!(size_t, 4) 1070 , BitPacked!(size_t, 3) 1071 , BitPacked!(size_t, 6) 1072 , bool); 1073 alias fn1 = sliceBits!(13, 16); 1074 alias fn2 = sliceBits!( 9, 13); 1075 alias fn3 = sliceBits!( 6, 9); 1076 alias fn4 = sliceBits!( 0, 6); 1077 static void check(size_t lvl, MA)(ref MA arr){ 1078 for (size_t i = 0; i< arr.length!lvl; i++) 1079 assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i])); 1080 } 1081 1082 
static void fillIdx(size_t lvl, MA)(ref MA arr){ 1083 for (size_t i = 0; i< arr.length!lvl; i++) 1084 arr.slice!(lvl)[i] = i; 1085 } 1086 Bitty m1; 1087 1088 m1.length!4 = 10; 1089 m1.length!3 = 2^^6; 1090 m1.length!2 = 2^^3; 1091 m1.length!1 = 2^^4; 1092 m1.length!0 = 2^^3; 1093 1094 m1.length!4 = 2^^16; 1095 1096 for (size_t i = 0; i< m1.length!4; i++) 1097 m1.slice!(4)[i] = i % 2; 1098 1099 fillIdx!1(m1); 1100 check!1(m1); 1101 fillIdx!2(m1); 1102 check!2(m1); 1103 fillIdx!3(m1); 1104 check!3(m1); 1105 fillIdx!0(m1); 1106 check!0(m1); 1107 check!3(m1); 1108 check!2(m1); 1109 check!1(m1); 1110 for (size_t i=0; i < 2^^16; i++) 1111 { 1112 m1.slice!(4)[i] = i % 2; 1113 m1.slice!(0)[fn1(i)] = fn1(i); 1114 m1.slice!(1)[fn2(i)] = fn2(i); 1115 m1.slice!(2)[fn3(i)] = fn3(i); 1116 m1.slice!(3)[fn4(i)] = fn4(i); 1117 } 1118 for (size_t i=0; i < 2^^16; i++) 1119 { 1120 assert(m1.slice!(4)[i] == i % 2); 1121 assert(m1.slice!(0)[fn1(i)] == fn1(i)); 1122 assert(m1.slice!(1)[fn2(i)] == fn2(i)); 1123 assert(m1.slice!(2)[fn3(i)] == fn3(i)); 1124 assert(m1.slice!(3)[fn4(i)] == fn4(i)); 1125 } 1126 } 1127 1128 size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc 1129 { 1130 import std.math.algebraic : nextPow2; 1131 enum bits = _bits == 1 ? 
1 : nextPow2(_bits - 1);// see PackedArrayView 1132 static if (bits > 8*size_t.sizeof) 1133 { 1134 static assert(bits % (size_t.sizeof*8) == 0); 1135 return new_len * bits/(8*size_t.sizeof); 1136 } 1137 else 1138 { 1139 enum factor = size_t.sizeof*8/bits; 1140 return (new_len+factor-1)/factor; // rounded up 1141 } 1142 } 1143 1144 template isBitPackableType(T) 1145 { 1146 enum isBitPackableType = isBitPacked!T 1147 || isIntegral!T || is(T == bool) || isSomeChar!T; 1148 } 1149 1150 //============================================================================ 1151 template PackedArrayView(T) 1152 if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) 1153 && isBitPackableType!U) || isBitPackableType!T) 1154 { 1155 import std.math.algebraic : nextPow2; 1156 private enum bits = bitSizeOf!T; 1157 alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1); 1158 } 1159 1160 //unsafe and fast access to a chunk of RAM as if it contains packed values 1161 template PackedPtr(T) 1162 if ((is(T dummy == BitPacked!(U, sz), U, size_t sz) 1163 && isBitPackableType!U) || isBitPackableType!T) 1164 { 1165 import std.math.algebraic : nextPow2; 1166 private enum bits = bitSizeOf!T; 1167 alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? 
nextPow2(bits - 1) : 1); 1168 } 1169 1170 struct PackedPtrImpl(T, size_t bits) 1171 { 1172 pure nothrow: 1173 static assert(isPow2OrZero(bits)); 1174 1175 this(inout(size_t)* ptr)inout @safe @nogc 1176 { 1177 origin = ptr; 1178 } 1179 1180 private T simpleIndex(size_t n) inout 1181 { 1182 immutable q = n / factor; 1183 immutable r = n % factor; 1184 return cast(T)((origin[q] >> bits*r) & mask); 1185 } 1186 1187 private void simpleWrite(TypeOfBitPacked!T val, size_t n) 1188 in 1189 { 1190 static if (isIntegral!T) 1191 assert(val <= mask); 1192 } 1193 do 1194 { 1195 immutable q = n / factor; 1196 immutable r = n % factor; 1197 immutable tgt_shift = bits*r; 1198 immutable word = origin[q]; 1199 origin[q] = (word & ~(mask << tgt_shift)) 1200 | (cast(size_t) val << tgt_shift); 1201 } 1202 1203 static if (factor == bytesPerWord// can safely pack by byte 1204 || factor == 1 // a whole word at a time 1205 || ((factor == bytesPerWord/2 || factor == bytesPerWord/4) 1206 && hasUnalignedReads)) // this needs unaligned reads 1207 { 1208 static if (factor == bytesPerWord) 1209 alias U = ubyte; 1210 else static if (factor == bytesPerWord/2) 1211 alias U = ushort; 1212 else static if (factor == bytesPerWord/4) 1213 alias U = uint; 1214 else static if (size_t.sizeof == 8 && factor == bytesPerWord/8) 1215 alias U = ulong; 1216 1217 T opIndex(size_t idx) inout 1218 { 1219 T ret; 1220 version (LittleEndian) 1221 ret = __ctfe ? 
simpleIndex(idx) : 1222 cast(inout(T))(cast(U*) origin)[idx]; 1223 else 1224 ret = simpleIndex(idx); 1225 return ret; 1226 } 1227 1228 static if (isBitPacked!T) // lack of user-defined implicit conversion 1229 { 1230 void opIndexAssign(T val, size_t idx) 1231 { 1232 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1233 } 1234 } 1235 1236 void opIndexAssign(TypeOfBitPacked!T val, size_t idx) 1237 { 1238 version (LittleEndian) 1239 { 1240 if (__ctfe) 1241 simpleWrite(val, idx); 1242 else 1243 (cast(U*) origin)[idx] = cast(U) val; 1244 } 1245 else 1246 simpleWrite(val, idx); 1247 } 1248 } 1249 else 1250 { 1251 T opIndex(size_t n) inout 1252 { 1253 return simpleIndex(n); 1254 } 1255 1256 static if (isBitPacked!T) // lack of user-defined implicit conversion 1257 { 1258 void opIndexAssign(T val, size_t idx) 1259 { 1260 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1261 } 1262 } 1263 1264 void opIndexAssign(TypeOfBitPacked!T val, size_t n) 1265 { 1266 return simpleWrite(val, n); 1267 } 1268 } 1269 1270 private: 1271 // factor - number of elements in one machine word 1272 enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1; 1273 enum bytesPerWord = size_t.sizeof; 1274 size_t* origin; 1275 } 1276 1277 // data is packed only by power of two sized packs per word, 1278 // thus avoiding mul/div overhead at the cost of ultimate packing 1279 // this construct doesn't own memory, only provides access, see MultiArray for usage 1280 struct PackedArrayViewImpl(T, size_t bits) 1281 { 1282 pure nothrow: 1283 1284 this(inout(size_t)* origin, size_t offset, size_t items) inout @safe 1285 { 1286 ptr = inout(PackedPtr!(T))(origin); 1287 ofs = offset; 1288 limit = items; 1289 } 1290 1291 bool zeros(size_t s, size_t e) 1292 in 1293 { 1294 assert(s <= e); 1295 } 1296 do 1297 { 1298 s += ofs; 1299 e += ofs; 1300 immutable pad_s = roundUp(s); 1301 if ( s >= e) 1302 { 1303 foreach (i; s .. 
e) 1304 if (ptr[i]) 1305 return false; 1306 return true; 1307 } 1308 immutable pad_e = roundDown(e); 1309 size_t i; 1310 for (i=s; i<pad_s; i++) 1311 if (ptr[i]) 1312 return false; 1313 // all in between is x*factor elements 1314 for (size_t j=i/factor; i<pad_e; i+=factor, j++) 1315 if (ptr.origin[j]) 1316 return false; 1317 for (; i<e; i++) 1318 if (ptr[i]) 1319 return false; 1320 return true; 1321 } 1322 1323 T opIndex(size_t idx) inout 1324 in 1325 { 1326 assert(idx < limit); 1327 } 1328 do 1329 { 1330 return ptr[ofs + idx]; 1331 } 1332 1333 static if (isBitPacked!T) // lack of user-defined implicit conversion 1334 { 1335 void opIndexAssign(T val, size_t idx) 1336 { 1337 return opIndexAssign(cast(TypeOfBitPacked!T) val, idx); 1338 } 1339 } 1340 1341 void opIndexAssign(TypeOfBitPacked!T val, size_t idx) 1342 in 1343 { 1344 assert(idx < limit); 1345 } 1346 do 1347 { 1348 ptr[ofs + idx] = val; 1349 } 1350 1351 static if (isBitPacked!T) // lack of user-defined implicit conversions 1352 { 1353 void opSliceAssign(T val, size_t start, size_t end) 1354 { 1355 opSliceAssign(cast(TypeOfBitPacked!T) val, start, end); 1356 } 1357 } 1358 1359 void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end) 1360 in 1361 { 1362 assert(start <= end); 1363 assert(end <= limit); 1364 } 1365 do 1366 { 1367 // account for ofsetted view 1368 start += ofs; 1369 end += ofs; 1370 // rounded to factor granularity 1371 immutable pad_start = roundUp(start);// rounded up 1372 if (pad_start >= end) //rounded up >= then end of slice 1373 { 1374 //nothing to gain, use per element assignment 1375 foreach (i; start .. 
end) 1376 ptr[i] = val; 1377 return; 1378 } 1379 immutable pad_end = roundDown(end); // rounded down 1380 size_t i; 1381 for (i=start; i<pad_start; i++) 1382 ptr[i] = val; 1383 // all in between is x*factor elements 1384 if (pad_start != pad_end) 1385 { 1386 immutable repval = replicateBits!(factor, bits)(val); 1387 for (size_t j=i/factor; i<pad_end; i+=factor, j++) 1388 ptr.origin[j] = repval;// so speed it up by factor 1389 } 1390 for (; i<end; i++) 1391 ptr[i] = val; 1392 } 1393 1394 auto opSlice(size_t from, size_t to)inout 1395 in 1396 { 1397 assert(from <= to); 1398 assert(ofs + to <= limit); 1399 } 1400 do 1401 { 1402 return typeof(this)(ptr.origin, ofs + from, to - from); 1403 } 1404 1405 auto opSlice(){ return opSlice(0, length); } 1406 1407 bool opEquals(T)(auto ref T arr) const 1408 { 1409 if (limit != arr.limit) 1410 return false; 1411 size_t s1 = ofs, s2 = arr.ofs; 1412 size_t e1 = s1 + limit, e2 = s2 + limit; 1413 if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0) 1414 { 1415 return ptr.origin[s1/factor .. e1/factor] 1416 == arr.ptr.origin[s2/factor .. e2/factor]; 1417 } 1418 for (size_t i=0;i<limit; i++) 1419 if (this[i] != arr[i]) 1420 return false; 1421 return true; 1422 } 1423 1424 @property size_t length()const{ return limit; } 1425 1426 private: 1427 auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; } 1428 auto roundDown()(size_t val){ return val/factor*factor; } 1429 // factor - number of elements in one machine word 1430 enum factor = size_t.sizeof*8/bits; 1431 PackedPtr!(T) ptr; 1432 size_t ofs, limit; 1433 } 1434 1435 1436 private struct SliceOverIndexed(T) 1437 { 1438 enum assignableIndex = is(typeof((){ T.init[0] = Item.init; })); 1439 enum assignableSlice = is(typeof((){ T.init[0 .. 
0] = Item.init; })); 1440 auto opIndex(size_t idx)const 1441 in 1442 { 1443 assert(idx < to - from); 1444 } 1445 do 1446 { 1447 return (*arr)[from+idx]; 1448 } 1449 1450 static if (assignableIndex) 1451 void opIndexAssign(Item val, size_t idx) 1452 in 1453 { 1454 assert(idx < to - from); 1455 } 1456 do 1457 { 1458 (*arr)[from+idx] = val; 1459 } 1460 1461 auto opSlice(size_t a, size_t b) 1462 { 1463 return typeof(this)(from+a, from+b, arr); 1464 } 1465 1466 // static if (assignableSlice) 1467 void opSliceAssign(T)(T val, size_t start, size_t end) 1468 { 1469 (*arr)[start+from .. end+from] = val; 1470 } 1471 1472 auto opSlice() 1473 { 1474 return typeof(this)(from, to, arr); 1475 } 1476 1477 @property size_t length()const { return to-from;} 1478 1479 alias opDollar = length; 1480 1481 @property bool empty()const { return from == to; } 1482 1483 @property auto front()const { return (*arr)[from]; } 1484 1485 static if (assignableIndex) 1486 @property void front(Item val) { (*arr)[from] = val; } 1487 1488 @property auto back()const { return (*arr)[to-1]; } 1489 1490 static if (assignableIndex) 1491 @property void back(Item val) { (*arr)[to-1] = val; } 1492 1493 @property auto save() inout { return this; } 1494 1495 void popFront() { from++; } 1496 1497 void popBack() { to--; } 1498 1499 bool opEquals(T)(auto ref T arr) const 1500 { 1501 if (arr.length != length) 1502 return false; 1503 for (size_t i=0; i <length; i++) 1504 if (this[i] != arr[i]) 1505 return false; 1506 return true; 1507 } 1508 private: 1509 alias Item = typeof(T.init[0]); 1510 size_t from, to; 1511 T* arr; 1512 } 1513 1514 @safe pure nothrow @nogc unittest 1515 { 1516 static assert(isRandomAccessRange!(SliceOverIndexed!(int[]))); 1517 } 1518 1519 SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x) 1520 if (is(Unqual!T == T)) 1521 { 1522 return SliceOverIndexed!(const(T))(a, b, x); 1523 } 1524 1525 // BUG? 
inout is out of reach 1526 //...SliceOverIndexed.arr only parameters or stack based variables can be inout 1527 SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x) 1528 if (is(Unqual!T == T)) 1529 { 1530 return SliceOverIndexed!T(a, b, x); 1531 } 1532 1533 @system unittest 1534 { 1535 int[] idxArray = [2, 3, 5, 8, 13]; 1536 auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray); 1537 1538 assert(!sliced.empty); 1539 assert(sliced.front == 2); 1540 sliced.front = 1; 1541 assert(sliced.front == 1); 1542 assert(sliced.back == 13); 1543 sliced.popFront(); 1544 assert(sliced.front == 3); 1545 assert(sliced.back == 13); 1546 sliced.back = 11; 1547 assert(sliced.back == 11); 1548 sliced.popBack(); 1549 1550 assert(sliced.front == 3); 1551 assert(sliced[$-1] == 8); 1552 sliced = sliced[]; 1553 assert(sliced[0] == 3); 1554 assert(sliced.back == 8); 1555 sliced = sliced[1..$]; 1556 assert(sliced.front == 5); 1557 sliced = sliced[0..$-1]; 1558 assert(sliced[$-1] == 5); 1559 1560 int[] other = [2, 5]; 1561 assert(sliced[] == sliceOverIndexed(1, 2, &other)); 1562 sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1; 1563 assert(idxArray[0 .. 
2] == [-1, -1]);
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &idxArray);
    assert(nullSlice.empty);
}

private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    return inout(PackedArrayView!T)(ptr, 0, items);
}


//============================================================================
// Partially unrolled binary search using Shar's method
//============================================================================

// Generates the source of a `switch` that finishes a uniform binary search
// without a loop. The mixin site must provide `m` (current step), `idx`,
// `range`, `needle` and `pred`; `size` is the smallest step count that is
// still handled by the caller's loop.
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // A one-element range arrives with m == 0. bsr(0) is undefined, so that
    // case is mapped explicitly to the final "case 0" step.
    string code = `
    import core.bitop : bsr;
    auto power = m == 0 ? 0 : bsr(m)+1;
    switch (power){`;
    size_t i = bsr(size);
    foreach_reverse (val; 0 .. bsr(size))
    {
        auto v = 2^^val;
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}

// True for 0 and for every power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    return (sz & (sz-1)) == 0;
}

// Branch-uniform lower bound over a range whose length is a power of two:
// halves the step `m` each round, then does one final comparison.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0, m = range.length/2;
    while (m != 0)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    if (pred(range[idx], needle))
        idx += 1;
    return idx;
}

size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
1640 size_t idx = 0, m = range.length/2; 1641 enum max = 1 << 10; 1642 while (m >= max) 1643 { 1644 if (pred(range[idx+m], needle)) 1645 idx += m; 1646 m /= 2; 1647 } 1648 mixin(genUnrolledSwitchSearch(max)); 1649 return idx; 1650 } 1651 1652 template sharMethod(alias uniLowerBound) 1653 { 1654 size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle) 1655 if (is(T : ElementType!Range)) 1656 { 1657 import std.functional : binaryFun; 1658 import std.math.algebraic : nextPow2, truncPow2; 1659 alias pred = binaryFun!_pred; 1660 if (range.length == 0) 1661 return 0; 1662 if (isPow2OrZero(range.length)) 1663 return uniLowerBound!pred(range, needle); 1664 size_t n = truncPow2(range.length); 1665 if (pred(range[n-1], needle)) 1666 {// search in another 2^^k area that fully covers the tail of range 1667 size_t k = nextPow2(range.length - n + 1); 1668 return range.length - k + uniLowerBound!pred(range[$-k..$], needle); 1669 } 1670 else 1671 return uniLowerBound!pred(range[0 .. n], needle); 1672 } 1673 } 1674 1675 alias sharLowerBound = sharMethod!uniformLowerBound; 1676 alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound; 1677 1678 @safe unittest 1679 { 1680 import std.array : array; 1681 import std.range : assumeSorted, iota; 1682 1683 auto stdLowerBound(T)(T[] range, T needle) 1684 { 1685 return assumeSorted(range).lowerBound(needle).length; 1686 } 1687 immutable MAX = 5*1173; 1688 auto arr = array(iota(5, MAX, 5)); 1689 assert(arr.length == MAX/5-1); 1690 foreach (i; 0 .. MAX+5) 1691 { 1692 auto st = stdLowerBound(arr, i); 1693 assert(st == sharLowerBound(arr, i)); 1694 assert(st == sharSwitchLowerBound(arr, i)); 1695 } 1696 arr = []; 1697 auto st = stdLowerBound(arr, 33); 1698 assert(st == sharLowerBound(arr, 33)); 1699 assert(st == sharSwitchLowerBound(arr, 33)); 1700 } 1701 //============================================================================ 1702 1703 @safe 1704 { 1705 // hope to see simillar stuff in public interface... 
once Allocators are out 1706 //@@@BUG moveFront and friends? dunno, for now it's POD-only 1707 1708 @trusted size_t genericReplace(Policy=void, T, Range) 1709 (ref T dest, size_t from, size_t to, Range stuff) 1710 { 1711 import std.algorithm.mutation : copy; 1712 size_t delta = to - from; 1713 size_t stuff_end = from+stuff.length; 1714 if (stuff.length > delta) 1715 {// replace increases length 1716 delta = stuff.length - delta;// now, new is > old by delta 1717 static if (is(Policy == void)) 1718 dest.length = dest.length+delta;//@@@BUG lame @property 1719 else 1720 dest = Policy.realloc(dest, dest.length+delta); 1721 copyBackwards(dest[to .. dest.length-delta], 1722 dest[to+delta .. dest.length]); 1723 copyForward(stuff, dest[from .. stuff_end]); 1724 } 1725 else if (stuff.length == delta) 1726 { 1727 copy(stuff, dest[from .. to]); 1728 } 1729 else 1730 {// replace decreases length by delta 1731 delta = delta - stuff.length; 1732 copy(stuff, dest[from .. stuff_end]); 1733 copyForward(dest[to .. dest.length], 1734 dest[stuff_end .. 
dest.length-delta]); 1735 static if (is(Policy == void)) 1736 dest.length = dest.length - delta;//@@@BUG lame @property 1737 else 1738 dest = Policy.realloc(dest, dest.length-delta); 1739 } 1740 return stuff_end; 1741 } 1742 1743 1744 // Simple storage manipulation policy 1745 @safe private struct GcPolicy 1746 { 1747 import std.traits : isDynamicArray; 1748 1749 static T[] dup(T)(const T[] arr) 1750 { 1751 return arr.dup; 1752 } 1753 1754 static T[] alloc(T)(size_t size) 1755 { 1756 return new T[size]; 1757 } 1758 1759 static T[] realloc(T)(T[] arr, size_t sz) 1760 { 1761 arr.length = sz; 1762 return arr; 1763 } 1764 1765 static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) 1766 { 1767 replaceInPlace(dest, from, to, stuff); 1768 } 1769 1770 static void append(T, V)(ref T[] arr, V value) 1771 if (!isInputRange!V) 1772 { 1773 arr ~= force!T(value); 1774 } 1775 1776 static void append(T, V)(ref T[] arr, V value) 1777 if (isInputRange!V) 1778 { 1779 insertInPlace(arr, arr.length, value); 1780 } 1781 1782 static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000 1783 if (isDynamicArray!T && is(Unqual!T == T)) 1784 { 1785 debug 1786 { 1787 arr[] = cast(typeof(T.init[0]))(0xdead_beef); 1788 } 1789 arr = null; 1790 } 1791 1792 static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000 1793 if (isDynamicArray!T && !is(Unqual!T == T)) 1794 { 1795 arr = null; 1796 } 1797 } 1798 1799 // ditto 1800 @safe struct ReallocPolicy 1801 { 1802 import std.range.primitives : hasLength; 1803 1804 static T[] dup(T)(const T[] arr) 1805 { 1806 auto result = alloc!T(arr.length); 1807 result[] = arr[]; 1808 return result; 1809 } 1810 1811 static T[] alloc(T)(size_t size) @trusted 1812 { 1813 import std.internal.memory : enforceMalloc; 1814 1815 import core.checkedint : mulu; 1816 bool overflow; 1817 size_t nbytes = mulu(size, T.sizeof, overflow); 1818 if (overflow) assert(0); 1819 1820 auto ptr = 
cast(T*) enforceMalloc(nbytes); 1821 return ptr[0 .. size]; 1822 } 1823 1824 static T[] realloc(T)(return scope T[] arr, size_t size) @trusted 1825 { 1826 import std.internal.memory : enforceRealloc; 1827 if (!size) 1828 { 1829 destroy(arr); 1830 return null; 1831 } 1832 1833 import core.checkedint : mulu; 1834 bool overflow; 1835 size_t nbytes = mulu(size, T.sizeof, overflow); 1836 if (overflow) assert(0); 1837 1838 auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes); 1839 return ptr[0 .. size]; 1840 } 1841 1842 static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) 1843 { 1844 genericReplace!(ReallocPolicy)(dest, from, to, stuff); 1845 } 1846 1847 static void append(T, V)(ref T[] arr, V value) 1848 if (!isInputRange!V) 1849 { 1850 if (arr.length == size_t.max) assert(0); 1851 arr = realloc(arr, arr.length+1); 1852 arr[$-1] = force!T(value); 1853 } 1854 1855 pure @safe unittest 1856 { 1857 int[] arr; 1858 ReallocPolicy.append(arr, 3); 1859 1860 import std.algorithm.comparison : equal; 1861 assert(equal(arr, [3])); 1862 } 1863 1864 static void append(T, V)(ref T[] arr, V value) 1865 if (isInputRange!V && hasLength!V) 1866 { 1867 import core.checkedint : addu; 1868 bool overflow; 1869 size_t nelems = addu(arr.length, value.length, overflow); 1870 if (overflow) assert(0); 1871 1872 arr = realloc(arr, nelems); 1873 1874 import std.algorithm.mutation : copy; 1875 copy(value, arr[$-value.length..$]); 1876 } 1877 1878 pure @safe unittest 1879 { 1880 int[] arr; 1881 ReallocPolicy.append(arr, [1,2,3]); 1882 1883 import std.algorithm.comparison : equal; 1884 assert(equal(arr, [1,2,3])); 1885 } 1886 1887 static void destroy(T)(scope ref T[] arr) @trusted 1888 { 1889 import core.memory : pureFree; 1890 if (arr.ptr) 1891 pureFree(arr.ptr); 1892 arr = null; 1893 } 1894 } 1895 1896 //build hack 1897 alias _RealArray = CowArray!ReallocPolicy; 1898 1899 pure @safe unittest 1900 { 1901 import std.algorithm.comparison : equal; 1902 1903 
    with(ReallocPolicy)
    {
        // Applies replaceImpl to `orig` and reports whether the outcome equals
        // `result`; `orig` is destroyed on leaving the inner scope.
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
            string file = __FILE__, size_t line = __LINE__)
        {
            {
                replaceImpl(orig, from, to, toReplace);
                scope(exit) destroy(orig);
                if (!equal(orig, result))
                    return false;
            }
            return true;
        }
        // Builds a policy-owned copy of the given values.
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}

/**
    Tests if T is some kind of a set of code points. Intended for template constraints.
*/
public template isCodepointSet(T)
{
    static if (is(T dummy == InversionList!(Args), Args...))
        enum isCodepointSet = true;
    else
        enum isCodepointSet = false;
}

/**
    Tests if `T` is a pair of integers that implicitly convert to `V`.
    The following code must compile for any pair `T`:
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    The following must not compile:
    ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];}))
        && !is(typeof((T x){ V c = x[2]; }));
}


/**
    The recommended default type for set of $(CODEPOINTS).
    For details, see the current implementation: $(LREF InversionList).
*/
public alias CodepointSet = InversionList!GcPolicy;


//@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
// which relies on std.uni.isGraphical and this chain blows up with Forward reference error
// hence below doesn't seem to work
// public alias CodepointInterval = Tuple!(uint, "a", uint, "b");

/**
    The recommended type of $(REF Tuple, std,_typecons)
    to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
    Any interval type should pass $(LREF isIntegralPair) trait.
*/
public struct CodepointInterval
{
pure:
    // Storage: _tuple[0] is the lower bound `a`, _tuple[1] the upper bound `b`.
    // `alias this` makes the interval indexable like a two-element array.
    uint[2] _tuple;
    alias _tuple this;

@safe pure nothrow @nogc:

    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }
    // Element-wise comparison against anything indexable with [0] and [1].
    bool opEquals(T)(T val) const
    {
        return this[0] == val[0] && this[1] == val[1];
    }
    @property ref inout(uint) a() return inout { return _tuple[0]; }
    @property ref inout(uint) b() return inout { return _tuple[1]; }
}

/**
    $(P
    `InversionList` is a set of $(CODEPOINTS)
    represented as an array of open-right [a, b$(RPAREN)
    intervals (see $(LREF CodepointInterval) above).
    The name comes from the way the representation reads left to right.
    For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
    plus a singular value 60 looks like this:
    )
    ---
    10, 50, 60, 61, 80, 90
    ---
    $(P
    The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
    the contrary). Then switch positive/negative after each
    number passed from left to right.
    )
    $(P This way negative spans until 10, then positive until 50,
    then negative until 60, then positive until 61, and so on.
2016 As seen this provides a space-efficient storage of highly redundant data 2017 that comes in long runs. A description which Unicode $(CHARACTER) 2018 properties fit nicely. The technique itself could be seen as a variation 2019 on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding). 2020 ) 2021 2022 $(P Sets are value types (just like `int` is) thus they 2023 are never aliased. 2024 ) 2025 Example: 2026 --- 2027 auto a = CodepointSet('a', 'z'+1); 2028 auto b = CodepointSet('A', 'Z'+1); 2029 auto c = a; 2030 a = a | b; 2031 assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1)); 2032 assert(a != c); 2033 --- 2034 $(P See also $(LREF unicode) for simpler construction of sets 2035 from predefined ones. 2036 ) 2037 2038 $(P Memory usage is 8 bytes per each contiguous interval in a set. 2039 The value semantics are achieved by using the 2040 $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique 2041 and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared). 2042 ) 2043 2044 Note: 2045 $(P It's not recommended to rely on the template parameters 2046 or the exact type of a current $(CODEPOINT) set in `std.uni`. 2047 The type and parameters may change when the standard 2048 allocators design is finalized. 2049 Use $(LREF isCodepointSet) with templates or just stick with the default 2050 alias $(LREF CodepointSet) throughout the whole code base. 2051 ) 2052 */ 2053 public struct InversionList(SP=GcPolicy) 2054 { 2055 import std.range : assumeSorted; 2056 2057 /** 2058 Construct from another code point set of any type. 2059 */ 2060 this(Set)(Set set) pure 2061 if (isCodepointSet!Set) 2062 { 2063 uint[] arr; 2064 foreach (v; set.byInterval) 2065 { 2066 arr ~= v.a; 2067 arr ~= v.b; 2068 } 2069 data = CowArray!(SP).reuse(arr); 2070 } 2071 2072 /** 2073 Construct a set from a forward range of code point intervals. 
2074 */ 2075 this(Range)(Range intervals) pure 2076 if (isForwardRange!Range && isIntegralPair!(ElementType!Range)) 2077 { 2078 uint[] arr; 2079 foreach (v; intervals) 2080 { 2081 SP.append(arr, v.a); 2082 SP.append(arr, v.b); 2083 } 2084 data = CowArray!(SP).reuse(arr); 2085 sanitize(); //enforce invariant: sort intervals etc. 2086 } 2087 2088 //helper function that avoids sanity check to be CTFE-friendly 2089 private static fromIntervals(Range)(Range intervals) pure 2090 { 2091 import std.algorithm.iteration : map; 2092 import std.range : roundRobin; 2093 auto flattened = roundRobin(intervals.save.map!"a[0]"(), 2094 intervals.save.map!"a[1]"()); 2095 InversionList set; 2096 set.data = CowArray!(SP)(flattened); 2097 return set; 2098 } 2099 //ditto untill sort is CTFE-able 2100 private static fromIntervals()(uint[] intervals...) pure 2101 in 2102 { 2103 import std.conv : text; 2104 assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); 2105 for (uint i = 0; i < intervals.length; i += 2) 2106 { 2107 auto a = intervals[i], b = intervals[i+1]; 2108 assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); 2109 } 2110 } 2111 do 2112 { 2113 InversionList set; 2114 set.data = CowArray!(SP)(intervals); 2115 return set; 2116 } 2117 2118 /** 2119 Construct a set from plain values of code point intervals. 2120 */ 2121 this()(uint[] intervals...) 2122 in 2123 { 2124 import std.conv : text; 2125 assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!"); 2126 for (uint i = 0; i < intervals.length; i += 2) 2127 { 2128 auto a = intervals[i], b = intervals[i+1]; 2129 assert(a < b, text("illegal interval [a, b): ", a, " > ", b)); 2130 } 2131 } 2132 do 2133 { 2134 data = CowArray!(SP)(intervals); 2135 sanitize(); //enforce invariant: sort intervals etc. 
    }

    ///
    pure @safe unittest
    {
        import std.algorithm.comparison : equal;

        auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
        foreach (v; 'a'..'z'+1)
            assert(set[v]);
        // Cyrillic lowercase interval
        foreach (v; 'а'..'я'+1)
            assert(set[v]);
        //specific order is not required, intervals may intersect
        auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
        //the same end result
        assert(set2.byInterval.equal(set.byInterval));
        // test constructor this(Range)(Range intervals)
        auto chessPiecesWhite = CodepointInterval(9812, 9818);
        auto chessPiecesBlack = CodepointInterval(9818, 9824);
        auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
        foreach (v; '♔'..'♟'+1)
            assert(set3[v]);
    }

    /**
        Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
    */
    @property auto byInterval() scope
    {
        // TODO: change this to data[] once the -dip1000 errors have been fixed
        // see e.g. https://github.com/dlang/phobos/pull/6638
        import std.array : array;
        return Intervals!(typeof(data.array))(data.array);
    }

    @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.typecons : tuple;

        auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);

        assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
    }

    // All intervals of this set, materialized into an array.
    package(std) @property const(CodepointInterval)[] intervals() const
    {
        import std.array : array;
        return Intervals!(typeof(data[]))(data[]).array;
    }

    /**
        Tests the presence of code point `val` in this set.
    */
    bool opIndex(uint val) const
    {
        // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1
        // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
        // An odd count of bounds <= val means val lies inside an interval.
        return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
    }

    ///
    pure @safe unittest
    {
        auto gothic = unicode.Gothic;
        // Gothic letter ahsa
        assert(gothic['\U00010330']);
        // no ascii in Gothic obviously
        assert(!gothic['$']);
    }


    // Linear scan for `ch`. Useful only for small sets.
    // TODO:
    // used internally in std.regex
    // should be properly exposed in a public API ?
    // Returns the parity (index & 1) of the first bound greater than `ch`,
    // or 0 when there is no such bound.
    package(std) auto scanFor()(dchar ch) const
    {
        immutable len = data.length;
        for (size_t i = 0; i < len; i++)
            if (ch < data[i])
                return i & 1;
        return 0;
    }

    /// Number of $(CODEPOINTS) in this set
    @property size_t length()
    {
        size_t sum = 0;
        foreach (iv; byInterval)
        {
            sum += iv.b - iv.a;
        }
        return sum;
    }

// bootstrap full set operations from 4 primitives (suitable as a template mixin):
// addInterval, skipUpTo, dropUpTo & byInterval iteration
//============================================================================
public:
    /**
        $(P Sets support natural syntax for set algebra, namely: )
        $(BOOKTABLE ,
            $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
            $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
            $(TR $(TD |) $(TD a ∪ b) $(TD union) )
            $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
            $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
        )
    */
    This opBinary(string op, U)(U rhs)
    if (isCodepointSet!U || is(U:dchar))
    {
        static if (op == "&" || op == "|" || op == "~")
        {// symmetric ops thus can swap arguments to reuse r-value
            static if (is(U:dchar))
            {
                auto tmp = this;
                mixin("tmp "~op~"= rhs; ");
                return tmp;
            }
            else
            {
                static if (is(Unqual!U == U))
                {
                    // try hard to reuse r-value
                    mixin("rhs "~op~"= this;");
                    return rhs;
                }
                else
                {
                    auto tmp = this;
                    mixin("tmp "~op~"= rhs;");
                    return tmp;
                }
            }
        }
        else static if (op == "-") // anti-symmetric
        {
            auto tmp = this;
            tmp -= rhs;
            return tmp;
        }
        else
            static assert(0, "no operator "~op~" defined for Set");
    }

    ///
    pure @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.range : iota;

        auto lower = unicode.LowerCase;
        auto upper = unicode.UpperCase;
        auto ascii = unicode.ASCII;

        assert((lower & upper).empty); // no intersection
        auto lowerASCII = lower & ascii;
        assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
        // throw away all of the lowercase ASCII
        assert((ascii - lower).length == 128 - 26);

        auto onlyOneOf = lower ~ ascii;
        assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
        assert(onlyOneOf['$']); // ASCII and not lowercase
        assert(!onlyOneOf['a']); // ASCII and lowercase
        assert(onlyOneOf['я']); // not ASCII but lowercase

        // throw away all cased letters from ASCII
        auto noLetters = ascii - (lower | upper);
        assert(noLetters.length == 128 - 26*2);
    }

    /// The 'op=' versions of the above overloaded operators.
    ref This opOpAssign(string op, U)(U rhs)
    if (isCodepointSet!U || is(U:dchar))
    {
        static if (op == "|") // union
        {
            static if (is(U:dchar))
            {
                // a single code point is the interval [rhs, rhs+1)
                this.addInterval(rhs, rhs+1);
                return this;
            }
            else
                return this.add(rhs);
        }
        else static if (op == "&") // intersection
            return this.intersect(rhs);// overloaded
        else static if (op == "-") // set difference
            return this.sub(rhs);// overloaded
        else static if (op == "~") // symmetric set difference
        {
            // (this | rhs) - (this & rhs)
            auto copy = this & rhs;
            this |= rhs;
            this -= copy;
            return this;
        }
        else
            static assert(0, "no operator "~op~" defined for Set");
    }

    /**
        Tests the presence of codepoint `ch` in this set,
        the same as $(LREF opIndex).
    */
    bool opBinaryRight(string op: "in", U)(U ch) const
    if (is(U : dchar))
    {
        return this[ch];
    }

    ///
    pure @safe unittest
    {
        assert('я' in unicode.Cyrillic);
        assert(!('z' in unicode.Cyrillic));
    }



    /**
     * Obtains a set that is the inversion of this set.
     *
     * See_Also: $(LREF inverted)
     */
    auto opUnary(string op: "!")()
    {
        return this.inverted;
    }

    /**
        A range that spans each $(CODEPOINT) in this set.
2371 */ 2372 @property auto byCodepoint() 2373 { 2374 static struct CodepointRange 2375 { 2376 this(This set) 2377 { 2378 r = set.byInterval; 2379 if (!r.empty) 2380 cur = r.front.a; 2381 } 2382 2383 @property dchar front() const 2384 { 2385 return cast(dchar) cur; 2386 } 2387 2388 @property bool empty() const 2389 { 2390 return r.empty; 2391 } 2392 2393 void popFront() 2394 { 2395 cur++; 2396 while (cur >= r.front.b) 2397 { 2398 r.popFront(); 2399 if (r.empty) 2400 break; 2401 cur = r.front.a; 2402 } 2403 } 2404 private: 2405 uint cur; 2406 typeof(This.init.byInterval) r; 2407 } 2408 2409 return CodepointRange(this); 2410 } 2411 2412 /// 2413 pure @safe unittest 2414 { 2415 import std.algorithm.comparison : equal; 2416 import std.range : iota; 2417 2418 auto set = unicode.ASCII; 2419 set.byCodepoint.equal(iota(0, 0x80)); 2420 } 2421 2422 /** 2423 $(P Obtain textual representation of this set in from of 2424 open-right intervals and feed it to `sink`. 2425 ) 2426 $(P Used by various standard formatting facilities such as 2427 $(REF formattedWrite, std,format), $(REF write, std,stdio), 2428 $(REF writef, std,stdio), $(REF to, std,conv) and others. 2429 ) 2430 Example: 2431 --- 2432 import std.conv; 2433 assert(unicode.ASCII.to!string == "[0..128$(RPAREN)"); 2434 --- 2435 */ 2436 2437 private import std.format.spec : FormatSpec; 2438 2439 /*************************************** 2440 * Obtain a textual representation of this InversionList 2441 * in form of open-right intervals. 2442 * 2443 * The formatting flag is applied individually to each value, for example: 2444 * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals) 2445 * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters) 2446 * $(LI $(B %X) formats the intervals as a [low .. 
high$(RPAREN) range of uppercase hex characters) 2447 */ 2448 void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */ 2449 { 2450 import std.format.write : formatValue; 2451 auto range = byInterval; 2452 if (range.empty) 2453 return; 2454 2455 while (1) 2456 { 2457 auto i = range.front; 2458 range.popFront(); 2459 2460 put(sink, "["); 2461 formatValue(sink, i.a, fmt); 2462 put(sink, ".."); 2463 formatValue(sink, i.b, fmt); 2464 put(sink, ")"); 2465 if (range.empty) return; 2466 put(sink, " "); 2467 } 2468 } 2469 2470 /// 2471 pure @safe unittest 2472 { 2473 import std.conv : to; 2474 import std.format : format; 2475 import std.uni : unicode; 2476 2477 // This was originally using Cyrillic script. 2478 // Unfortunately this is a pretty active range for changes, 2479 // and hence broke in an update. 2480 // Therefore the range Basic latin was used instead as it 2481 // unlikely to ever change. 2482 2483 assert(unicode.InBasic_latin.to!string == "[0..128)"); 2484 2485 // The specs '%s' and '%d' are equivalent to the to!string call above. 2486 assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string); 2487 2488 assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)"); 2489 assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)"); 2490 } 2491 2492 pure @safe unittest 2493 { 2494 import std.exception : assertThrown; 2495 import std.format : format, FormatException; 2496 assertThrown!FormatException(format("%z", unicode.ASCII)); 2497 } 2498 2499 2500 /** 2501 Add an interval [a, b$(RPAREN) to this set. 
    */
    ref add()(uint a, uint b)
    {
        addInterval(a, b);
        return this;
    }

    ///
    pure @safe unittest
    {
        CodepointSet someSet;
        someSet.add('0', '5').add('A','Z'+1);
        someSet.add('5', '9'+1);
        assert(someSet['0']);
        assert(someSet['5']);
        assert(someSet['9']);
        assert(someSet['Z']);
    }

private:

    // In-place intersection with another set: for every interval [a, b) of
    // `rhs`, drop everything up to a and keep (skip over) everything up to b;
    // whatever remains after the last interval is dropped as well.
    package(std) // used from: std.regex.internal.parser
    ref intersect(U)(U rhs)
    if (isCodepointSet!U)
    {
        Marker mark;
        foreach ( i; rhs.byInterval)
        {
            mark = this.dropUpTo(i.a, mark);
            mark = this.skipUpTo(i.b, mark);
        }
        this.dropUpTo(uint.max, mark);
        return this;
    }

    // In-place intersection with a single code point: the result is either
    // the one-element set {ch} or the empty set.
    ref intersect()(dchar ch)
    {
        foreach (i; byInterval)
            if (i.a <= ch && ch < i.b)
                return this = This.init.add(ch, ch+1);
        this = This.init;
        return this;
    }

    pure @safe unittest
    {
        assert(unicode.Cyrillic.intersect('-').byInterval.empty);
    }

    // In-place removal of a single code point.
    ref sub()(dchar ch)
    {
        return subChar(ch);
    }

    // same as the above except that skip & drop parts are swapped
    package(std) // used from: std.regex.internal.parser
    ref sub(U)(U rhs)
    if (isCodepointSet!U)
    {
        Marker mark;
        foreach (i; rhs.byInterval)
        {
            mark = this.skipUpTo(i.a, mark);
            mark = this.dropUpTo(i.b, mark);
        }
        return this;
    }

    // In-place union: adds every interval of `rhs`, passing the marker
    // returned by one insertion as the search hint for the next.
    package(std) // used from: std.regex.internal.parser
    ref add(U)(U rhs)
    if (isCodepointSet!U)
    {
        Marker start;
        foreach (i; rhs.byInterval)
        {
            start = addInterval(i.a, i.b, start);
        }
        return this;
    }

// end of mixin-able part
//============================================================================
public:
    /**
        Obtains a set that is the inversion of this set.

        See the '!' $(LREF opUnary) for the same but using operators.
    */
    @property auto inverted()
    {
        InversionList inversion = this;
        // empty set: the inversion is the whole range [0, lastDchar+1)
        if (inversion.data.length == 0)
        {
            inversion.addInterval(0, lastDchar+1);
            return inversion;
        }
        // toggle the bound at 0: insert it if absent, remove it if present
        if (inversion.data[0] != 0)
            genericReplace(inversion.data, 0, 0, [0]);
        else
            genericReplace(inversion.data, 0, 1, cast(uint[]) null);
        // likewise toggle the bound at lastDchar+1 at the other end
        if (data[data.length-1] != lastDchar+1)
            genericReplace(inversion.data,
                inversion.data.length, inversion.data.length, [lastDchar+1]);
        else
            genericReplace(inversion.data,
                inversion.data.length-1, inversion.data.length, cast(uint[]) null);

        return inversion;
    }

    ///
    pure @safe unittest
    {
        auto set = unicode.ASCII;
        // union with the inverse gets all of the code points in the Unicode
        assert((set | set.inverted).length == 0x110000);
        // no intersection with the inverse
        assert((set & set.inverted).empty);
    }

    // Generates the source text of a `bool funcName(dchar ch)` membership
    // test for the given intervals, using the nested helpers below.
    package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
    {
        import std.algorithm.searching : countUntil;
        import std.format : format;
        enum maxBinary = 3;
        // Emits a straight-line sequence of comparisons, one group per interval.
        static string linearScope(R)(R ivals, string indent)
        {
            string result = indent~"{\n";
            string deeper = indent~" ";
            foreach (ival; ivals)
            {
                immutable span = ival[1] - ival[0];
                assert(span != 0);
                if (span == 1)
                {
                    result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
                }
                else if (span == 2)
                {
                    result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                        deeper, ival[0], ival[0]+1);
                }
                else
                {
                    if (ival[0] != 0) // dchar is unsigned and < 0 is useless
                        result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                    result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
                }
            }
            result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
            return result;
        }

        // Chooses between unrolled comparisons (fewer than maxBinary intervals)
        // and a further bisection at the middle interval.
        static string binaryScope(R)(R ivals, string indent) @safe
        {
            // time to do unrolled comparisons?
            if (ivals.length < maxBinary)
                return linearScope(ivals, indent);
            else
                return bisect(ivals, ivals.length/2, indent);
        }

        // not used yet: if/else binary search is far better with DMD as of 2.061
        // and GDC is doing fine job either way
        static string switchScope(R)(R ivals, string indent)
        {
            string result = indent~"switch (ch){\n";
            string deeper = indent~" ";
            foreach (ival; ivals)
            {
                if (ival[0]+1 == ival[1])
                {
                    result ~= format("%scase %s: return true;\n",
                        deeper, ival[0]);
                }
                else
                {
                    result ~= format("%scase %s: .. case %s: return true;\n",
                        deeper, ival[0], ival[1]-1);
                }
            }
            result ~= deeper~"default: return false;\n"~indent~"}\n";
            return result;
        }

        // Emits a three-way branch around the interval at `idx`: below it,
        // inside it (return true), and at or above its end.
        static string bisect(R)(R range, size_t idx, string indent)
        {
            string deeper = indent ~ " ";
            // bisect on one [a, b) interval at idx
            string result = indent~"{\n";
            // less branch, < a
            result ~= format("%sif (ch < %s)\n%s",
                deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
            // middle point, >= a && < b
            result ~= format("%selse if (ch < %s) return true;\n",
                deeper, range[idx][1]);
            // greater or equal branch, >= b
            result ~= format("%selse\n%s",
                deeper, binaryScope(range[idx+1..$], deeper));
            return result~indent~"}\n";
        }

        string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
            funcName.empty ? "function" : funcName);
        // special case first bisection to be on ASCII vs beyond
        auto tillAscii = countUntil!"a[0] > 0x80"(range);
        if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
            code ~= binaryScope(range, "");
        else
            code ~= bisect(range, tillAscii, "");
        return code;
    }

    /**
        Generates string with D source code of unary function with name of
        `funcName` taking a single `dchar` argument. If `funcName` is empty
        the code is adjusted to be a lambda function.

        The function generated tests if the $(CODEPOINT) passed
        belongs to this set or not. The result is to be used with string mixin.
        The intended usage area is aggressive optimization via meta programming
        in parser generators and the like.

        Note: Use with care for relatively small or regular sets. It
        could end up being slower than just using multi-staged tables.

        Example:
        ---
        import std.stdio;

        // construct set directly from [a, b$(RPAREN) intervals
        auto set = CodepointSet(10, 12, 45, 65, 100, 200);
        writeln(set);
        writeln(set.toSourceCode("func"));
        ---

        The above outputs something along the lines of:
        ---
        bool func(dchar ch) @safe pure nothrow @nogc
        {
            if (ch < 45)
            {
                if (ch == 10 || ch == 11) return true;
                return false;
            }
            else if (ch < 65) return true;
            else
            {
                if (ch < 100) return false;
                if (ch < 200) return true;
                return false;
            }
        }
        ---
    */
    string toSourceCode(string funcName="")
    {
        import std.array : array;
        auto range = byInterval.array();
        return toSourceCode(range, funcName);
    }

    /**
        True if this set doesn't contain any $(CODEPOINTS).
    */
    @property bool empty() const
    {
        return data.length == 0;
    }

    ///
    pure @safe unittest
    {
        CodepointSet emptySet;
        assert(emptySet.length == 0);
        assert(emptySet.empty);
    }

private:
    alias This = typeof(this);
    alias Marker = size_t;

    // a random-access range of integral pairs
    // `slice` holds the flat bound list a0, b0, a1, b1, ...; `start` and `end`
    // are element offsets into it that delimit the still-unconsumed part.
    static struct Intervals(Range)
    {
        import std.range.primitives : hasAssignableElements;

        this(Range sp) scope
        {
            slice = sp;
            start = 0;
            end = sp.length;
        }

        this(Range sp, size_t s, size_t e) scope
        {
            slice = sp;
            start = s;
            end = e;
        }

        @property auto front()const
        {
            immutable a = slice[start];
            immutable b = slice[start+1];
            return CodepointInterval(a, b);
        }

        //may break sorted property - but we need std.sort to access it
        //hence package(std) protection attribute
        static if (hasAssignableElements!Range)
        package(std) @property void front(CodepointInterval val)
        {
            slice[start] = val.a;
            slice[start+1] = val.b;
        }

        @property auto back()const
        {
            immutable a = slice[end-2];
            immutable b = slice[end-1];
            return CodepointInterval(a, b);
        }

        //ditto about package
        static if (hasAssignableElements!Range)
        package(std) @property void back(CodepointInterval val)
        {
            slice[end-2] = val.a;
            slice[end-1] = val.b;
        }

        void popFront()
        {
            start += 2;
        }

        void popBack()
        {
            end -= 2;
        }

        // idx counts intervals (pairs), relative to `start`
        auto opIndex(size_t idx) const
        {
            immutable a = slice[start+idx*2];
            immutable b = slice[start+idx*2+1];
            return CodepointInterval(a, b);
        }

        //ditto about package
        static if (hasAssignableElements!Range)
        package(std) void opIndexAssign(CodepointInterval val, size_t idx)
        {
            slice[start+idx*2] = val.a;
            slice[start+idx*2+1] = val.b;
2857 } 2858 2859 auto opSlice(size_t s, size_t e) 2860 { 2861 return Intervals(slice, s*2+start, e*2+start); 2862 } 2863 2864 @property size_t length()const { return slice.length/2; } 2865 2866 @property bool empty()const { return start == end; } 2867 2868 @property auto save(){ return this; } 2869 private: 2870 size_t start, end; 2871 Range slice; 2872 } 2873 2874 // called after construction from intervals 2875 // to make sure invariants hold 2876 void sanitize() 2877 { 2878 import std.algorithm.comparison : max; 2879 import std.algorithm.mutation : SwapStrategy; 2880 import std.algorithm.sorting : sort; 2881 if (data.length == 0) 2882 return; 2883 alias Ival = CodepointInterval; 2884 //intervals wrapper for a _range_ over packed array 2885 auto ivals = Intervals!(typeof(data[]))(data[]); 2886 //@@@BUG@@@ can't use "a.a < b.a" see 2887 // https://issues.dlang.org/show_bug.cgi?id=12265 2888 sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals); 2889 // what follows is a variation on stable remove 2890 // differences: 2891 // - predicate is binary, and is tested against 2892 // the last kept element (at 'i'). 
2893 // - predicate mutates lhs (merges rhs into lhs) 2894 size_t len = ivals.length; 2895 size_t i = 0; 2896 size_t j = 1; 2897 while (j < len) 2898 { 2899 if (ivals[i].b >= ivals[j].a) 2900 { 2901 ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b)); 2902 j++; 2903 } 2904 else //unmergable 2905 { 2906 // check if there is a hole after merges 2907 // (in the best case we do 0 writes to ivals) 2908 if (j != i+1) 2909 ivals[i+1] = ivals[j]; //copy over 2910 i++; 2911 j++; 2912 } 2913 } 2914 len = i + 1; 2915 for (size_t k=0; k + 1 < len; k++) 2916 { 2917 assert(ivals[k].a < ivals[k].b); 2918 assert(ivals[k].b < ivals[k+1].a); 2919 } 2920 data.length = len * 2; 2921 } 2922 2923 // special case for normal InversionList 2924 ref subChar(dchar ch) 2925 { 2926 auto mark = skipUpTo(ch); 2927 if (mark != data.length 2928 && data[mark] == ch && data[mark-1] == ch) 2929 { 2930 // it has split, meaning that ch happens to be in one of intervals 2931 data[mark] = data[mark]+1; 2932 } 2933 return this; 2934 } 2935 2936 // 2937 Marker addInterval(int a, int b, Marker hint=Marker.init) scope 2938 in 2939 { 2940 assert(a <= b); 2941 } 2942 do 2943 { 2944 import std.range : assumeSorted, SearchPolicy; 2945 auto range = assumeSorted(data[]); 2946 size_t pos; 2947 size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length; 2948 if (a_idx == range.length) 2949 { 2950 // [---+++----++++----++++++] 2951 // [ a b] 2952 data.append(a, b); 2953 return data.length-1; 2954 } 2955 size_t b_idx = range[a_idx .. 
range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx; 2956 uint[3] buf = void; 2957 uint to_insert; 2958 debug(std_uni) 2959 { 2960 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2961 } 2962 if (b_idx == range.length) 2963 { 2964 // [-------++++++++----++++++-] 2965 // [ s a b] 2966 if (a_idx & 1)// a in positive 2967 { 2968 buf[0] = b; 2969 to_insert = 1; 2970 } 2971 else// a in negative 2972 { 2973 buf[0] = a; 2974 buf[1] = b; 2975 to_insert = 2; 2976 } 2977 pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]); 2978 return pos - 1; 2979 } 2980 2981 uint top = data[b_idx]; 2982 2983 debug(std_uni) 2984 { 2985 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2986 writefln("a=%s; b=%s; top=%s;", a, b, top); 2987 } 2988 if (a_idx & 1) 2989 {// a in positive 2990 if (b_idx & 1)// b in positive 2991 { 2992 // [-------++++++++----++++++-] 2993 // [ s a b ] 2994 buf[0] = top; 2995 to_insert = 1; 2996 } 2997 else // b in negative 2998 { 2999 // [-------++++++++----++++++-] 3000 // [ s a b ] 3001 if (top == b) 3002 { 3003 assert(b_idx+1 < data.length); 3004 buf[0] = data[b_idx+1]; 3005 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]); 3006 return pos - 1; 3007 } 3008 buf[0] = b; 3009 buf[1] = top; 3010 to_insert = 2; 3011 } 3012 } 3013 else 3014 { // a in negative 3015 if (b_idx & 1) // b in positive 3016 { 3017 // [----------+++++----++++++-] 3018 // [ a b ] 3019 buf[0] = a; 3020 buf[1] = top; 3021 to_insert = 2; 3022 } 3023 else// b in negative 3024 { 3025 // [----------+++++----++++++-] 3026 // [ a s b ] 3027 if (top == b) 3028 { 3029 assert(b_idx+1 < data.length); 3030 buf[0] = a; 3031 buf[1] = data[b_idx+1]; 3032 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]); 3033 return pos - 1; 3034 } 3035 buf[0] = a; 3036 buf[1] = b; 3037 buf[2] = top; 3038 to_insert = 3; 3039 } 3040 } 3041 pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. 
to_insert]); 3042 debug(std_uni) 3043 { 3044 writefln("marker idx: %d; length=%d", pos, data[pos], data.length); 3045 writeln("inserting ", buf[0 .. to_insert]); 3046 } 3047 return pos - 1; 3048 } 3049 3050 // 3051 Marker dropUpTo(uint a, Marker pos=Marker.init) 3052 in 3053 { 3054 assert(pos % 2 == 0); // at start of interval 3055 } 3056 do 3057 { 3058 auto range = assumeSorted!"a <= b"(data[pos .. data.length]); 3059 if (range.empty) 3060 return pos; 3061 size_t idx = pos; 3062 idx += range.lowerBound(a).length; 3063 3064 debug(std_uni) 3065 { 3066 writeln("dropUpTo full length=", data.length); 3067 writeln(pos,"~~~", idx); 3068 } 3069 if (idx == data.length) 3070 return genericReplace(data, pos, idx, cast(uint[])[]); 3071 if (idx & 1) 3072 { // a in positive 3073 //[--+++----++++++----+++++++------...] 3074 // |<---si s a t 3075 genericReplace(data, pos, idx, [a]); 3076 } 3077 else 3078 { // a in negative 3079 //[--+++----++++++----+++++++-------+++...] 3080 // |<---si s a t 3081 genericReplace(data, pos, idx, cast(uint[])[]); 3082 } 3083 return pos; 3084 } 3085 3086 // 3087 Marker skipUpTo(uint a, Marker pos=Marker.init) 3088 out(result) 3089 { 3090 assert(result % 2 == 0);// always start of interval 3091 //(may be 0-width after-split) 3092 } 3093 do 3094 { 3095 assert(data.length % 2 == 0); 3096 auto range = assumeSorted!"a <= b"(data[pos .. 
data.length]); 3097 size_t idx = pos+range.lowerBound(a).length; 3098 3099 if (idx >= data.length) // could have Marker point to recently removed stuff 3100 return data.length; 3101 3102 if (idx & 1)// inside of interval, check for split 3103 { 3104 3105 immutable top = data[idx]; 3106 if (top == a)// no need to split, it's end 3107 return idx+1; 3108 immutable start = data[idx-1]; 3109 if (a == start) 3110 return idx-1; 3111 // split it up 3112 genericReplace(data, idx, idx+1, [a, a, top]); 3113 return idx+1; // avoid odd index 3114 } 3115 return idx; 3116 } 3117 3118 CowArray!SP data; 3119 } 3120 3121 pure @safe unittest 3122 { 3123 import std.conv : to; 3124 assert(unicode.ASCII.to!string() == "[0..128)"); 3125 } 3126 3127 // pedantic version for ctfe, and aligned-access only architectures 3128 @system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3129 { 3130 idx *= 3; 3131 version (LittleEndian) 3132 return ptr[idx] + (cast(uint) ptr[idx+1]<<8) 3133 + (cast(uint) ptr[idx+2]<<16); 3134 else 3135 return (cast(uint) ptr[idx]<<16) + (cast(uint) ptr[idx+1]<<8) 3136 + ptr[idx+2]; 3137 } 3138 3139 // ditto 3140 @system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3141 { 3142 idx *= 3; 3143 version (LittleEndian) 3144 { 3145 ptr[idx] = val & 0xFF; 3146 ptr[idx+1] = (val >> 8) & 0xFF; 3147 ptr[idx+2] = (val >> 16) & 0xFF; 3148 } 3149 else 3150 { 3151 ptr[idx] = (val >> 16) & 0xFF; 3152 ptr[idx+1] = (val >> 8) & 0xFF; 3153 ptr[idx+2] = val & 0xFF; 3154 } 3155 } 3156 3157 // unaligned x86-like read/write functions 3158 @system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc 3159 { 3160 uint* src = cast(uint*)(ptr+3*idx); 3161 version (LittleEndian) 3162 return *src & 0xFF_FFFF; 3163 else 3164 return *src >> 8; 3165 } 3166 3167 // ditto 3168 @system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc 3169 { 3170 uint* dest = 
cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
        *dest = val | (*dest & 0xFF00_0000);
    else
        *dest = (val << 8) | (*dest & 0xFF);
}

// Reads 24-bit slot `idx`: byte-wise during CTFE or when unaligned reads
// are unavailable, otherwise through the unaligned fast path.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx);
    else
        return safeRead24(ptr, idx);
}

// Writes `val` to 24-bit slot `idx`, with the same dispatch as read24.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx);
    else
        return safeWrite24(ptr, val, idx);
}

// Array of `uint` with a reference count stored in one extra slot right
// after the payload (`data[$-1]`). An empty array owns no storage and has
// no counter. Mutating accessors detach from shared storage first
// (see dupThisReference); allocation is delegated to the policy `SP`.
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` without copying the payload: one slot is appended through
    // the policy and is expected to hold a ref-count of 1 (asserted below).
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds the array from a range that knows its length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        // An empty range leaves `data` empty; `data[0..$-1]` would then
        // underflow `$-1`, so there is nothing to slice or copy.
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // Builds the array from a forward range, walking it once to size the storage.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // same empty-range guard as in the constructor above
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // A copy shares the storage: just bump the shared counter.
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // Drops this reference; the last owner releases the storage via the policy.
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    // Resizes to `len` elements. Shared storage is never resized in place:
    // this reference is detached onto a fresh allocation instead.
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    // Read access never detaches.
    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Write access detaches from shared storage before storing.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // Mutable slice: the caller may write through it, so detach first.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // Read-only slice of the (possibly shared) storage.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally like every other user of `copy` in this struct
        import std.algorithm.mutation : copy;
        immutable extra = range.length;
        immutable nl = length + extra;
        length = nl;
        copy(range, this[nl - extra .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        // Nothing to append; also avoids `$-1` underflow in the slice
        // below when the array itself is still empty.
        if (val.length == 0)
            return;
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // Two arrays are equal when both are empty or their payloads
    // (everything except the counter slot) compare equal.
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Gives up this reference and leaves `data` empty; the last owner
    // hands the storage back to the policy.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Detaches this reference from shared storage: copies the payload into
    // a fresh allocation owned solely by `this`.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    uint[] data;
}

pure @safe unittest// Uint24 tests
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
assert(arr[1] == 0xFE_FEFE); 3473 assert(arr[2] == 100); 3474 U24A arr2 = arr; 3475 assert(arr2[0] == 72); 3476 arr2[0] = 11; 3477 // test COW-ness 3478 assert(arr[0] == 72); 3479 assert(arr2[0] == 11); 3480 // set this to about 100M to stress-test COW memory management 3481 foreach (v; 0 .. 10_000) 3482 func2(arr); 3483 assert(equal(arr[], [72, 0xFE_FEFE, 100])); 3484 3485 auto r2 = U24A(iota(0, 100)); 3486 assert(equal(r2[], iota(0, 100)), text(r2[])); 3487 copy(iota(10, 170, 2), r2[10 .. 90]); 3488 assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100))) 3489 , text(r2[])); 3490 }} 3491 } 3492 3493 pure @safe unittest// core set primitives test 3494 { 3495 import std.conv : text; 3496 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); 3497 foreach (CodeList; AllSets) 3498 { 3499 CodeList a; 3500 //"plug a hole" test 3501 a.add(10, 20).add(25, 30).add(15, 27); 3502 assert(a == CodeList(10, 30), text(a)); 3503 3504 auto x = CodeList.init; 3505 x.add(10, 20).add(30, 40).add(50, 60); 3506 3507 a = x; 3508 a.add(20, 49);//[10, 49) [50, 60) 3509 assert(a == CodeList(10, 49, 50 ,60)); 3510 3511 a = x; 3512 a.add(20, 50); 3513 assert(a == CodeList(10, 60), text(a)); 3514 3515 // simple unions, mostly edge effects 3516 x = CodeList.init; 3517 x.add(10, 20).add(40, 60); 3518 3519 a = x; 3520 a.add(10, 25); //[10, 25) [40, 60) 3521 assert(a == CodeList(10, 25, 40, 60)); 3522 3523 a = x; 3524 a.add(5, 15); //[5, 20) [40, 60) 3525 assert(a == CodeList(5, 20, 40, 60)); 3526 3527 a = x; 3528 a.add(0, 10); // [0, 20) [40, 60) 3529 assert(a == CodeList(0, 20, 40, 60)); 3530 3531 a = x; 3532 a.add(0, 5); // prepand 3533 assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a)); 3534 3535 a = x; 3536 a.add(5, 20); 3537 assert(a == CodeList(5, 20, 40, 60)); 3538 3539 a = x; 3540 a.add(3, 37); 3541 assert(a == CodeList(3, 37, 40, 60)); 3542 3543 a = x; 3544 a.add(37, 65); 3545 assert(a == CodeList(10, 20, 37, 65)); 3546 3547 // some tests on 
helpers for set intersection 3548 x = CodeList.init.add(10, 20).add(40, 60).add(100, 120); 3549 a = x; 3550 3551 auto m = a.skipUpTo(60); 3552 a.dropUpTo(110, m); 3553 assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[])); 3554 3555 a = x; 3556 a.dropUpTo(100); 3557 assert(a == CodeList(100, 120), text(a.data[])); 3558 3559 a = x; 3560 m = a.skipUpTo(50); 3561 a.dropUpTo(140, m); 3562 assert(a == CodeList(10, 20, 40, 50), text(a.data[])); 3563 a = x; 3564 a.dropUpTo(60); 3565 assert(a == CodeList(100, 120), text(a.data[])); 3566 } 3567 } 3568 3569 3570 //test constructor to work with any order of intervals 3571 pure @safe unittest 3572 { 3573 import std.algorithm.comparison : equal; 3574 import std.conv : text, to; 3575 import std.range : chain, iota; 3576 import std.typecons : tuple; 3577 //ensure constructor handles bad ordering and overlap 3578 auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1); 3579 foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1))) 3580 assert(ch in c1, to!string(ch)); 3581 3582 //contiguos 3583 assert(CodepointSet(1000, 1006, 1006, 1009) 3584 .byInterval.equal([tuple(1000, 1009)])); 3585 //contains 3586 assert(CodepointSet(900, 1200, 1000, 1100) 3587 .byInterval.equal([tuple(900, 1200)])); 3588 //intersect left 3589 assert(CodepointSet(900, 1100, 1000, 1200) 3590 .byInterval.equal([tuple(900, 1200)])); 3591 //intersect right 3592 assert(CodepointSet(1000, 1200, 900, 1100) 3593 .byInterval.equal([tuple(900, 1200)])); 3594 3595 //ditto with extra items at end 3596 assert(CodepointSet(1000, 1200, 900, 1100, 800, 850) 3597 .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); 3598 assert(CodepointSet(900, 1100, 1000, 1200, 800, 850) 3599 .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); 3600 3601 //"plug a hole" test 3602 auto c2 = CodepointSet(20, 40, 3603 60, 80, 100, 140, 150, 200, 3604 40, 60, 80, 100, 140, 150 3605 ); 3606 assert(c2.byInterval.equal([tuple(20, 200)])); 3607 3608 auto c3 = CodepointSet( 3609 20, 40, 60, 80, 
100, 140, 150, 200, 3610 0, 10, 15, 100, 10, 20, 200, 220); 3611 assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)])); 3612 } 3613 3614 3615 pure @safe unittest 3616 { // full set operations 3617 import std.conv : text; 3618 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); 3619 foreach (CodeList; AllSets) 3620 { 3621 CodeList a, b, c, d; 3622 3623 //"plug a hole" 3624 a.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3625 b.add(40, 60).add(80, 100).add(140, 150); 3626 c = a | b; 3627 d = b | a; 3628 assert(c == CodeList(20, 200), text(CodeList.stringof," ", c)); 3629 assert(c == d, text(c," vs ", d)); 3630 3631 b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210); 3632 c = a | b; //[20,45) [60, 85) [95, 140) [150, 210) 3633 d = b | a; 3634 assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c)); 3635 assert(c == d, text(c," vs ", d)); 3636 3637 b = CodeList.init.add(10, 20).add(30,100).add(145,200); 3638 c = a | b;//[10, 140) [145, 200) 3639 d = b | a; 3640 assert(c == CodeList(10, 140, 145, 200)); 3641 assert(c == d, text(c," vs ", d)); 3642 3643 b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220); 3644 c = a | b;//[0, 140) [150, 220) 3645 d = b | a; 3646 assert(c == CodeList(0, 140, 150, 220)); 3647 assert(c == d, text(c," vs ", d)); 3648 3649 3650 a = CodeList.init.add(20, 40).add(60, 80); 3651 b = CodeList.init.add(25, 35).add(65, 75); 3652 c = a & b; 3653 d = b & a; 3654 assert(c == CodeList(25, 35, 65, 75), text(c)); 3655 assert(c == d, text(c," vs ", d)); 3656 3657 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3658 b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180); 3659 c = a & b; 3660 d = b & a; 3661 assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c)); 3662 assert(c == d, text(c," vs ", d)); 3663 3664 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3665 b = CodeList.init.add(10, 30).add(60, 
120).add(135, 160); 3666 c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160) 3667 d = b & a; 3668 3669 assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c)); 3670 assert(c == d, text(c, " vs ",d)); 3671 assert((c & a) == c); 3672 assert((d & b) == d); 3673 assert((c & d) == d); 3674 3675 b = CodeList.init.add(40, 60).add(80, 100).add(140, 200); 3676 c = a & b; 3677 d = b & a; 3678 assert(c == CodeList(150, 200), text(c)); 3679 assert(c == d, text(c, " vs ",d)); 3680 assert((c & a) == c); 3681 assert((d & b) == d); 3682 assert((c & d) == d); 3683 3684 assert((a & a) == a); 3685 assert((b & b) == b); 3686 3687 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3688 b = CodeList.init.add(30, 60).add(75, 120).add(190, 300); 3689 c = a - b;// [30, 40) [60, 75) [120, 140) [150, 190) 3690 d = b - a;// [40, 60) [80, 100) [200, 300) 3691 assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c)); 3692 assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d)); 3693 assert(c - d == c, text(c-d, " vs ", c)); 3694 assert(d - c == d, text(d-c, " vs ", d)); 3695 assert(c - c == CodeList.init); 3696 assert(d - d == CodeList.init); 3697 3698 a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200); 3699 b = CodeList.init.add(10, 50).add(60, 160).add(190, 300); 3700 c = a - b;// [160, 190) 3701 d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300) 3702 assert(c == CodeList(160, 190), text(c)); 3703 assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d)); 3704 assert(c - d == c, text(c-d, " vs ", c)); 3705 assert(d - c == d, text(d-c, " vs ", d)); 3706 assert(c - c == CodeList.init); 3707 assert(d - d == CodeList.init); 3708 3709 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3710 b = CodeList.init.add(10, 30).add(45, 100).add(130, 190); 3711 c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200) 3712 d = b ~ a; 3713 assert(c == CodeList(10, 
20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200), 3714 text(c)); 3715 assert(c == d, text(c, " vs ", d)); 3716 } 3717 } 3718 3719 } 3720 3721 pure @safe unittest// vs single dchar 3722 { 3723 import std.conv : text; 3724 CodepointSet a = CodepointSet(10, 100, 120, 200); 3725 assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A')); 3726 assert((a & 'B') == CodepointSet(66, 67)); 3727 } 3728 3729 pure @safe unittest// iteration & opIndex 3730 { 3731 import std.algorithm.comparison : equal; 3732 import std.conv : text; 3733 import std.typecons : tuple, Tuple; 3734 3735 static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy))) 3736 {{ 3737 auto arr = "ABCDEFGHIJKLMabcdefghijklm"d; 3738 auto a = CodeList('A','N','a', 'n'); 3739 assert(equal(a.byInterval, 3740 [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')] 3741 ), text(a.byInterval)); 3742 3743 // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ? 3744 version (bug8949) 3745 { 3746 import std.range : retro; 3747 assert(equal(retro(a.byInterval), 3748 [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')] 3749 ), text(retro(a.byInterval))); 3750 } 3751 auto achr = a.byCodepoint; 3752 assert(equal(achr, arr), text(a.byCodepoint)); 3753 foreach (ch; a.byCodepoint) 3754 assert(a[ch]); 3755 auto x = CodeList(100, 500, 600, 900, 1200, 1500); 3756 assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval)); 3757 foreach (ch; x.byCodepoint) 3758 assert(x[ch]); 3759 static if (is(CodeList == CodepointSet)) 3760 { 3761 auto y = CodeList(x.byInterval); 3762 assert(equal(x.byInterval, y.byInterval)); 3763 } 3764 assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[])); 3765 assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[])); 3766 }} 3767 } 3768 3769 //============================================================================ 3770 // Generic Trie template and various ways 
to build it 3771 //============================================================================ 3772 3773 // debug helper to get a shortened array dump 3774 auto arrayRepr(T)(T x) 3775 { 3776 import std.conv : text; 3777 if (x.length > 32) 3778 { 3779 return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]); 3780 } 3781 else 3782 return text(x); 3783 } 3784 3785 /** 3786 Maps `Key` to a suitable integer index within the range of `size_t`. 3787 The mapping is constructed by applying predicates from `Prefix` left to right 3788 and concatenating the resulting bits. 3789 3790 The first (leftmost) predicate defines the most significant bits of 3791 the resulting index. 3792 */ 3793 template mapTrieIndex(Prefix...) 3794 { 3795 size_t mapTrieIndex(Key)(Key key) 3796 if (isValidPrefixForTrie!(Key, Prefix)) 3797 { 3798 alias p = Prefix; 3799 size_t idx; 3800 foreach (i, v; p[0..$-1]) 3801 { 3802 idx |= p[i](key); 3803 idx <<= p[i+1].bitSize; 3804 } 3805 idx |= p[$-1](key); 3806 return idx; 3807 } 3808 } 3809 3810 /* 3811 `TrieBuilder` is a type used for incremental construction 3812 of $(LREF Trie)s. 3813 3814 See $(LREF buildTrie) for generic helpers built on top of it. 3815 */ 3816 @trusted private struct TrieBuilder(Value, Key, Args...) 3817 if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args)) 3818 { 3819 import std.exception : enforce; 3820 3821 private: 3822 // last index is not stored in table, it is used as an offset to values in a block. 
3823 static if (is(Value == bool))// always pack bool 3824 alias V = BitPacked!(Value, 1); 3825 else 3826 alias V = Value; 3827 static auto deduceMaxIndex(Preds...)() 3828 { 3829 size_t idx = 1; 3830 foreach (v; Preds) 3831 idx *= 2^^v.bitSize; 3832 return idx; 3833 } 3834 3835 static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key 3836 { 3837 alias Prefix = Args[1..$]; 3838 enum lastPageSize = 2^^Prefix[$-1].bitSize; 3839 enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]); 3840 enum roughedMaxIndex = 3841 (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize; 3842 // check warp around - if wrapped, use the default deduction rule 3843 enum maxIndex = roughedMaxIndex < translatedMaxIndex ? 3844 deduceMaxIndex!(Prefix)() : roughedMaxIndex; 3845 } 3846 else 3847 { 3848 alias Prefix = Args; 3849 enum maxIndex = deduceMaxIndex!(Prefix)(); 3850 } 3851 3852 alias getIndex = mapTrieIndex!(Prefix); 3853 3854 enum lastLevel = Prefix.length-1; 3855 struct ConstructState 3856 { 3857 size_t idx_zeros, idx_ones; 3858 } 3859 // iteration over levels of Trie, each indexes its own level and thus a shortened domain 3860 size_t[Prefix.length] indices; 3861 // default filler value to use 3862 Value defValue; 3863 // this is a full-width index of next item 3864 size_t curIndex; 3865 // all-zeros page index, all-ones page index (+ indicator if there is such a page) 3866 ConstructState[Prefix.length] state; 3867 // the table being constructed 3868 MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table; 3869 3870 @disable this(); 3871 3872 //shortcut for index variable at level 'level' 3873 @property ref idx(size_t level)(){ return indices[level]; } 3874 3875 // this function assumes no holes in the input so 3876 // indices are going one by one 3877 void addValue(size_t level, T)(T val, size_t numVals) 3878 { 3879 alias j = idx!level; 3880 enum pageSize = 1 << Prefix[level].bitSize; 3881 if (numVals == 0) 3882 return; 3883 auto ptr = 
table.slice!(level); 3884 if (numVals == 1) 3885 { 3886 static if (level == Prefix.length-1) 3887 ptr[j] = val; 3888 else 3889 {// can incur narrowing conversion 3890 assert(j < ptr.length); 3891 ptr[j] = force!(typeof(ptr[j]))(val); 3892 } 3893 j++; 3894 if (j % pageSize == 0) 3895 spillToNextPage!level(ptr); 3896 return; 3897 } 3898 // longer row of values 3899 // get to the next page boundary 3900 immutable nextPB = (j + pageSize) & ~(pageSize-1); 3901 immutable n = nextPB - j;// can fill right in this page 3902 if (numVals < n) //fits in current page 3903 { 3904 ptr[j .. j+numVals] = val; 3905 j += numVals; 3906 return; 3907 } 3908 static if (level != 0)//on the first level it always fits 3909 { 3910 numVals -= n; 3911 //write till the end of current page 3912 ptr[j .. j+n] = val; 3913 j += n; 3914 //spill to the next page 3915 spillToNextPage!level(ptr); 3916 // page at once loop 3917 if (state[level].idx_zeros != size_t.max && val == T.init) 3918 { 3919 alias NextIdx = typeof(table.slice!(level-1)[0]); 3920 addValue!(level-1)(force!NextIdx(state[level].idx_zeros), 3921 numVals/pageSize); 3922 ptr = table.slice!level; //table structure might have changed 3923 numVals %= pageSize; 3924 } 3925 else 3926 { 3927 while (numVals >= pageSize) 3928 { 3929 numVals -= pageSize; 3930 ptr[j .. j+pageSize] = val; 3931 j += pageSize; 3932 spillToNextPage!level(ptr); 3933 } 3934 } 3935 if (numVals) 3936 { 3937 // the leftovers, an incomplete page 3938 ptr[j .. j+numVals] = val; 3939 j += numVals; 3940 } 3941 } 3942 } 3943 3944 void spillToNextPage(size_t level, Slice)(ref Slice ptr) 3945 { 3946 // last level (i.e. 
topmost) has 1 "page" 3947 // thus it need not to add a new page on upper level 3948 static if (level != 0) 3949 spillToNextPageImpl!(level)(ptr); 3950 } 3951 3952 // this can re-use the current page if duplicate or allocate a new one 3953 // it also makes sure that previous levels point to the correct page in this level 3954 void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr) 3955 { 3956 alias NextIdx = typeof(table.slice!(level-1)[0]); 3957 NextIdx next_lvl_index; 3958 enum pageSize = 1 << Prefix[level].bitSize; 3959 assert(idx!level % pageSize == 0); 3960 immutable last = idx!level-pageSize; 3961 const slice = ptr[idx!level - pageSize .. idx!level]; 3962 size_t j; 3963 for (j=0; j<last; j+=pageSize) 3964 { 3965 if (ptr[j .. j+pageSize] == slice) 3966 { 3967 // get index to it, reuse ptr space for the next block 3968 next_lvl_index = force!NextIdx(j/pageSize); 3969 version (none) 3970 { 3971 import std.stdio : writefln, writeln; 3972 writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]" 3973 ,level 3974 ,indices[level-1], pageSize, j, j+pageSize); 3975 writeln("LEVEL(", level 3976 , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize])); 3977 writeln("LEVEL(", level 3978 , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize])); 3979 } 3980 idx!level -= pageSize; // reuse this page, it is duplicate 3981 break; 3982 } 3983 } 3984 if (j == last) 3985 { 3986 L_allocate_page: 3987 next_lvl_index = force!NextIdx(idx!level/pageSize - 1); 3988 if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize)) 3989 { 3990 state[level].idx_zeros = next_lvl_index; 3991 } 3992 // allocate next page 3993 version (none) 3994 { 3995 import std.stdio : writefln; 3996 writefln("LEVEL(%s) page allocated: %s" 3997 , level, arrayRepr(slice[0 .. 
pageSize])); 3998 writefln("LEVEL(%s) index: %s ; page at this index %s" 3999 , level 4000 , next_lvl_index 4001 , arrayRepr( 4002 table.slice!(level) 4003 [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize] 4004 )); 4005 } 4006 table.length!level = table.length!level + pageSize; 4007 } 4008 L_know_index: 4009 // for the previous level, values are indices to the pages in the current level 4010 addValue!(level-1)(next_lvl_index, 1); 4011 ptr = table.slice!level; //re-load the slice after moves 4012 } 4013 4014 // idx - full-width index to fill with v (full-width index != key) 4015 // fills everything in the range of [curIndex, idx) with filler 4016 void putAt(size_t idx, Value v) 4017 { 4018 assert(idx >= curIndex); 4019 immutable numFillers = idx - curIndex; 4020 addValue!lastLevel(defValue, numFillers); 4021 addValue!lastLevel(v, 1); 4022 curIndex = idx + 1; 4023 } 4024 4025 // ditto, but sets the range of [idxA, idxB) to v 4026 void putRangeAt(size_t idxA, size_t idxB, Value v) 4027 { 4028 assert(idxA >= curIndex); 4029 assert(idxB >= idxA); 4030 size_t numFillers = idxA - curIndex; 4031 addValue!lastLevel(defValue, numFillers); 4032 addValue!lastLevel(v, idxB - idxA); 4033 curIndex = idxB; // open-right 4034 } 4035 4036 enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~ 4037 "duplicate key->value mapping"; 4038 4039 public: 4040 /** 4041 Construct a builder, where `filler` is a value 4042 to indicate empty slots (or "not found" condition). 4043 */ 4044 this(Value filler) 4045 { 4046 curIndex = 0; 4047 defValue = filler; 4048 // zeros-page index, ones-page index 4049 foreach (ref v; state) 4050 v = ConstructState(size_t.max, size_t.max); 4051 table = typeof(table)(indices); 4052 // one page per level is a bootstrap minimum 4053 foreach (i, Pred; Prefix) 4054 table.length!i = (1 << Pred.bitSize); 4055 } 4056 4057 /** 4058 Put a value `v` into interval as 4059 mapped by keys from `a` to `b`. 
4060 All slots prior to `a` are filled with 4061 the default filler. 4062 */ 4063 void putRange(Key a, Key b, Value v) 4064 { 4065 auto idxA = getIndex(a), idxB = getIndex(b); 4066 // indexes of key should always grow 4067 enforce(idxB >= idxA && idxA >= curIndex, errMsg); 4068 putRangeAt(idxA, idxB, v); 4069 } 4070 4071 /** 4072 Put a value `v` into slot mapped by `key`. 4073 All slots prior to `key` are filled with the 4074 default filler. 4075 */ 4076 void putValue(Key key, Value v) 4077 { 4078 auto idx = getIndex(key); 4079 enforce(idx >= curIndex, errMsg); 4080 putAt(idx, v); 4081 } 4082 4083 /// Finishes construction of Trie, yielding an immutable Trie instance. 4084 auto build() 4085 { 4086 static if (maxIndex != 0) // doesn't cover full range of size_t 4087 { 4088 assert(curIndex <= maxIndex); 4089 addValue!lastLevel(defValue, maxIndex - curIndex); 4090 } 4091 else 4092 { 4093 if (curIndex != 0 // couldn't wrap around 4094 || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty 4095 { 4096 addValue!lastLevel(defValue, size_t.max - curIndex); 4097 addValue!lastLevel(defValue, 1); 4098 } 4099 // else curIndex already completed the full range of size_t by wrapping around 4100 } 4101 return Trie!(V, Key, maxIndex, Prefix)(table); 4102 } 4103 } 4104 4105 /** 4106 $(P A generic Trie data-structure for a fixed number of stages. 4107 The design goal is optimal speed with smallest footprint size. 4108 ) 4109 $(P It's intentionally read-only and doesn't provide constructors. 4110 To construct one use a special builder, 4111 see $(LREF TrieBuilder) and $(LREF buildTrie). 4112 ) 4113 4114 */ 4115 @trusted private struct Trie(Value, Key, Args...) 
4116 if (isValidPrefixForTrie!(Key, Args) 4117 || (isValidPrefixForTrie!(Key, Args[1..$]) 4118 && is(typeof(Args[0]) : size_t))) 4119 { 4120 import std.range.primitives : isOutputRange; 4121 static if (is(typeof(Args[0]) : size_t)) 4122 { 4123 private enum maxIndex = Args[0]; 4124 private enum hasBoundsCheck = true; 4125 private alias Prefix = Args[1..$]; 4126 } 4127 else 4128 { 4129 private enum hasBoundsCheck = false; 4130 private alias Prefix = Args; 4131 } 4132 4133 private this()(typeof(_table) table) 4134 { 4135 _table = table; 4136 } 4137 4138 // only for constant Tries constructed from precompiled tables 4139 private this()(const(size_t)[] offsets, const(size_t)[] sizes, 4140 const(size_t)[] data) const 4141 { 4142 _table = typeof(_table)(offsets, sizes, data); 4143 } 4144 4145 /** 4146 $(P Lookup the `key` in this `Trie`. ) 4147 4148 $(P The lookup always succeeds if key fits the domain 4149 provided during construction. The whole domain defined 4150 is covered so instead of not found condition 4151 the sentinel (filler) value could be used. ) 4152 4153 $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to 4154 define a domain of `Trie` keys and the sentinel value. ) 4155 4156 Note: 4157 Domain range-checking is only enabled in debug builds 4158 and results in assertion failure. 
4159 */ 4160 TypeOfBitPacked!Value opIndex()(Key key) const 4161 { 4162 static if (hasBoundsCheck) 4163 assert(mapTrieIndex!Prefix(key) < maxIndex); 4164 size_t idx; 4165 alias p = Prefix; 4166 idx = cast(size_t) p[0](key); 4167 foreach (i, v; p[0..$-1]) 4168 idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key)); 4169 return _table.ptr!(p.length-1)[idx]; 4170 } 4171 4172 /// 4173 @property size_t bytes(size_t n=size_t.max)() const 4174 { 4175 return _table.bytes!n; 4176 } 4177 4178 /// 4179 @property size_t pages(size_t n)() const 4180 { 4181 return (bytes!n+2^^(Prefix[n].bitSize-1)) 4182 /2^^Prefix[n].bitSize; 4183 } 4184 4185 /// 4186 void store(OutRange)(scope OutRange sink) const 4187 if (isOutputRange!(OutRange, char)) 4188 { 4189 _table.store(sink); 4190 } 4191 4192 private: 4193 MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table; 4194 } 4195 4196 // create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes' 4197 // left-to-right, the most significant bits first 4198 template GetBitSlicing(size_t top, sizes...) 4199 { 4200 static if (sizes.length > 0) 4201 alias GetBitSlicing = 4202 AliasSeq!(sliceBits!(top - sizes[0], top), 4203 GetBitSlicing!(top - sizes[0], sizes[1..$])); 4204 else 4205 alias GetBitSlicing = AliasSeq!(); 4206 } 4207 4208 template callableWith(T) 4209 { 4210 template callableWith(alias Pred) 4211 { 4212 static if (!is(typeof(Pred(T.init)))) 4213 enum callableWith = false; 4214 else 4215 { 4216 alias Result = typeof(Pred(T.init)); 4217 enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result)); 4218 } 4219 } 4220 } 4221 4222 /* 4223 Check if `Prefix` is a valid set of predicates 4224 for `Trie` template having `Key` as the type of keys. 4225 This requires all predicates to be callable, take 4226 single argument of type `Key` and return unsigned value. 4227 */ 4228 template isValidPrefixForTrie(Key, Prefix...) 
/*
    Check if `Args` is a valid argument tuple for the `Trie` template
    having `Key` as the type of keys. Two shapes are accepted:
    a set of valid predicates, or a maximum key value (convertible to `Key`)
    followed by a set of valid predicates.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else
    {
        // With at most one argument there is no room for a leading maximum
        // key value, so the whole tuple must be predicates. `Key` has to be
        // forwarded explicitly: `isValidPrefixForTrie!Args` would bind
        // Args[0] to the `Key` parameter and validate an empty predicate
        // list, which is trivially true.
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
}
/**
    A slightly more general tool for building fixed `Trie`
    for the Unicode data.

    Specifically unlike `codepointSetTrie` it allows creating mappings
    of `dchar` to an arbitrary type `T`.

    Note: Overload taking `CodepointSet`s will naturally convert
    only to bool mapping `Trie`s.

    CodepointTrie is the type of Trie as generated by codepointTrie function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        auto codepointTrie(Set)(const scope Set set)
        if (isCodepointSet!Set)
        {
            // The per-level bit sizes must be forwarded explicitly:
            // `codepointSetTrie` is constrained on
            // `sumOfIntegerTuple!sizes == 21`, which a call without
            // template arguments cannot satisfy.
            return codepointSetTrie!(sizes)(set);
        }
    }

    ///
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    ///
    auto codepointTrie(R)(R range, T defValue=T.init)
    if (isInputRange!R
        && is(typeof(ElementType!R.init[0]) : T)
        && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
arabic letter DDAL (\u0688) has a "luck factor" of 2 4365 ubyte[6] nibbles; // 6 4-bit chunks of code point 4366 uint value = ch; 4367 foreach (i; 0 .. 6) 4368 { 4369 nibbles[i] = value & 0xF; 4370 value >>= 4; 4371 } 4372 uint luck; 4373 foreach (n; nibbles) 4374 luck = cast(uint) max(luck, count(nibbles[], n)); 4375 return luck; 4376 } 4377 4378 // only unsigned built-ins are supported at the moment 4379 alias LuckFactor = BitPacked!(uint, 3); 4380 4381 // create a temporary associative array (AA) 4382 LuckFactor[dchar] map; 4383 foreach (ch; set.byCodepoint) 4384 map[ch] = LuckFactor(luckFactor(ch)); 4385 4386 // bits per stage are chosen randomly, fell free to optimize 4387 auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map); 4388 4389 // from now on the AA is not needed 4390 foreach (ch; set.byCodepoint) 4391 assert(trie[ch] == luckFactor(ch)); // verify 4392 // CJK is not Greek, thus it has the default value 4393 assert(trie['\u4444'] == 0); 4394 // and here is a couple of quite lucky Greek characters: 4395 // Greek small letter epsilon with dasia 4396 assert(trie['\u1F11'] == 3); 4397 // Ancient Greek metretes sign 4398 assert(trie['\U00010181'] == 3); 4399 4400 } 4401 4402 /// ditto 4403 public template CodepointTrie(T, sizes...) 4404 if (sumOfIntegerTuple!sizes == 21) 4405 { 4406 alias Prefix = GetBitSlicing!(21, sizes); 4407 alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build()); 4408 } 4409 4410 package(std) template cmpK0(alias Pred) 4411 { 4412 import std.typecons : Tuple; 4413 static bool cmpK0(Value, Key) 4414 (Tuple!(Value, Key) a, Tuple!(Value, Key) b) 4415 { 4416 return Pred(a[1]) < Pred(b[1]); 4417 } 4418 } 4419 4420 /** 4421 The most general utility for construction of `Trie`s 4422 short of using `TrieBuilder` directly. 4423 4424 Provides a number of convenience overloads. 4425 `Args` is tuple of maximum key value followed by 4426 predicates to construct index from key. 
4427 4428 Alternatively if the first argument is not a value convertible to `Key` 4429 then the whole tuple of `Args` is treated as predicates 4430 and the maximum Key is deduced from predicates. 4431 */ 4432 private template buildTrie(Value, Key, Args...) 4433 if (isValidArgsForTrie!(Key, Args)) 4434 { 4435 static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key 4436 { 4437 alias Prefix = Args[1..$]; 4438 } 4439 else 4440 alias Prefix = Args; 4441 4442 alias getIndex = mapTrieIndex!(Prefix); 4443 4444 // for multi-sort 4445 template GetComparators(size_t n) 4446 { 4447 static if (n > 0) 4448 alias GetComparators = 4449 AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1])); 4450 else 4451 alias GetComparators = AliasSeq!(); 4452 } 4453 4454 /* 4455 Build `Trie` from a range of a Key-Value pairs, 4456 assuming it is sorted by Key as defined by the following lambda: 4457 ------ 4458 (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b) 4459 ------ 4460 Exception is thrown if it's detected that the above order doesn't hold. 4461 4462 In other words $(LREF mapTrieIndex) should be a 4463 monotonically increasing function that maps `Key` to an integer. 4464 4465 See_Also: $(REF sort, std,_algorithm), 4466 $(REF SortedRange, std,range), 4467 $(REF setUnion, std,_algorithm). 4468 */ 4469 auto buildTrie(Range)(Range range, Value filler=Value.init) 4470 if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value) 4471 && is(typeof(Range.init.front[1]) : Key)) 4472 { 4473 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4474 foreach (v; range) 4475 builder.putValue(v[1], v[0]); 4476 return builder.build(); 4477 } 4478 4479 /* 4480 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4481 to build `Trie` from a range of open-right intervals of `Key`s. 4482 The requirement on the ordering of keys (and the behavior on the 4483 violation of it) is the same as for Key-Value range overload. 
4484 4485 Intervals denote ranges of !`filler` i.e. the opposite of filler. 4486 If no filler provided keys inside of the intervals map to true, 4487 and `filler` is false. 4488 */ 4489 auto buildTrie(Range)(Range range, Value filler=Value.init) 4490 if (is(TypeOfBitPacked!Value == bool) 4491 && isInputRange!Range && is(typeof(Range.init.front[0]) : Key) 4492 && is(typeof(Range.init.front[1]) : Key)) 4493 { 4494 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4495 foreach (ival; range) 4496 builder.putRange(ival[0], ival[1], !filler); 4497 return builder.build(); 4498 } 4499 4500 auto buildTrie(Range)(Range range, Value filler, bool unsorted) 4501 if (isInputRange!Range 4502 && is(typeof(Range.init.front[0]) : Value) 4503 && is(typeof(Range.init.front[1]) : Key)) 4504 { 4505 import std.algorithm.sorting : multiSort; 4506 alias Comps = GetComparators!(Prefix.length); 4507 if (unsorted) 4508 multiSort!(Comps)(range); 4509 return buildTrie(range, filler); 4510 } 4511 4512 /* 4513 If `Value` is bool (or BitPacked!(bool, x)) then it's possible 4514 to build `Trie` simply from an input range of `Key`s. 4515 The requirement on the ordering of keys (and the behavior on the 4516 violation of it) is the same as for Key-Value range overload. 4517 4518 Keys found in range denote !`filler` i.e. the opposite of filler. 4519 If no filler provided keys map to true, and `filler` is false. 4520 */ 4521 auto buildTrie(Range)(Range range, Value filler=Value.init) 4522 if (is(TypeOfBitPacked!Value == bool) 4523 && isInputRange!Range && is(typeof(Range.init.front) : Key)) 4524 { 4525 auto builder = TrieBuilder!(Value, Key, Prefix)(filler); 4526 foreach (v; range) 4527 builder.putValue(v, !filler); 4528 return builder.build(); 4529 } 4530 4531 /* 4532 If `Key` is unsigned integer `Trie` could be constructed from array 4533 of values where array index serves as key. 
// helper in place of assumeSize to
// reduce mangled name & help DMD inline Trie functors
//
// Used as a Trie index predicate: `clamp!bits(value)` goes through the static
// `opCall` and yields `arg` converted to `size_t`, while `bitSize` publishes
// `bits` as the width of this index level.
// Note that the value is returned as is - no masking to `bits` is done here.
struct clamp(size_t bits)
{
    static size_t opCall(T)(T arg){ return arg; }
    enum bitSize = bits;
}
4596 */ 4597 public bool match(Range)(ref Range inp) 4598 if (isRandomAccessRange!Range && is(ElementType!Range : char)) 4599 { 4600 assert(false); 4601 } 4602 4603 ///ditto 4604 public bool skip(Range)(ref Range inp) 4605 if (isRandomAccessRange!Range && is(ElementType!Range : char)) 4606 { 4607 assert(false); 4608 } 4609 4610 ///ditto 4611 public bool test(Range)(ref Range inp) 4612 if (isRandomAccessRange!Range && is(ElementType!Range : char)) 4613 { 4614 assert(false); 4615 } 4616 /// 4617 pure @safe unittest 4618 { 4619 string truth = "2² = 4"; 4620 auto m = utfMatcher!char(unicode.Number); 4621 assert(m.match(truth)); // '2' is a number all right 4622 assert(truth == "² = 4"); // skips on match 4623 assert(m.match(truth)); // so is the superscript '2' 4624 assert(!m.match(truth)); // space is not a number 4625 assert(truth == " = 4"); // unaffected on no match 4626 assert(!m.skip(truth)); // same test ... 4627 assert(truth == "= 4"); // but skips a codepoint regardless 4628 assert(!m.test(truth)); // '=' is not a number 4629 assert(truth == "= 4"); // test never affects argument 4630 } 4631 4632 /** 4633 Advanced feature - provide direct access to a subset of matcher based a 4634 set of known encoding lengths. Lengths are provided in 4635 $(S_LINK Code unit, code units). The sub-matcher then may do less 4636 operations per any `test`/`match`. 4637 4638 Use with care as the sub-matcher won't match 4639 any $(CODEPOINTS) that have encoded length that doesn't belong 4640 to the selected set of lengths. Also the sub-matcher object references 4641 the parent matcher and must not be used past the liftetime 4642 of the latter. 4643 4644 Another caveat of using sub-matcher is that skip is not available 4645 preciesly because sub-matcher doesn't detect all lengths. 
/**
    Test if `M` is an UTF Matcher for ranges of `C`.

    `M` qualifies when `match` and `test` return `bool` both for a
    `decoder` range over `C[]` and for `C[]` itself. `skip` is optional,
    but if it is callable with the decoder range it must return `bool`
    for both kinds of argument as well.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    // These have to be `static assert`s: a run-time `assert(is(...))`
    // compiles no matter what the `is` expression evaluates to, which
    // would make the enclosing `__traits(compiles, ...)` check vacuous.
    static assert(is(typeof(m.match(d)) == bool));
    static assert(is(typeof(m.test(d)) == bool));
    static if (is(typeof(m.skip(d))))
    {
        static assert(is(typeof(m.skip(d)) == bool));
        static assert(is(typeof(m.skip(s)) == bool));
    }
    static assert(is(typeof(m.match(s)) == bool));
    static assert(is(typeof(m.test(s)) == bool));
});
// Strips the 0x80 marker from a UTF-8 continuation byte and returns the
// remaining 6-bit payload. `char` arithmetic wraps around, so after the
// subtraction only bytes from 0x80 .. 0xBF end up below 0x40; anything
// else is reported through badEncoding().
char truncate()(char ch) pure @safe
{
    ch -= 0x80;
    if (ch >= 0x40)
    {
        badEncoding();
        // badEncoding() throws; this only gives the path a return value
        return cast(char) 0;
    }
    return ch;
}
sz]; 4778 return ret; 4779 } 4780 4781 auto build(Set)(Set set) 4782 { 4783 import std.algorithm.iteration : map; 4784 auto ascii = set & unicode.ASCII; 4785 auto utf8_2 = set & CodepointSet(0x80, 0x800); 4786 auto utf8_3 = set & CodepointSet(0x800, 0x1_0000); 4787 auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1); 4788 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); 4789 auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2); 4790 auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3); 4791 auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4); 4792 alias Ret = Impl!(1,2,3,4); 4793 return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T); 4794 } 4795 4796 // Bootstrap UTF-8 static matcher interface 4797 // from 3 primitives: tab!(size), lookup and Sizes 4798 mixin template DefMatcher() 4799 { 4800 import std.format : format; 4801 import std.meta : Erase, staticIndexOf; 4802 enum hasASCII = staticIndexOf!(1, Sizes) >= 0; 4803 alias UniSizes = Erase!(1, Sizes); 4804 4805 //generate dispatch code sequence for unicode parts 4806 static auto genDispatch() 4807 { 4808 string code; 4809 foreach (size; UniSizes) 4810 code ~= format(q{ 4811 if ((ch & ~leadMask!%d) == encMask!(%d)) 4812 return lookup!(%d, mode)(inp); 4813 else 4814 }, size, size, size); 4815 static if (Sizes.length == 4) //covers all code unit cases 4816 code ~= "{ badEncoding(); return false; }"; 4817 else 4818 code ~= "return false;"; //may be just fine but not covered 4819 return code; 4820 } 4821 enum dispatch = genDispatch(); 4822 4823 public bool match(Range)(ref Range inp) const 4824 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4825 !isDynamicArray!Range) 4826 { 4827 enum mode = Mode.skipOnMatch; 4828 assert(!inp.empty); 4829 immutable ch = inp[0]; 4830 static if (hasASCII) 4831 { 4832 if (ch < 0x80) 4833 { 4834 immutable r = tab!1[ch]; 4835 if (r) 4836 inp.popFront(); 4837 return r; 4838 } 
4839 else 4840 mixin(dispatch); 4841 } 4842 else 4843 mixin(dispatch); 4844 } 4845 4846 static if (Sizes.length == 4) // can skip iff can detect all encodings 4847 { 4848 public bool skip(Range)(ref Range inp) const 4849 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4850 !isDynamicArray!Range) 4851 { 4852 enum mode = Mode.alwaysSkip; 4853 assert(!inp.empty); 4854 auto ch = inp[0]; 4855 static if (hasASCII) 4856 { 4857 if (ch < 0x80) 4858 { 4859 inp.popFront(); 4860 return tab!1[ch]; 4861 } 4862 else 4863 mixin(dispatch); 4864 } 4865 else 4866 mixin(dispatch); 4867 } 4868 } 4869 4870 public bool test(Range)(ref Range inp) const 4871 if (isRandomAccessRange!Range && is(ElementType!Range : char) && 4872 !isDynamicArray!Range) 4873 { 4874 enum mode = Mode.neverSkip; 4875 assert(!inp.empty); 4876 auto ch = inp[0]; 4877 4878 static if (hasASCII) 4879 { 4880 if (ch < 0x80) 4881 return tab!1[ch]; 4882 else 4883 mixin(dispatch); 4884 } 4885 else 4886 mixin(dispatch); 4887 } 4888 4889 bool match(C)(ref C[] str) const 4890 if (isSomeChar!C) 4891 { 4892 return fwdStr!"match"(str); 4893 } 4894 4895 bool skip(C)(ref C[] str) const 4896 if (isSomeChar!C) 4897 { 4898 return fwdStr!"skip"(str); 4899 } 4900 4901 bool test(C)(ref C[] str) const 4902 if (isSomeChar!C) 4903 { 4904 return fwdStr!"test"(str); 4905 } 4906 4907 mixin ForwardStrings; 4908 } 4909 4910 struct Impl(Sizes...) 
4911 { 4912 import std.meta : allSatisfy, staticMap; 4913 static assert(allSatisfy!(validSize, Sizes), 4914 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4915 private: 4916 //pick tables for chosen sizes 4917 alias OurTabs = staticMap!(Table, Sizes); 4918 OurTabs tables; 4919 mixin DefMatcher; 4920 //static disptach helper UTF size ==> table 4921 alias tab(int i) = tables[i - 1]; 4922 4923 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 4924 { 4925 return CherryPick!(Impl, SizesToPick)(&this); 4926 } 4927 4928 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4929 { 4930 import std.range : popFrontN; 4931 if (inp.length < size) 4932 { 4933 badEncoding(); 4934 return false; 4935 } 4936 char[size] needle = void; 4937 needle[0] = leadMask!size & inp[0]; 4938 static foreach (i; 1 .. size) 4939 { 4940 needle[i] = truncate(inp[i]); 4941 } 4942 //overlong encoding checks 4943 static if (size == 2) 4944 { 4945 //0x80-0x7FF 4946 //got 6 bits in needle[1], must use at least 8 bits 4947 //must use at least 2 bits in needle[1] 4948 if (needle[0] < 2) badEncoding(); 4949 } 4950 else static if (size == 3) 4951 { 4952 //0x800-0xFFFF 4953 //got 6 bits in needle[2], must use at least 12bits 4954 //must use 6 bits in needle[1] or anything in needle[0] 4955 if (needle[0] == 0 && needle[1] < 0x20) badEncoding(); 4956 } 4957 else static if (size == 4) 4958 { 4959 //0x800-0xFFFF 4960 //got 2x6=12 bits in needle[2 .. 
// Splits a code point above the BMP (0x1_0000 .. 0x10_FFFF) into the two
// 10-bit payloads of its UTF-16 surrogate pair, without the surrogate
// marker bits: ret[0] holds the high 10 bits and ret[1] the low 10 bits of
// `ch - 0x1_0000`. This is the same shape as the needle built in lookupUni
// (`inp[0] & 0x3ff`, `inp[1] & 0x3ff`) and fits the 6+4 / 4+6 bit split
// declared in UniSpec.
auto encode2(dchar ch)
{
    ch -= 0x1_0000;
    assert(ch <= 0xF_FFFF);
    wchar[2] ret;
    //do not put surrogate bits, they are sliced off
    ret[0] = cast(wchar)(ch >> 10);
    // keep exactly 10 bits: a wider mask would repeat bits 10-11, which are
    // already stored in ret[0], and overflow the 10-bit field of ret[1]
    ret[1] = cast(wchar)(ch & 0x3FF);
    return ret;
}
    /*
        UTF-16 matcher storage plus the surrogate-aware lookup.

        `Sizes` lists the code-unit lengths (1 and/or 2) this instance covers;
        they are OR-ed into `sizeFlags`. Bit 1 enables the `ascii` and `bmp`
        tries, bit 2 enables the `uni` trie. The public match/skip/test entry
        points come from `mixin DefMatcher`, which relies on `sizeFlags`,
        `ascii` and `lookupUni` declared here.
    */
    struct Impl(Sizes...)
    if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
        // Bit set of the supported lengths: 1, 2 or 3 (both).
        static if (Sizes.length > 1)
            enum sizeFlags = Sizes[0] | Sizes[1];
        else
            enum sizeFlags = Sizes[0];

        // Tries for code points encoded as a single code unit.
        static if (sizeFlags & 1)
        {
            Ascii ascii;
            Bmp bmp;
        }
        // Trie for code points encoded as a surrogate pair.
        static if (sizeFlags & 2)
        {
            Uni uni;
        }
        mixin DefMatcher;

        // Returns a CherryPick view over this matcher; it keeps a pointer to
        // `this`, so the view must not outlive the matcher it came from.
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        /*
            Classifies the code point at the front of `inp` and advances `inp`
            according to `mode`:
            - Mode.alwaysSkip:  consume the code point, then report the lookup;
            - Mode.skipOnMatch: consume only when the lookup succeeds;
            - otherwise:        never consume.
            Malformed input (a leading low surrogate, a truncated pair, or a
            high surrogate not followed by a low one) goes to badEncoding(),
            which is defined outside this block - presumably it throws.
        */
        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            // Rebase so high surrogates land in [0, 0x3FF] and low surrogates
            // in [0x400, 0x7FF]; every other unit wraps to a value above 0x7FF.
            wchar x = cast(wchar)(inp[0] - 0xD800);
            //not a high surrogate
            if (x > 0x3FF)
            {
                //low surrogate
                if (x <= 0x7FF) badEncoding();
                static if (sizeFlags & 1)
                {
                    auto ch = inp[0];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFront();
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (bmp[ch])
                        {
                            inp.popFront();
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return bmp[ch];
                }
                else //skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                import std.range : popFrontN;
                static if (sizeFlags & 2)
                {
                    // A high surrogate needs a second code unit...
                    if (inp.length < 2)
                        badEncoding();
                    wchar y = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if (y > 0x3FF)
                        badEncoding();
                    // Key is the 10 payload bits of each surrogate.
                    // NOTE(review): the encoder tail visible just above this
                    // struct masks the low half with 0xFFF, not 0x3FF - confirm
                    // the trie index functions make both keys agree.
                    wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (uni[needle])
                        {
                            inp.popFrontN(2);
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return uni[needle];
                }
                else //ditto
                    return false;
            }
        }
    }
// Wraps a slice of code units together with a read position, so that moving
// forward is a plain index increment instead of re-slicing the array.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;     // underlying code units; popBack shrinks it from the end
        size_t idx;  // index of the current front within `str`

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[str.length - 1];
        }

        void popFront()
        {
            ++idx;
        }

        void popBack()
        {
            str = str[0 .. str.length - 1];
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        @property bool empty()
        {
            return str.length == idx;
        }

        @property auto save()
        {
            return this;
        }

        // Indexing is relative to the current front.
        auto opIndex(size_t i)
        {
            return str[idx + i];
        }

        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        // Slice bounds are relative to the current front; the result keeps
        // the original prefix and only moves its own read position.
        auto opSlice(size_t a, size_t b)
        {
            immutable newStart = idx + a;
            immutable newEnd = idx + b;
            return Decoder(str[0 .. newEnd], newStart);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5333 assert(!asc.match(codec)); 5334 assert(!utf8.test(codec)); 5335 assert(!utf8.skip(codec)); 5336 assert(codec.idx == 3); 5337 5338 // space 5339 assert(!asc.test(codec)); 5340 assert(!utf8.test(codec)); 5341 assert(!utf8.skip(codec)); 5342 assert(codec.idx == 4); 5343 5344 assert(utf8.test(codec)); 5345 foreach (i; 0 .. 7) 5346 { 5347 assert(!asc.test(codec)); 5348 assert(uni.test(codec)); 5349 assert(utf8.skip(codec)); 5350 } 5351 assert(!utf8.test(codec)); 5352 assert(!utf8.skip(codec)); 5353 5354 //the same with match where applicable 5355 codec = rs.decoder; 5356 assert(utf8.match(codec)); 5357 assert(codec.idx == 1); 5358 assert(utf8.match(codec)); 5359 assert(codec.idx == 2); 5360 assert(!utf8.match(codec)); 5361 assert(codec.idx == 2); 5362 assert(!utf8.skip(codec)); 5363 assert(!utf8.skip(codec)); 5364 5365 foreach (i; 0 .. 7) 5366 { 5367 assert(!asc.test(codec)); 5368 assert(utf8.test(codec)); 5369 assert(utf8.match(codec)); 5370 } 5371 auto i = codec.idx; 5372 assert(!utf8.match(codec)); 5373 assert(codec.idx == i); 5374 } 5375 5376 pure @system unittest 5377 { 5378 import std.range : stride; 5379 static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe 5380 { 5381 bool t = m.test(r); 5382 auto save = r.idx; 5383 assert(t == m.match(r)); 5384 assert(r.idx == save || t); //ether no change or was match 5385 r.idx = save; 5386 static if (is(typeof(m.skip(r)))) 5387 { 5388 assert(t == m.skip(r)); 5389 assert(r.idx != save); //always changed 5390 r.idx = save; 5391 } 5392 return t; 5393 } 5394 auto utf16 = utfMatcher!wchar(unicode.L); 5395 auto bmp = utf16.subMatcher!1; 5396 auto nonBmp = utf16.subMatcher!1; 5397 auto utf8 = utfMatcher!char(unicode.L); 5398 auto ascii = utf8.subMatcher!1; 5399 auto uni2 = utf8.subMatcher!2; 5400 auto uni3 = utf8.subMatcher!3; 5401 auto uni24 = utf8.subMatcher!(2,4); 5402 foreach (ch; unicode.L.byCodepoint.stride(3)) 5403 { 5404 import std.utf : encode; 5405 char[4] buf; 5406 wchar[2] buf16; 5407 auto len = 
// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): "\0x00" is the escape \0 followed by the text "x00",
    // presumably "\x00" was meant. Left as is: every sequence is already
    // malformed by its first one or two bytes.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // The hex dump of the offending bytes is the assertion message.
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    // A lone high surrogate and a lone low surrogate.
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    Any other `level` is rejected at compile time.

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.
+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // Each argument list sums to 21; presumably these are the bits of a
    // code point consumed at each trie level.
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // Also reached for level 0, so the message must not claim "> 4".
        static assert(false,
            "Sorry, toTrie supports only levels 1, 2, 3 and 4, use codepointSetTrie directly");
}
/*
    Depending on the form of the passed argument `bitSizeOf` returns
    the amount of bits required to represent a given type
    or a return type of a given functor.

    The single argument is resolved in this order:
    1. if it has a `bitSize` member usable as a `size_t`, that value is used;
    2. otherwise, if its `ReturnType` is an instance of `BitPacked`, the
       result is `bitSizeOf` of that return type (which then hits case 1);
    3. otherwise the full storage width of the type, `T.sizeof * 8`.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias T = Args[0];
    // Case 1: an explicit bit width is declared on the argument.
    static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
    {
        enum bitSizeOf = T.bitSize;
    }
    // Case 2: the argument yields a BitPacked value; `dummy`, `U` and `bits`
    // exist only to let the `is` expression pattern-match.
    else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
    {
        enum bitSizeOf = bitSizeOf!(ReturnType!T);
    }
    // Case 3: no width information, assume every bit of T is significant.
    else
    {
        enum bitSizeOf = T.sizeof*8;
    }
}
/*
    Callable that extracts bits [from, to) of an integral value, i.e.
    `(x >> from)` masked down to `to - from` bits. It exposes `bitSize`
    itself, so it can be handed to the `Trie` template as an index
    function without going through assumeSize.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to - from;

    static auto opCall(T)(T x)
    out (result)
    {
        // The extracted slice always fits in bitSize bits.
        assert(result < (1 << bitSize));
    }
    do
    {
        static assert(from < to);
        enum mask = (1 << bitSize) - 1;
        static if (from != 0)
            return (x >> from) & mask;
        else
            return x & mask; // nothing to shift out
    }
}
t.table.length!i]); 5663 } 5664 writeln("---------------------------"); 5665 } 5666 } 5667 //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2) 5668 // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; }); 5669 // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; }); 5670 alias Set = CodepointSet; 5671 auto set = Set('A','Z','a','z'); 5672 auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array 5673 for (int a='a'; a<'z';a++) 5674 assert(trie[a]); 5675 for (int a='A'; a<'Z';a++) 5676 assert(trie[a]); 5677 for (int a=0; a<'A'; a++) 5678 assert(!trie[a]); 5679 for (int a ='Z'; a<'a'; a++) 5680 assert(!trie[a]); 5681 trieStats(trie); 5682 5683 auto redundant2 = Set( 5684 1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111); 5685 auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval); 5686 trieStats(trie2); 5687 foreach (e; redundant2.byCodepoint) 5688 assert(trie2[e], text(cast(uint) e, " - ", trie2[e])); 5689 foreach (i; 0 .. 1024) 5690 { 5691 assert(trie2[i] == (i in redundant2)); 5692 } 5693 5694 5695 auto redundant3 = Set( 5696 2, 4, 6, 8, 16, 5697 2+16, 4+16, 16+6, 16+8, 16+16, 5698 2+32, 4+32, 32+6, 32+8, 5699 ); 5700 5701 enum max3 = 256; 5702 // sliceBits 5703 auto trie3 = buildTrie!(bool, uint, max3, 5704 sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4) 5705 )(redundant3.byInterval); 5706 trieStats(trie3); 5707 foreach (i; 0 .. max3) 5708 assert(trie3[i] == (i in redundant3), text(cast(uint) i)); 5709 5710 auto redundant4 = Set( 5711 10, 64, 64+10, 128, 128+10, 256, 256+10, 512, 5712 1000, 2000, 3000, 4000, 5000, 6000 5713 ); 5714 enum max4 = 2^^16; 5715 auto trie4 = buildTrie!(bool, size_t, max4, 5716 sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6) 5717 )(redundant4.byInterval); 5718 foreach (i; 0 .. 
/*
    Index function for `Trie`-like structures keyed by arrays: yields the
    element at position `idx` of the key, tagged (through assumeSize) as
    needing as many bits as `T` occupies.
*/
template useItemAt(size_t idx, T)
if (is(T : dchar) || isIntegral!T)
{
    // Width of one key element in bits.
    enum size_t itemBits = T.sizeof * 8;

    size_t itemAt(const scope T[] items)
    {
        return items[idx];
    }

    alias useItemAt = assumeSize!(itemAt, itemBits);
}
/*
    Three-way comparison of two property names using loose matching:
    both names are lower-cased with std.ascii.toLower and every white space
    character, '-' and '_' is ignored before the remaining characters are
    compared with `cmp`.

    Returns: 0 when the names are equivalent, a negative value when `a`
    orders before `b`, a positive value otherwise.
*/
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : filter, map;
    import std.ascii : toLower;

    // True for characters that take part in the comparison.
    static bool significant(dchar c)
    {
        if (c == '-' || c == '_')
            return false;
        return !c.isWhite;
    }

    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
/*
    Reads one variable-length value from `arr` starting at `idx` and moves
    `idx` past it.

    Layout, as decoded here:
    - lead byte with the top bit clear: the value itself, in [0 .. 127];
    - otherwise the low five bits of the lead byte are the most significant
      bits, followed by one trailing byte (bit 5 clear) or two (bit 5 set),
      most significant first.

    Throws: Exception if the trailing bytes run past the end of `arr`;
    `idx` then points just after the lead byte.
*/
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;

    immutable lead = arr[idx++];
    if ((lead & 0x80) == 0)
        return lead;

    immutable size_t trailing = (lead & 0x20) ? 2 : 1;
    enforce(idx + trailing <= arr.length, "bad code point interval encoding");

    uint result = lead & 0x1F;
    for (immutable stop = idx + trailing; idx != stop; ++idx)
        result = (result << 8) | arr[idx];
    return result;
}
/*
    Forward range that lazily decodes a stream written by compressIntervals
    into `CodepointInterval`s.

    Every stored number is a delta: an interval's start is relative to the
    end of the previous interval, its end is relative to its own start.
    A stream that runs out right after a start means the interval extends
    to `lastDchar + 1`. Exhaustion is signalled by `_idx == size_t.max`.
*/
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;
    size_t _idx;
    CodepointInterval _front;

    // Decodes the first interval eagerly (or becomes empty right away).
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        // Nothing left to decode: switch to the "empty" marker.
        if (_idx == _stream.length)
        {
            _idx = size_t.max;
            return;
        }

        immutable uint previousEnd = _front[1];
        immutable uint start = previousEnd + decompressFrom(_stream, _idx);
        _front[0] = start;
        // odd length ---> till the end
        _front[1] = (_idx == _stream.length)
            ? lastDchar + 1
            : start + decompressFrom(_stream, _idx);
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property DecompressedIntervals save() return scope
    {
        return this;
    }
}
// Looks `name` up in `table` with findUnicodeSet and, when an entry exists,
// stores the set built from that entry's compressed data into `dest`.
// Returns true on success; on failure `dest` is left untouched.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    immutable pos = findUnicodeSet!table(name);
    if (pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
// CTFE-only helper for checking property names at compile-time.
// Returns true when `name` matches (per comparePropertyName, so ignoring
// case, white space, '-' and '_') one of the names that loadProperty
// assembles by hand rather than reading from the generated tables.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    import std.algorithm.searching : find;
    // Keep in sync with the hand-written branches of loadProperty.
    // "C"/"Other" were missing although loadProperty handles them.
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "C", "Other",
        "Graphical",
        "any",
        "ascii"
    ];
    auto x = find!(x => comparePropertyName(x, name) == 0)(names);
    return !x.empty;
}
@property size_t length(){ return data.length; } 6166 6167 void push(T val){ data ~= val; } 6168 6169 @trusted T pop() 6170 { 6171 assert(!empty); 6172 auto val = data[$ - 1]; 6173 data = data[0 .. $ - 1]; 6174 if (!__ctfe) 6175 cast(void) data.assumeSafeAppend(); 6176 return val; 6177 } 6178 6179 @property ref T top() 6180 { 6181 assert(!empty); 6182 return data[$ - 1]; 6183 } 6184 } 6185 6186 //test if a given string starts with hex number of maxDigit that's a valid codepoint 6187 //returns it's value and skips these maxDigit chars on success, throws on failure 6188 package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit) 6189 { 6190 import std.exception : enforce; 6191 //std.conv.parse is both @system and bogus 6192 uint val; 6193 for (int k = 0; k < maxDigit; k++) 6194 { 6195 enforce(!str.empty, "incomplete escape sequence"); 6196 //accepts ascii only, so it's OK to index directly 6197 immutable current = str.front; 6198 if ('0' <= current && current <= '9') 6199 val = val * 16 + current - '0'; 6200 else if ('a' <= current && current <= 'f') 6201 val = val * 16 + current -'a' + 10; 6202 else if ('A' <= current && current <= 'F') 6203 val = val * 16 + current - 'A' + 10; 6204 else 6205 throw new Exception("invalid escape sequence"); 6206 str.popFront(); 6207 } 6208 enforce(val <= 0x10FFFF, "invalid codepoint"); 6209 return val; 6210 } 6211 6212 @safe unittest 6213 { 6214 import std.algorithm.searching : canFind; 6215 import std.exception : collectException; 6216 string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; 6217 string[] hex = [ "01", "ff", "00af", "10FFFF" ]; 6218 int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; 6219 foreach (v; non_hex) 6220 assert(collectException(parseUniHex(v, v.length)).msg 6221 .canFind("invalid escape sequence")); 6222 foreach (i, v; hex) 6223 assert(parseUniHex(v, v.length) == value[i]); 6224 string over = "0011FFFF"; 6225 assert(collectException(parseUniHex(over, over.length)).msg 6226 .canFind("invalid codepoint")); 6227 
}

// Returns `set` extended with every simple case folding of each of its
// cased letters (members of unicode.LC).
auto caseEnclose(CodepointSet set)
{
    auto cased = set & unicode.LC;
    foreach (dchar ch; cased.byCodepoint)
    {
        foreach (c; simpleCaseFoldings(ch))
            set |= c;
    }
    return set;
}

/+
    fetch codepoint set corresponding to a name (InBlock or binary property)
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated, bool casefold) @safe
{
    CodepointSet s = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        s = caseEnclose(s);
    if (negated)
        s = s.inverted;
    return s;
}

// Parser for regex-style '[...]' character class expressions over `range`.
// When `casefold_` is set, every character and character range added to a
// set is expanded with its simple case foldings.
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate, Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    //
    // Returns the set collected so far together with the operator that
    // terminated the term (Operator.None when the term ended on ']').
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // adds a single code point, honouring the casefold flag
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto foldings = simpleCaseFoldings(ch);
                foreach (v; foldings)
                    set |= v;
            }
            else
                set |= ch;
        }

        // adds the inclusive range [lo, hi], honouring the casefold flag;
        // shared by every range form so that 'a-z' and 'a-\x7A' agree
        void addRangeWithFlags(ref CodepointSet set, uint lo, uint hi)
        {
            enforce(lo <= hi, "inverted range");
            if (casefold_)
            {
                for (uint ch = lo; ch <= hi; ch++)
                    addWithFlags(set, ch);
            }
            else
                set.add(lo, hi + 1);
        }

        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // the pending char must be case-folded like in every
                    // other branch (was a plain `set |= last`)
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses one complete '[...]' expression (nested classes and the
    // '||', '--', '~~', '&&' operators included) starting at the current
    // position and returns the resulting set.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        // applies `op` to the operand(s) on top of `stack`;
        // returns false for operators that cannot be applied (Open, None)
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // pops and applies operators while `cond` holds for the top one
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                auto pair = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}

/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
This includes fuzzy matching of names, so that
    'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
    and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
    import std.exception : enforce;
    /**
        Performs the lookup of set of $(CODEPOINTS)
        with compile-time correctness checking.
        This short-cut version combines 3 searches:
        across blocks, scripts, and common binary properties.

        Note that since scripts and blocks overlap the
        usual trick to disambiguate is used - to get a block use
        `unicode.InBlockName`, to search a script
        use `unicode.ScriptName`.

        See_Also: $(LREF block), $(LREF script)
        and (not included in this search) $(LREF hangulSyllableType).
    */

    static @property auto opDispatch(string name)() pure
    {
        static if (findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }

    ///
    @safe unittest
    {
        import std.exception : collectException;
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
    }

    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where `name`
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        return loadAny(name);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply `unicode.block.BlockName` notation.

        See $(S_LINK Unicode properties, table of properties) for available sets.
        See_Also: $(S_LINK Unicode properties, table of properties).
    */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        mixin SetSearcher!(blocks.tab, "block");
    }

    ///
    @safe unittest
    {
        // use .block for explicitness
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        mixin SetSearcher!(scripts.tab, "script");
    }

    ///
    @safe unittest
    {
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        // (U+060E ARABIC POETIC VERSE SIGN, spelled as an escape so the
        // literal survives tools that mangle right-to-left text)
        assert(arabicBlock['\u060E']);
        assert(arabicScript['\u060E']);
        // but they are different
        assert(arabicBlock != arabicScript);
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }

    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - `unicode.propertyName.propertyValue` for compile-time
        checked access and `unicode.propertyName(propertyValue)`
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }

    ///
    @safe unittest
    {
        // L here is syllable type not Letter as in unicode.L short-cut
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that some leading vowels are present
        foreach (vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        assert(leadingVowel == unicode.hangulSyllableType.L);
    }

    //parse control code of form \cXXX, c assumed to be the current symbol
    // The letter itself is left as the current symbol; the caller skips it.
    static package(std) dchar parseControlCode(Parser)(ref Parser p)
    {
        with(p)
        {
            popFront();
            enforce(!empty, "Unfinished escape sequence");
            enforce(('a' <= front && front <= 'z')
                || ('A' <= front && front <= 'Z'),
                "Only letters are allowed after \\c");
            return front & 0x1f;
        }
    }

    //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
    //\ - assumed to be processed, p - is current
    //
    // On success the parser is advanced past the whole property spec.
    // Throws Exception when the spec is unterminated, too long, contains
    // non-ASCII characters or names an unknown/empty set.
    static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
        bool negated, bool casefold)
    {
        static import std.ascii;
        with(p)
        {
            enum MAX_PROPERTY = 128;
            char[MAX_PROPERTY] result;
            uint k = 0;
            popFront();
            enforce(!empty, "eof parsing unicode property spec");
            if (front == '{')
            {
                popFront();
                while (k < MAX_PROPERTY && !empty && front !='}'
                    && front !=':')
                {
                    // same restriction as the single char form below: a
                    // non-ASCII code point would be silently truncated by
                    // the cast to char (e.g. U+014C would turn into 'L')
                    enforce(front < 0x80, "invalid property name");
                    if (front != '-' && front != ' ' && front != '_')
                        result[k++] = cast(char) std.ascii.toLower(front);
                    popFront();
                }
                enforce(k != MAX_PROPERTY, "invalid property name");
                // the loop may also stop at the end of input, in which case
                // `front` must not be touched
                enforce(!empty && front == '}', "} expected ");
            }
            else
            {//single char properties e.g.: \pL, \pN ...
                enforce(front < 0x80, "invalid property name");
                result[k++] = cast(char) front;
            }
            auto s = getUnicodeSet(result[0 .. k], negated, casefold);
            enforce(!s.empty, "unrecognized unicode property spec");
            popFront();
            return s;
        }
    }

    /**
        Parse unicode codepoint set from given `range` using standard regex
        syntax '[...]'. The range is advanced skipping over regex set definition.
        `casefold` parameter determines if the set should be casefolded - that is
        include both lower and upper case versions for any letters in the set.
    */
    static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
    if (isInputRange!Range && is(ElementType!Range : dchar))
    {
        auto usParser = UnicodeSetParser!Range(range, casefold);
        auto set = usParser.parseSet();
        range = usParser.range;
        return set;
    }

    ///
    @safe unittest
    {
        import std.uni : unicode;
        string pat = "[a-zA-Z0-9]hello";
        auto set = unicode.parseSet(pat);
        // check some of the codepoints
        assert(set['a'] && set['A'] && set['9']);
        assert(pat == "hello");
    }

private:
    alias ucmp = comparePropertyName;

    // Compile-time counterpart of loadAny: true when `name` is a composite
    // property name, a property/script table entry, or "In" + block name.
    static bool findAny(string name)
    {
        import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
        // the length guard mirrors loadAny: without it a one-character
        // unknown name fails on the slice instead of yielding `false`
        return isPrettyPropertyName(name)
            || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && findSetName!(blocks.tab)(name[2..$]));
    }

    // Loads the set called `name`, trying composite/binary properties, then
    // scripts, then ("In"-prefixed) blocks. Throws Exception if none match.
    static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
    {
        import std.conv : to;
        import std.internal.unicode_tables : blocks, scripts; // generated file
        Set set;
        immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && loadUnicodeSet!(blocks.tab)(name[2..$], set));
        if (loaded)
            return set;
        throw new Exception("No unicode set by name "~name.to!string()~" was found.");
    }

    // FIXME: re-disable once the compiler is fixed
    // Disabled to prevent the mistake of creating instances of this pseudo-struct.
    //@disable ~this();
}

@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    assert(unicode("InHebrew") == asSet(blocks.Hebrew));
    assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
    assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
}

@safe unittest
{
    import std.exception : assertThrown;
    // an unterminated \p{...} is a regular parse error
    string unterminated = "[\\p{Latin";
    assertThrown(unicode.parseSet(unterminated));
    // non-ASCII property names are rejected, not truncated to ASCII
    string nonAscii = "[\\p{\u014C}]";
    assertThrown(unicode.parseSet(nonAscii));
}

enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally

// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
// Use combined trie instead of checking for '\r' | '\n' | ccTrie,
//   or extend | '\u200D' separately

// True for code points in U+1F1E6 .. U+1F1FF (inclusive).
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
}

// Our grapheme decoder is a state machine, this is list of all possible
// states before each code point.
private enum GraphemeState
{
    Start,    // nothing consumed yet
    CR,       // previous code point was '\r'
    RI,       // exactly one regional indicator consumed
    L,        // inside a Hangul sequence, after a leading consonant (L)
    V,        // inside a Hangul sequence, after an LV syllable or a vowel (V)
    LVT,      // inside a Hangul sequence, after an LVT syllable or a trailing consonant (T)
    Emoji,    // inside an emoji sequence, last code point was not a ZWJ
    EmojiZWJ, // inside an emoji sequence, last code point was a ZWJ
    Prepend,  // previous code point was a Prepend character
    End       // only extending code points may still join the cluster
}

// Message values whether end of grapheme is reached
private enum TransformRes
{
    // No, unless the source range ends here
    // (GB2 - break at end of text, unless text is empty)
    goOn,
    redo, // Run last character again with new state
    retInclude, // Yes, after the just iterated character
    retExclude // Yes, before the just iterated character
}

// The logic of the grapheme decoding is all here
// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29
// Note, getting GB1 (break at start of text, unless text is empty) right
// relies on the user starting grapheme walking from beginning of the text, and
// not attempting to walk an empty text.
//
// The table is indexed by the current GraphemeState. Each entry receives the
// state by reference (and may change it) together with the code point under
// inspection, and answers with a TransformRes telling the driver what to do.
private enum TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // GB4. Break after controls.
        if (graphemeControlTrie[ch] || ch == '\n')
            return TransformRes.retInclude;

        // Classify the first code point; the order of the tests matters
        // because the first matching class wins.
        with (GraphemeState) state =
            ch == '\r' ? CR :
            isRegionalIndicator(ch) ? RI :
            isHangL(ch) ? L :
            hangLV[ch] || isHangV(ch) ? V :
            hangLVT[ch] || isHangT(ch) ? LVT :
            prependTrie[ch] ? Prepend :
            xpictoTrie[ch] ? Emoji :
            End;

        // No matter what we encountered, we always include the
        // first code point in the grapheme.
        return TransformRes.goOn;
    },

    // GB3, GB4. Do not break between a CR and LF.
    // Otherwise, break after controls.
    GraphemeState.CR: (ref state, ch) => ch == '\n' ?
        TransformRes.retInclude :
        TransformRes.retExclude,

    // GB12 - GB13. Do not break within emoji flag sequences.
    // That is, do not break between regional indicator (RI) symbols if
    // there is an odd number of RI characters before the break point.
    // This state applies if one and only one RI code point has been
    // encountered.
    GraphemeState.RI: (ref state, ch)
    {
        state = GraphemeState.End;

        return isRegionalIndicator(ch) ?
            TransformRes.goOn :
            TransformRes.redo;
    },

    // GB6. Do not break Hangul syllable sequences.
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;
        else if (isHangV(ch) || hangLV[ch])
        {
            state = GraphemeState.V;
            return TransformRes.goOn;
        }
        else if (hangLVT[ch])
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB7. Do not break Hangul syllable sequences.
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;
        else if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8. Do not break Hangul syllable sequences.
    GraphemeState.LVT: (ref state, ch)
    {
        if (isHangT(ch))
            return TransformRes.goOn;

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // NOT a ZWJ.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        // the ZWJ test below relies on ZWJ not being caught by the
        // extend check above
        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        // Spacing marks may still follow at the end; they are
        // not allowed in the middle of emoji sequences, so they
        // are left to the End state.
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // a ZWJ.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        state = GraphemeState.Emoji;
        if (xpictoTrie[ch])
            return TransformRes.goOn;
        return TransformRes.redo;
    },

    // GB9b. Do not break after Prepend characters.
    GraphemeState.Prepend: (ref state, ch)
    {
        // GB5. Break before controls.
        if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
            return TransformRes.retExclude;

        // anything else is classified as if it started the cluster
        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9, GB9a. Do not break before extending characters, ZWJ
    // or SpacingMarks.
    // GB999. Otherwise, break everywhere.
    GraphemeState.End: (ref state, ch)
        => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
            TransformRes.retExclude :
            TransformRes.goOn
];

// Driver of the state machine above. Consumes exactly one grapheme cluster
// from the front of `range`. With getValue == true the consumed code points
// are collected and returned as a Grapheme; otherwise nothing is returned
// and the range is merely advanced.
template genericDecodeGrapheme(bool getValue)
{
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        dchar ch;

        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
    outer:
        while (!range.empty)
        {
            ch = range.front;

        rerun:
            final switch (graphemeTransforms[state](state, ch))
                with(TransformRes)
            {
            case goOn:
                // ch belongs to the cluster, keep going
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                continue;

            case redo:
                // the transform changed `state`; ask again for the same ch
                goto rerun;

            case retInclude:
                // ch is the last code point of the cluster
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                break outer;

            case retExclude:
                // ch starts the next cluster and stays in the range
                break outer;
            }
        }

        static if (getValue)
            return grapheme;
    }
}

public: // Public API continues

/++
    Computes the length of grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = type that is implicitly convertible to `dchars`
        input = array of grapheme clusters
        index = starting index into `input[]`

    Returns:
        length of grapheme cluster
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto src = input[index..$];
    auto n = src.length;
    // decoding advances `src`; the number of consumed code units is the stride
    genericDecodeGrapheme!(false)(src);
    return n - src.length;
}

///
@safe unittest
{
    // NOTE(review): index 1 needs an input of at least two code units; this
    // literal probably lost a space somewhere in transit - confirm against
    // the upstream file.
    assert(graphemeStride(" ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}

@safe unittest
{
    // Ensure that graphemeStride is usable from CTFE.
    enum c1 = graphemeStride("A", 0);
    static assert(c1 == 1);

    enum c2 = graphemeStride("A\u0301", 0);
    static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
}

// TODO: make this @nogc. Probably no big deal since the state machine is
// already GC-free.
@safe pure nothrow unittest
{
    // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
    assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2);
    // skier ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u2640€"d, 0) == 1);
    // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2);
    // skier ~ zero-width joiner ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3);
    // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner
    // ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4);
    // skier ~ zero-width joiner ~ '€'
    assert(graphemeStride("\u26F7\u200D€"d, 0) == 2);
    //'€' ~ zero-width joiner ~ skier
    assert(graphemeStride("€\u200D\u26F7"d, 0) == 2);
    // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two
    assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2);
    // Kaithi number sign ~ null
    assert(graphemeStride("\U000110BD\0"d, 0) == 1);
}

/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.

    For examples see the $(LREF Grapheme) below.

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    // Thin wrapper: `true` selects the variant of the shared decoder
    // that returns the cluster it consumed from `inp`.
    return genericDecodeGrapheme!true(inp);
}

@safe unittest
{
    import std.algorithm.comparison : equal;

    Grapheme gr;
    string s = " \u0020\u0308 ";
    gr = decodeGrapheme(s);
    assert(gr.length == 1 && gr[0] == ' ');
    gr = decodeGrapheme(s);
    assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
    s = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(s)[], "\u1100"));
    s = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(s)[], "\uAC01"));

    // Two Union Jacks of the Great Britain
    s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    assert(equal(decodeGrapheme(s)[], "\U0001F1EC\U0001F1E7"));
}

/++
    $(P Iterate a string by $(LREF Grapheme).)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        private Grapheme _front;

        @property bool empty()
        {
            // A zero-length cached cluster is the end-of-input marker
            // (see popFront).
            return !_front.length;
        }

        @property Grapheme front()
        {
            return _front;
        }

        void popFront()
        {
            // Decode the next cluster into the cache, or store the empty
            // Grapheme once the underlying range is exhausted.
            if (_range.empty)
                _front = Grapheme.init;
            else
                _front = _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                return Result(_range.save, _front);
            }
        }
    }

    auto graphemes = Result!(Range)(range);
    graphemes.popFront(); // prime the cache with the first cluster
    return graphemes;
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}

// For testing non-forward-range input ranges
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    bool empty() @property { return s.empty; }
    dchar front() @property { return s.front; }
    void popFront() { s.popFront(); }
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    auto reverse = "le\u0308on";
    assert(reverse.walkLength == 5);

    auto gReverse = reverse.byGrapheme;
    assert(gReverse.walkLength == 4);

    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(text.walkLength == 5);
        static assert(isForwardRange!(typeof(text)));

        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        assert(gText.array.retro.equal(gReverse));
    }}

    auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
    static
assert(!isForwardRange!(typeof(nonForwardRange)));
    assert(nonForwardRange.walkLength == 4);
}

// Issue 23474
@safe pure unittest
{
    import std.range.primitives : walkLength;
    assert(byGrapheme("\r\u0308").walkLength == 2);
}

/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // index of the current code point inside `_range.front`
        private size_t i = 0;

        @property bool empty()
        {
            return _range.empty;
        }

        @property dchar front()
        {
            return _range.front[i];
        }

        void popFront()
        {
            // Still inside the current grapheme: just advance the index.
            if (++i < _range.front.length)
                return;

            // Current grapheme exhausted: move on to the next one.
            _range.popFront();
            i = 0;
        }

        static if (isForwardRange!Range)
        {
            @property Result save()
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}

/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (isNarrowString!Range)
    {
        static struct Result
        {
            private Range _range;
            @property bool empty() { return _range.empty; }
            @property dchar front(){ return _range.front; }
            void popFront(){ _range.popFront; }
            @property auto save() { return Result(_range.save); }
            @property dchar back(){ return _range.back; }
            void popBack(){ _range.popBack; }
        }
        static
assert(isBidirectionalRange!(Result)); 7489 return Result(range); 7490 } 7491 else 7492 return range; 7493 } 7494 7495 /// 7496 @safe unittest 7497 { 7498 import std.array : array; 7499 import std.conv : text; 7500 import std.range : retro; 7501 7502 string s = "noe\u0308l"; // noël 7503 7504 // reverse it and convert the result to a string 7505 string reverse = s.byGrapheme 7506 .array 7507 .retro 7508 .byCodePoint 7509 .text; 7510 7511 assert(reverse == "le\u0308on"); // lëon 7512 } 7513 7514 @safe unittest 7515 { 7516 import std.algorithm.comparison : equal; 7517 import std.range.primitives : walkLength; 7518 import std.range : retro; 7519 assert("".byGrapheme.byCodePoint.equal("")); 7520 7521 string text = "noe\u0308l"; 7522 static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length)); 7523 7524 auto gText = InputRangeString(text).byGrapheme; 7525 static assert(!isForwardRange!(typeof(gText))); 7526 7527 auto cpText = gText.byCodePoint; 7528 static assert(!isForwardRange!(typeof(cpText))); 7529 7530 assert(cpText.walkLength == text.walkLength); 7531 7532 auto plainCp = text.byCodePoint; 7533 static assert(isForwardRange!(typeof(plainCp))); 7534 assert(equal(plainCp, text)); 7535 assert(equal(retro(plainCp.save), retro(text.save))); 7536 // Check that we still have length for dstring 7537 assert("абвгд"d.byCodePoint.length == 5); 7538 } 7539 7540 /++ 7541 $(P A structure designed to effectively pack $(CHARACTERS) 7542 of a $(CLUSTER). 7543 ) 7544 7545 $(P `Grapheme` has value semantics so 2 copies of a `Grapheme` 7546 always refer to distinct objects. In most actual scenarios a `Grapheme` 7547 fits on the stack and avoids memory allocation overhead for all but quite 7548 long clusters. 7549 ) 7550 7551 See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride) 7552 +/ 7553 @safe struct Grapheme 7554 { 7555 import std.exception : enforce; 7556 import std.traits : isDynamicArray; 7557 7558 public: 7559 /// Ctor 7560 this(C)(const scope C[] chars...) 
7561 if (is(C : dchar)) 7562 { 7563 this ~= chars; 7564 } 7565 7566 ///ditto 7567 this(Input)(Input seq) 7568 if (!isDynamicArray!Input 7569 && isInputRange!Input && is(ElementType!Input : dchar)) 7570 { 7571 this ~= seq; 7572 } 7573 7574 /// Gets a $(CODEPOINT) at the given index in this cluster. 7575 dchar opIndex(size_t index) const @nogc nothrow pure @trusted 7576 { 7577 assert(index < length); 7578 return read24(isBig ? ptr_ : small_.ptr, index); 7579 } 7580 7581 /++ 7582 Writes a $(CODEPOINT) `ch` at given index in this cluster. 7583 7584 Warning: 7585 Use of this facility may invalidate grapheme cluster, 7586 see also $(LREF Grapheme.valid). 7587 +/ 7588 void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted 7589 { 7590 assert(index < length); 7591 write24(isBig ? ptr_ : small_.ptr, ch, index); 7592 } 7593 7594 /// 7595 @safe unittest 7596 { 7597 auto g = Grapheme("A\u0302"); 7598 assert(g[0] == 'A'); 7599 assert(g.valid); 7600 g[1] = '~'; // ASCII tilda is not a combining mark 7601 assert(g[1] == '~'); 7602 assert(!g.valid); 7603 } 7604 7605 /++ 7606 Random-access range over Grapheme's $(CHARACTERS). 7607 7608 Warning: Invalidates when this Grapheme leaves the scope, 7609 attempts to use it then would lead to memory corruption. 7610 +/ 7611 SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return 7612 { 7613 return sliceOverIndexed(a, b, &this); 7614 } 7615 7616 /// ditto 7617 SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return 7618 { 7619 return sliceOverIndexed(0, length, &this); 7620 } 7621 7622 /// Grapheme cluster length in $(CODEPOINTS). 7623 @property size_t length() const @nogc nothrow pure 7624 { 7625 return isBig ? len_ : slen_ & 0x7F; 7626 } 7627 7628 /++ 7629 Append $(CHARACTER) `ch` to this grapheme. 7630 Warning: 7631 Use of this facility may invalidate grapheme cluster, 7632 see also `valid`. 
7633 7634 See_Also: $(LREF Grapheme.valid) 7635 +/ 7636 ref opOpAssign(string op)(dchar ch) @trusted 7637 { 7638 static if (op == "~") 7639 { 7640 import std.internal.memory : enforceRealloc; 7641 if (!isBig) 7642 { 7643 if (slen_ == small_cap) 7644 convertToBig();// & fallthrough to "big" branch 7645 else 7646 { 7647 write24(small_.ptr, ch, smallLength); 7648 slen_++; 7649 return this; 7650 } 7651 } 7652 7653 assert(isBig); 7654 if (len_ == cap_) 7655 { 7656 import core.checkedint : addu, mulu; 7657 bool overflow; 7658 cap_ = addu(cap_, grow, overflow); 7659 auto nelems = mulu(3, addu(cap_, 1, overflow), overflow); 7660 if (overflow) assert(0); 7661 ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems); 7662 } 7663 write24(ptr_, ch, len_++); 7664 return this; 7665 } 7666 else 7667 static assert(false, "No operation "~op~" defined for Grapheme"); 7668 } 7669 7670 /// 7671 @safe unittest 7672 { 7673 import std.algorithm.comparison : equal; 7674 auto g = Grapheme("A"); 7675 assert(g.valid); 7676 g ~= '\u0301'; 7677 assert(g[].equal("A\u0301")); 7678 assert(g.valid); 7679 g ~= "B"; 7680 // not a valid grapheme cluster anymore 7681 assert(!g.valid); 7682 // still could be useful though 7683 assert(g[].equal("A\u0301B")); 7684 } 7685 7686 /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme. 7687 ref opOpAssign(string op, Input)(scope Input inp) 7688 if (isInputRange!Input && is(ElementType!Input : dchar)) 7689 { 7690 static if (op == "~") 7691 { 7692 foreach (dchar ch; inp) 7693 this ~= ch; 7694 return this; 7695 } 7696 else 7697 static assert(false, "No operation "~op~" defined for Grapheme"); 7698 } 7699 7700 // This is not a good `opEquals`, but formerly the automatically generated 7701 // opEquals was used, which was inferred `@safe` because of bugzilla 20655: 7702 // https://issues.dlang.org/show_bug.cgi?id=20655 7703 // This `@trusted opEquals` is only here to prevent breakage. 
7704 bool opEquals(R)(const auto ref R other) const @trusted 7705 { 7706 return this.tupleof == other.tupleof; 7707 } 7708 7709 /++ 7710 True if this object contains valid extended grapheme cluster. 7711 Decoding primitives of this module always return a valid `Grapheme`. 7712 7713 Appending to and direct manipulation of grapheme's $(CHARACTERS) may 7714 render it no longer valid. Certain applications may chose to use 7715 Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property 7716 entirely. 7717 +/ 7718 @property bool valid()() /*const*/ 7719 { 7720 auto r = this[]; 7721 genericDecodeGrapheme!false(r); 7722 return r.length == 0; 7723 } 7724 7725 this(this) @nogc nothrow pure @trusted 7726 { 7727 import std.internal.memory : enforceMalloc; 7728 if (isBig) 7729 {// dup it 7730 import core.checkedint : addu, mulu; 7731 bool overflow; 7732 auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow); 7733 if (overflow) assert(0); 7734 7735 auto p = cast(ubyte*) enforceMalloc(raw_cap); 7736 p[0 .. raw_cap] = ptr_[0 .. 
raw_cap]; 7737 ptr_ = p; 7738 } 7739 } 7740 7741 ~this() @nogc nothrow pure @trusted 7742 { 7743 import core.memory : pureFree; 7744 if (isBig) 7745 { 7746 pureFree(ptr_); 7747 } 7748 } 7749 7750 7751 private: 7752 enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1); 7753 // "out of the blue" grow rate, needs testing 7754 // (though graphemes are typically small < 9) 7755 enum grow = 20; 7756 enum small_cap = small_bytes/3; 7757 enum small_flag = 0x80, small_mask = 0x7F; 7758 // 16 bytes in 32bits, should be enough for the majority of cases 7759 union 7760 { 7761 struct 7762 { 7763 ubyte* ptr_; 7764 size_t cap_; 7765 size_t len_; 7766 size_t padding_; 7767 } 7768 struct 7769 { 7770 ubyte[small_bytes] small_; 7771 ubyte slen_; 7772 } 7773 } 7774 7775 void convertToBig() @nogc nothrow pure @trusted 7776 { 7777 import std.internal.memory : enforceMalloc; 7778 static assert(grow.max / 3 - 1 >= grow); 7779 enum nbytes = 3 * (grow + 1); 7780 size_t k = smallLength; 7781 ubyte* p = cast(ubyte*) enforceMalloc(nbytes); 7782 for (int i=0; i<k; i++) 7783 write24(p, read24(small_.ptr, i), i); 7784 // now we can overwrite small array data 7785 ptr_ = p; 7786 len_ = slen_; 7787 assert(grow > len_); 7788 cap_ = grow; 7789 setBig(); 7790 } 7791 7792 void setBig() @nogc nothrow pure { slen_ |= small_flag; } 7793 7794 @property size_t smallLength() const @nogc nothrow pure 7795 { 7796 return slen_ & small_mask; 7797 } 7798 @property ubyte isBig() const @nogc nothrow pure 7799 { 7800 return slen_ & small_flag; 7801 } 7802 } 7803 7804 static assert(Grapheme.sizeof == size_t.sizeof*4); 7805 7806 7807 @safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw 7808 { 7809 import std.algorithm.comparison : equal; 7810 Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")]; 7811 assert(byGrapheme("ЮУЗ").equal(data[])); 7812 } 7813 7814 /// 7815 @safe unittest 7816 { 7817 import std.algorithm.comparison : equal; 7818 import std.algorithm.iteration : filter; 
7819 import std.range : isRandomAccessRange; 7820 7821 string bold = "ku\u0308hn"; 7822 7823 // note that decodeGrapheme takes parameter by ref 7824 auto first = decodeGrapheme(bold); 7825 7826 assert(first.length == 1); 7827 assert(first[0] == 'k'); 7828 7829 // the next grapheme is 2 characters long 7830 auto wideOne = decodeGrapheme(bold); 7831 // slicing a grapheme yields a random-access range of dchar 7832 assert(wideOne[].equal("u\u0308")); 7833 assert(wideOne.length == 2); 7834 static assert(isRandomAccessRange!(typeof(wideOne[]))); 7835 7836 // all of the usual range manipulation is possible 7837 assert(wideOne[].filter!isMark().equal("\u0308")); 7838 7839 auto g = Grapheme("A"); 7840 assert(g.valid); 7841 g ~= '\u0301'; 7842 assert(g[].equal("A\u0301")); 7843 assert(g.valid); 7844 g ~= "B"; 7845 // not a valid grapheme cluster anymore 7846 assert(!g.valid); 7847 // still could be useful though 7848 assert(g[].equal("A\u0301B")); 7849 } 7850 7851 @safe unittest 7852 { 7853 auto g = Grapheme("A\u0302"); 7854 assert(g[0] == 'A'); 7855 assert(g.valid); 7856 g[1] = '~'; // ASCII tilda is not a combining mark 7857 assert(g[1] == '~'); 7858 assert(!g.valid); 7859 } 7860 7861 @safe unittest 7862 { 7863 import std.algorithm.comparison : equal; 7864 import std.algorithm.iteration : map; 7865 import std.conv : text; 7866 import std.range : iota; 7867 7868 // not valid clusters (but it just a test) 7869 auto g = Grapheme('a', 'b', 'c', 'd', 'e'); 7870 assert(g[0] == 'a'); 7871 assert(g[1] == 'b'); 7872 assert(g[2] == 'c'); 7873 assert(g[3] == 'd'); 7874 assert(g[4] == 'e'); 7875 g[3] = 'Й'; 7876 assert(g[2] == 'c'); 7877 assert(g[3] == 'Й', text(g[3], " vs ", 'Й')); 7878 assert(g[4] == 'e'); 7879 assert(!g.valid); 7880 7881 g ~= 'ц'; 7882 g ~= '~'; 7883 assert(g[0] == 'a'); 7884 assert(g[1] == 'b'); 7885 assert(g[2] == 'c'); 7886 assert(g[3] == 'Й'); 7887 assert(g[4] == 'e'); 7888 assert(g[5] == 'ц'); 7889 assert(g[6] == '~'); 7890 assert(!g.valid); 7891 7892 Grapheme 
copy = g; 7893 copy[0] = 'X'; 7894 copy[1] = '-'; 7895 assert(g[0] == 'a' && copy[0] == 'X'); 7896 assert(g[1] == 'b' && copy[1] == '-'); 7897 assert(equal(g[2 .. g.length], copy[2 .. copy.length])); 7898 copy = Grapheme("АБВГДЕЁЖЗИКЛМ"); 7899 assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8])); 7900 copy ~= "xyz"; 7901 assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15])); 7902 assert(!copy.valid); 7903 7904 Grapheme h; 7905 foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"()) 7906 h ~= v; 7907 assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1))); 7908 } 7909 7910 /++ 7911 $(P Does basic case-insensitive comparison of `r1` and `r2`. 7912 This function uses simpler comparison rule thus achieving better performance 7913 than $(LREF icmp). However keep in mind the warning below.) 7914 7915 Params: 7916 r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7917 r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters 7918 7919 Returns: 7920 An `int` that is 0 if the strings match, 7921 <0 if `r1` is lexicographically "less" than `r2`, 7922 >0 if `r1` is lexicographically "greater" than `r2` 7923 7924 Warning: 7925 This function only handles 1:1 $(CODEPOINT) mapping 7926 and thus is not sufficient for certain alphabets 7927 like German, Greek and few others. 
7928 7929 See_Also: 7930 $(LREF icmp) 7931 $(REF cmp, std,algorithm,comparison) 7932 +/ 7933 int sicmp(S1, S2)(scope S1 r1, scope S2 r2) 7934 if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1) 7935 && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2)) 7936 { 7937 import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file 7938 import std.range.primitives : isInfinite; 7939 import std.utf : decodeFront; 7940 import std.traits : isDynamicArray; 7941 import std.typecons : Yes; 7942 static import std.ascii; 7943 7944 static if ((isDynamicArray!S1 || isRandomAccessRange!S1) 7945 && (isDynamicArray!S2 || isRandomAccessRange!S2) 7946 && !(isInfinite!S1 && isInfinite!S2) 7947 && __traits(compiles, 7948 { 7949 size_t s = size_t.sizeof / 2; 7950 r1 = r1[s .. $]; 7951 r2 = r2[s .. $]; 7952 })) 7953 {{ 7954 // ASCII optimization for dynamic arrays & similar. 7955 size_t i = 0; 7956 static if (isInfinite!S1) 7957 immutable end = r2.length; 7958 else static if (isInfinite!S2) 7959 immutable end = r1.length; 7960 else 7961 immutable end = r1.length > r2.length ? r2.length : r1.length; 7962 for (; i < end; ++i) 7963 { 7964 auto lhs = r1[i]; 7965 auto rhs = r2[i]; 7966 if ((lhs | rhs) >= 0x80) goto NonAsciiPath; 7967 if (lhs == rhs) continue; 7968 auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7969 if (lowDiff) return lowDiff; 7970 } 7971 static if (isInfinite!S1) 7972 return 1; 7973 else static if (isInfinite!S2) 7974 return -1; 7975 else 7976 return (r1.length > r2.length) - (r2.length > r1.length); 7977 7978 NonAsciiPath: 7979 r1 = r1[i .. $]; 7980 r2 = r2[i .. $]; 7981 // Fall through to standard case. 
7982 }} 7983 7984 while (!r1.empty) 7985 { 7986 immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1); 7987 if (r2.empty) 7988 return 1; 7989 immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2); 7990 int diff = lhs - rhs; 7991 if (!diff) 7992 continue; 7993 if ((lhs | rhs) < 0x80) 7994 { 7995 immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7996 if (!d) continue; 7997 return d; 7998 } 7999 size_t idx = simpleCaseTrie[lhs]; 8000 size_t idx2 = simpleCaseTrie[rhs]; 8001 // simpleCaseTrie is packed index table 8002 if (idx != EMPTY_CASE_TRIE) 8003 { 8004 if (idx2 != EMPTY_CASE_TRIE) 8005 {// both cased chars 8006 // adjust idx --> start of bucket 8007 idx = idx - sTable[idx].n; 8008 idx2 = idx2 - sTable[idx2].n; 8009 if (idx == idx2)// one bucket, equivalent chars 8010 continue; 8011 else// not the same bucket 8012 diff = sTable[idx].ch - sTable[idx2].ch; 8013 } 8014 else 8015 diff = sTable[idx - sTable[idx].n].ch - rhs; 8016 } 8017 else if (idx2 != EMPTY_CASE_TRIE) 8018 { 8019 diff = lhs - sTable[idx2 - sTable[idx2].n].ch; 8020 } 8021 // one of chars is not cased at all 8022 return diff; 8023 } 8024 return int(r2.empty) - 1; 8025 } 8026 8027 /// 8028 @safe @nogc pure nothrow unittest 8029 { 8030 assert(sicmp("Август", "авгусТ") == 0); 8031 // Greek also works as long as there is no 1:M mapping in sight 8032 assert(sicmp("ΌΎ", "όύ") == 0); 8033 // things like the following won't get matched as equal 8034 // Greek small letter iota with dialytika and tonos 8035 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0); 8036 8037 // while icmp has no problem with that 8038 assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0); 8039 assert(icmp("ΌΎ", "όύ") == 0); 8040 } 8041 8042 // overloads for the most common cases to reduce compile time 8043 @safe @nogc pure nothrow 8044 { 8045 int sicmp(scope const(char)[] str1, scope const(char)[] str2) 8046 { return sicmp!(const(char)[], const(char)[])(str1, str2); } 8047 8048 int sicmp(scope const(wchar)[] str1, scope 
const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}

// Tries to match `lhs` against `rhs` (optionally followed by a prefix of
// `rtail`) using the full case-folding table.
//
// Returns 0 when some entry of the bucket of `lhs` equals `rhs`, or equals
// `rhs` followed by the next code points of `rtail`; in the latter case
// those code points are consumed from `rtail`.
// Otherwise returns the first code point of the bucket's first entry, or
// `lhs` itself when `lhs` has no entry in `fullCaseTrie`.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias table = fullCaseTable;

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // The bucket of equivalent entries is table[first .. last].
    immutable first = hit - table[hit].n;
    immutable last = table[hit].size + first;
    assert(table[first].entry_len == 1);

    foreach (pos; first .. last)
    {
        auto len = table[pos].entry_len;
        if (len != 1)
        {// OK it's a long chunk, like 'ss' for German
            dstring seq = table[pos].seq[0 .. len];
            // note that this path modifies rtail
            // iff we managed to get there
            if (rhs == seq[0] && rtail.skipOver(seq[1..$]))
                return 0;
        }
        else if (table[pos].seq[0] == rhs)
        {
            return 0;
        }
    }
    return table[first].seq[0]; // new remapped character for accurate diffs
}

/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal german ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.
8099 8100 Params: 8101 r1 = a forward range of characters 8102 r2 = a forward range of characters 8103 8104 Returns: 8105 An `int` that is 0 if the strings match, 8106 <0 if `str1` is lexicographically "less" than `str2`, 8107 >0 if `str1` is lexicographically "greater" than `str2` 8108 8109 See_Also: 8110 $(LREF sicmp) 8111 $(REF cmp, std,algorithm,comparison) 8112 +/ 8113 int icmp(S1, S2)(S1 r1, S2 r2) 8114 if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1) 8115 && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2)) 8116 { 8117 import std.range.primitives : isInfinite; 8118 import std.traits : isDynamicArray; 8119 import std.utf : byDchar; 8120 static import std.ascii; 8121 8122 static if ((isDynamicArray!S1 || isRandomAccessRange!S1) 8123 && (isDynamicArray!S2 || isRandomAccessRange!S2) 8124 && !(isInfinite!S1 && isInfinite!S2) 8125 && __traits(compiles, 8126 { 8127 size_t s = size_t.max / 2; 8128 r1 = r1[s .. $]; 8129 r2 = r2[s .. $]; 8130 })) 8131 {{ 8132 // ASCII optimization for dynamic arrays & similar. 8133 size_t i = 0; 8134 static if (isInfinite!S1) 8135 immutable end = r2.length; 8136 else static if (isInfinite!S2) 8137 immutable end = r1.length; 8138 else 8139 immutable end = r1.length > r2.length ? r2.length : r1.length; 8140 for (; i < end; ++i) 8141 { 8142 auto lhs = r1[i]; 8143 auto rhs = r2[i]; 8144 if ((lhs | rhs) >= 0x80) goto NonAsciiPath; 8145 if (lhs == rhs) continue; 8146 auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 8147 if (lowDiff) return lowDiff; 8148 } 8149 static if (isInfinite!S1) 8150 return 1; 8151 else static if (isInfinite!S2) 8152 return -1; 8153 else 8154 return (r1.length > r2.length) - (r2.length > r1.length); 8155 8156 NonAsciiPath: 8157 r1 = r1[i .. $]; 8158 r2 = r2[i .. $]; 8159 // Fall through to standard case. 8160 }} 8161 8162 auto str1 = r1.byDchar; 8163 auto str2 = r2.byDchar; 8164 8165 for (;;) 8166 { 8167 if (str1.empty) 8168 return str2.empty ? 
0 : -1; 8169 immutable lhs = str1.front; 8170 if (str2.empty) 8171 return 1; 8172 immutable rhs = str2.front; 8173 str1.popFront(); 8174 str2.popFront(); 8175 if (!(lhs - rhs)) 8176 continue; 8177 // first try to match lhs to <rhs,right-tail> sequence 8178 immutable cmpLR = fullCasedCmp(lhs, rhs, str2); 8179 if (!cmpLR) 8180 continue; 8181 // then rhs to <lhs,left-tail> sequence 8182 immutable cmpRL = fullCasedCmp(rhs, lhs, str1); 8183 if (!cmpRL) 8184 continue; 8185 // cmpXX contain remapped codepoints 8186 // to obtain stable ordering of icmp 8187 return cmpLR - cmpRL; 8188 } 8189 } 8190 8191 /// 8192 @safe @nogc pure nothrow unittest 8193 { 8194 assert(icmp("Rußland", "Russland") == 0); 8195 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0); 8196 } 8197 8198 /** 8199 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding 8200 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`. 8201 */ 8202 @safe @nogc nothrow pure unittest 8203 { 8204 import std.utf : byDchar; 8205 8206 assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0); 8207 assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0); 8208 } 8209 8210 // test different character types 8211 @safe unittest 8212 { 8213 assert(icmp("Rußland", "Russland") == 0); 8214 assert(icmp("Rußland"w, "Russland") == 0); 8215 assert(icmp("Rußland", "Russland"w) == 0); 8216 assert(icmp("Rußland"w, "Russland"w) == 0); 8217 assert(icmp("Rußland"d, "Russland"w) == 0); 8218 assert(icmp("Rußland"w, "Russland"d) == 0); 8219 } 8220 8221 // overloads for the most common cases to reduce compile time 8222 @safe @nogc pure nothrow 8223 { 8224 int icmp(const(char)[] str1, const(char)[] str2) 8225 { return icmp!(const(char)[], const(char)[])(str1, str2); } 8226 int icmp(const(wchar)[] str1, const(wchar)[] str2) 8227 { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); } 8228 int icmp(const(dchar)[] str1, const(dchar)[] str2) 8229 { return 
icmp!(const(dchar)[], const(dchar)[])(str1, str2); } 8230 } 8231 8232 @safe unittest 8233 { 8234 import std.algorithm.sorting : sort; 8235 import std.conv : to; 8236 import std.exception : assertCTFEable; 8237 assertCTFEable!( 8238 { 8239 static foreach (cfunc; AliasSeq!(icmp, sicmp)) 8240 {{ 8241 static foreach (S1; AliasSeq!(string, wstring, dstring)) 8242 static foreach (S2; AliasSeq!(string, wstring, dstring)) 8243 { 8244 assert(cfunc("".to!S1(), "".to!S2()) == 0); 8245 assert(cfunc("A".to!S1(), "".to!S2()) > 0); 8246 assert(cfunc("".to!S1(), "0".to!S2()) < 0); 8247 assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0); 8248 assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0); 8249 assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0); 8250 assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0); 8251 assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0); 8252 // Check example: 8253 assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0); 8254 assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0); 8255 } 8256 // check that the order is properly agnostic to the case 8257 auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"]; 8258 sort!((a,b) => cfunc(a,b) < 0)(strs); 8259 assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]); 8260 }} 8261 assert(icmp("ßb", "ssa") > 0); 8262 // Check example: 8263 assert(icmp("Russland", "Rußland") == 0); 8264 assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0); 8265 assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0); 8266 assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0); 8267 // https://issues.dlang.org/show_bug.cgi?id=11057 8268 assert( icmp("K", "L") < 0 ); 8269 }); 8270 } 8271 8272 // https://issues.dlang.org/show_bug.cgi?id=17372 8273 @safe pure unittest 8274 { 8275 import std.algorithm.iteration : joiner, map; 8276 import std.algorithm.sorting : sort; 8277 import std.array : array; 8278 auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0); 8279 } 8280 8281 // This is package(std) for 
the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Return a range of all $(CODEPOINTS) that casefold to
    and from this `ch`.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;

    // Two representations share one struct:
    //  - "small": idx == uint.max and `c` holds the single code point
    //    (c == 0 marks the range as consumed);
    //  - table-backed: `idx` is the current position in sTable and `len`
    //    the number of entries left.
    static struct Range
    {
    @safe pure nothrow:
        uint idx; //if == uint.max, then read c.
        union
        {
            dchar c; // == 0 - empty range
            uint len;
        }
        @property bool isSmall() const { return idx == uint.max; }

        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (isSmall)
                return c;
            return sTable[idx].ch;
        }

        @property bool empty() const
        {
            if (!isSmall)
                return len == 0;
            return c == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        void popFront()
        {
            if (isSmall)
            {
                c = 0;
                return;
            }
            idx++;
            len--;
        }
    }

    immutable hit = simpleCaseTrie[ch];
    // No case mapping at all: the character folds only to itself.
    if (hit == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable[hit];
    // `n` is the offset of this entry inside its bucket of equivalents.
    immutable bucketStart = hit - entry.n;
    return Range(bucketStart, entry.size);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}

/++
    $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    return combiningClassTrie[ch];
}

///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilda
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that "tilda" should be
    // placed after a "ring below" in a sequence
}

@safe pure nothrow @nogc unittest
{
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}

/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition.
The result is canonically equivalent sequence.
    Canonical,
    /**
       Compatibility decomposition. The result is compatibility equivalent sequence.
       Note: Compatibility decomposition is a $(B lossy) conversion,
       typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}

/**
    Shorthand aliases for character decomposition type, passed as a
    template parameter to $(LREF decompose).
*/
enum {
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}

/++
    Try to canonically compose 2 $(CHARACTERS).
    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.

    The assumption is that `first` comes before `second` in the original text,
    usually meaning that the first is a starter.

    Note: Hangul syllables are not covered by this function.
    See `composeJamo` below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.algorithm.iteration : map;
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted;

    immutable packed = compositionJumpTrie[first];
    // ushort.max marks a `first` that starts no composition at all.
    if (packed == ushort.max)
        return dchar.init;

    // unpack offset and length of the candidate run for `first`
    immutable base = packed & composeIdxMask, count = packed >> composeCntShift;

    // TODO: optimize this micro binary search (no more than 4-5 steps)
    auto seconds = compositionTable[base .. base+count].map!"a.rhs"().assumeSorted();
    immutable pos = seconds.lowerBound(second).length;
    // Every candidate sorts below `second`: nothing to compose with.
    if (pos == count)
        return dchar.init;

    // lowerBound only locates the insertion point; confirm an exact match.
    immutable candidate = compositionTable[base+pos];
    if (candidate.rhs != second)
        return dchar.init;
    return candidate.composed;
}

///
@safe unittest
{
    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}

/++
    Returns a full $(S_LINK Canonical decomposition, Canonical)
    (by default) or $(S_LINK Compatibility decomposition, Compatibility)
    decomposition of $(CHARACTER) `ch`.
    If no decomposition is available returns a $(LREF Grapheme)
    with the `ch` itself.

    Note:
    This function also decomposes hangul syllables
    as prescribed by the standard.

    See_Also: $(LREF decomposeHangul) for a restricted version
    that takes into account only hangul syllables but
    no other decompositions.
+/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;
    // select the table/trie pair that matches the requested decomposition type
    static if (decompType == Canonical)
    {
        alias table = decompCanonTable;
        alias mapping = canonMappingTrie;
    }
    else static if (decompType == Compatibility)
    {
        alias table = decompCompatTable;
        alias mapping = compatMappingTrie;
    }
    // a zero index is treated as "no table entry for `ch`"
    immutable idx = mapping[ch];
    if (!idx) // not found, check hangul arithmetic decomposition
        return decomposeHangul(ch);
    // the decomposition is the run of table entries starting at `idx`,
    // read up to (and excluding) the first 0 entry
    auto decomp = table[idx..$].until(0);
    return Grapheme(decomp);
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;

    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    assert(decompose('Ĉ')[].equal("C\u0302"));
    assert(decompose('D')[].equal("D"));
    assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
    assert(decompose!Compatibility('¹')[].equal("1"));
}

//----------------------------------------------------------------------------
// Hangul specific composition/decomposition
// Base code points: S = precomposed syllables, L = leading consonants,
// V = vowels, T = trailing consonants (TBase itself is one below the first T).
enum jamoSBase = 0xAC00;
enum jamoLBase = 0x1100;
enum jamoVBase = 0x1161;
enum jamoTBase = 0x11A7;
// Number of L, V and T slots used by the arithmetic below.
enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
// Syllables per leading consonant (V * T combinations).
enum jamoNCount = jamoVCount * jamoTCount;
// Total number of precomposed syllables.
enum jamoSCount = jamoLCount * jamoNCount;

// Tests if `ch` is a Hangul leading consonant jamo.
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above leading jamo range
    return ch < jamoLBase+jamoLCount && ch >= jamoLBase;
}

// Tests if `ch` is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above trailing jamo range
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch < jamoTBase+jamoTCount && ch > jamoTBase;
}

// Tests if `ch` is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above vowel range
    return ch < jamoVBase+jamoVCount && ch >= jamoVBase;
}

// Returns the offset of `ch` from jamoSBase when it falls inside the
// precomposed syllable block (0 .. jamoSCount), and -1 otherwise.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    int idxS = cast(int) ch - jamoSBase;
    return idxS >= 0 && idxS < jamoSCount ? idxS : -1;
}

// internal helper: compose hangul syllables leaving dchar.init in holes
// Works in place: an <L, V> or <L, V, T> run is replaced by the syllable in
// the slot of L, and the slots of V (and T) are overwritten with dchar.init.
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    // the loop condition guarantees that seq[idx+1] exists
    for (size_t idx = 0; idx + 1 < seq.length; )
    {
        if (isJamoL(seq[idx]) && isJamoV(seq[idx+1]))
        {
            immutable int indexL = seq[idx] - jamoLBase;
            immutable int indexV = seq[idx+1] - jamoVBase;
            // offset of the LV syllable inside the syllable block
            immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
            if (idx + 2 < seq.length && isJamoT(seq[idx+2]))
            {
                // <L, V, T>: add the trailing consonant index on top of LV
                seq[idx] = jamoSBase + indexLV + seq[idx+2] - jamoTBase;
                seq[idx+1] = dchar.init;
                seq[idx+2] = dchar.init;
                idx += 3;
            }
            else
            {
                // <L, V> only
                seq[idx] = jamoSBase + indexLV;
                seq[idx+1] = dchar.init;
                idx += 2;
            }
        }
        else
            idx++;
    }
}

//----------------------------------------------------------------------------
public:

/**
    Decomposes a Hangul syllable.
If `ch` is not a composed syllable
    then this function returns $(LREF Grapheme) containing only `ch` as is.
*/
Grapheme decomposeHangul(dchar ch) nothrow pure @safe
{
    // -1 signals that `ch` lies outside the precomposed syllable block
    immutable sIndex = hangulSyllableIndex(ch);
    if (sIndex < 0)
        return Grapheme(ch);

    immutable lead = jamoLBase + sIndex / jamoNCount;
    immutable vowel = jamoVBase + (sIndex % jamoNCount) / jamoTCount;
    immutable tIndex = sIndex % jamoTCount;

    // a zero trailing index means there is no trailing consonant: <L, V>
    if (tIndex == 0)
        return Grapheme(lead, vowel);
    // otherwise it is a full <L, V, T> decomposition
    return Grapheme(lead, vowel, jamoTBase + tIndex);
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
}

/++
    Try to compose hangul syllable out of a leading consonant (`lead`),
    a `vowel` and optional `trailing` consonant jamos.

    On success returns the composed LV or LVT hangul syllable.

    If any of `lead` and `vowel` are not a valid hangul jamo
    of the respective $(CHARACTER) class returns dchar.init.
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    // both mandatory parts must belong to their jamo class
    if (!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;

    immutable indexL = lead - jamoLBase;
    immutable indexV = vowel - jamoVBase;
    immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
    immutable dchar syllable = jamoSBase + indexLV;

    // anything that is not a trailing consonant yields the plain LV syllable
    if (!isJamoT(trailing))
        return syllable;
    return syllable + (trailing - jamoTBase);
}

///
@safe unittest
{
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out T-vowel, or passing any codepoint
    // that is not trailing consonant composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme g = decompose!T(ch);
        assert(equal(g[], r), text(g[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out T-vowel
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}

/**
    Enumeration type for normalization forms,
    passed as template parameter for functions like $(LREF normalize).
*/
enum NormalizationForm {
    NFC,
    NFD,
    NFKC,
    NFKD
}


enum {
    /**
        Shorthand aliases from values indicating normalization forms.
8700 */ 8701 NFC = NormalizationForm.NFC, 8702 ///ditto 8703 NFD = NormalizationForm.NFD, 8704 ///ditto 8705 NFKC = NormalizationForm.NFKC, 8706 ///ditto 8707 NFKD = NormalizationForm.NFKD 8708 } 8709 8710 /++ 8711 Returns `input` string normalized to the chosen form. 8712 Form C is used by default. 8713 8714 For more information on normalization forms see 8715 the $(S_LINK Normalization, normalization section). 8716 8717 Note: 8718 In cases where the string in question is already normalized, 8719 it is returned unmodified and no memory allocation happens. 8720 +/ 8721 /* 8722 WARNING: @trusted lambda inside - handle with same care as @trusted 8723 functions 8724 8725 Despite being a template, the attributes do no harm since this doesn't work 8726 with user-defined range or character types anyway. 8727 */ 8728 pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C) 8729 (return scope inout(C)[] input) 8730 { 8731 import std.algorithm.mutation : SwapStrategy; 8732 import std.algorithm.sorting : sort; 8733 import std.array : appender; 8734 import std.range : zip; 8735 8736 auto anchors = splitNormalized!norm(input); 8737 if (anchors[0] == input.length && anchors[1] == input.length) 8738 return input; 8739 dchar[] decomposed; 8740 decomposed.reserve(31); 8741 ubyte[] ccc; 8742 ccc.reserve(31); 8743 auto app = appender!(C[])(); 8744 do 8745 { 8746 app.put(input[0 .. 
anchors[0]]); 8747 foreach (dchar ch; input[anchors[0]..anchors[1]]) 8748 static if (norm == NFD || norm == NFC) 8749 { 8750 foreach (dchar c; decompose!Canonical(ch)[]) 8751 decomposed ~= c; 8752 } 8753 else // NFKD & NFKC 8754 { 8755 foreach (dchar c; decompose!Compatibility(ch)[]) 8756 decomposed ~= c; 8757 } 8758 ccc.length = decomposed.length; 8759 size_t firstNonStable = 0; 8760 ubyte lastClazz = 0; 8761 8762 foreach (idx, dchar ch; decomposed) 8763 { 8764 immutable clazz = combiningClass(ch); 8765 ccc[idx] = clazz; 8766 if (clazz == 0 && lastClazz != 0) 8767 { 8768 // found a stable code point after unstable ones 8769 sort!("a[0] < b[0]", SwapStrategy.stable) 8770 (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx])); 8771 firstNonStable = decomposed.length; 8772 } 8773 else if (clazz != 0 && lastClazz == 0) 8774 { 8775 // found first unstable code point after stable ones 8776 firstNonStable = idx; 8777 } 8778 lastClazz = clazz; 8779 } 8780 sort!("a[0] < b[0]", SwapStrategy.stable) 8781 (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$])); 8782 static if (norm == NFC || norm == NFKC) 8783 { 8784 import std.algorithm.searching : countUntil; 8785 auto first = countUntil(ccc, 0); 8786 if (first >= 0) // no starters?? no recomposition 8787 { 8788 for (;;) 8789 { 8790 immutable second = recompose(first, decomposed, ccc); 8791 if (second == decomposed.length) 8792 break; 8793 first = second; 8794 } 8795 // 2nd pass for hangul syllables 8796 hangulRecompose(decomposed); 8797 } 8798 } 8799 static if (norm == NFD || norm == NFKD) 8800 app.put(decomposed); 8801 else 8802 { 8803 import std.algorithm.mutation : remove; 8804 auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed); 8805 app.put(decomposed[0 .. clean.length]); 8806 } 8807 // reset variables 8808 decomposed.length = 0; 8809 () @trusted { 8810 // assumeSafeAppend isn't considered pure as of writing, hence the 8811 // cast. 
It isn't pure in the sense that the elements after 8812 // the array in question are affected, but we don't use those 8813 // making the call pure for our purposes. 8814 (cast(void delegate() pure nothrow) {decomposed.assumeSafeAppend();})(); 8815 ccc.length = 0; 8816 (cast(void delegate() pure nothrow) {ccc.assumeSafeAppend();})(); 8817 } (); 8818 input = input[anchors[1]..$]; 8819 // and move on 8820 anchors = splitNormalized!norm(input); 8821 } while (anchors[0] != input.length); 8822 app.put(input[0 .. anchors[0]]); 8823 return () @trusted inout { return cast(inout(C)[]) app.data; } (); 8824 } 8825 8826 /// 8827 @safe pure unittest 8828 { 8829 // any encoding works 8830 wstring greet = "Hello world"; 8831 assert(normalize(greet) is greet); // the same exact slice 8832 8833 // An example of a character with all 4 forms being different: 8834 // Greek upsilon with acute and hook symbol (code point 0x03D3) 8835 assert(normalize!NFC("ϓ") == "\u03D3"); 8836 assert(normalize!NFD("ϓ") == "\u03D2\u0301"); 8837 assert(normalize!NFKC("ϓ") == "\u038E"); 8838 assert(normalize!NFKD("ϓ") == "\u03A5\u0301"); 8839 } 8840 8841 @safe pure unittest 8842 { 8843 import std.conv : text; 8844 8845 assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def"))); 8846 assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰")); 8847 assert(normalize!NFD("Äffin") == "A\u0308ffin"); 8848 8849 // test with dstring 8850 dstring greet = "Hello world"; 8851 assert(normalize(greet) is greet); // the same exact slice 8852 } 8853 8854 // canonically recompose given slice of code points, works in-place and mutates data 8855 private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe 8856 { 8857 assert(input.length == ccc.length); 8858 int accumCC = -1;// so that it's out of 0 .. 
255 range 8859 // writefln("recomposing %( %04x %)", input); 8860 // first one is always a starter thus we start at i == 1 8861 size_t i = start+1; 8862 for (; ; ) 8863 { 8864 if (i == input.length) 8865 break; 8866 immutable curCC = ccc[i]; 8867 // In any character sequence beginning with a starter S 8868 // a character C is blocked from S if and only if there 8869 // is some character B between S and C, and either B 8870 // is a starter or it has the same or higher combining class as C. 8871 //------------------------ 8872 // Applying to our case: 8873 // S is input[0] 8874 // accumCC is the maximum CCC of characters between C and S, 8875 // as ccc are sorted 8876 // C is input[i] 8877 8878 if (curCC > accumCC) 8879 { 8880 immutable comp = compose(input[start], input[i]); 8881 if (comp != dchar.init) 8882 { 8883 input[start] = comp; 8884 input[i] = dchar.init;// put a sentinel 8885 // current was merged so its CCC shouldn't affect 8886 // composing with the next one 8887 } 8888 else 8889 { 8890 // if it was a starter then accumCC is now 0, end of loop 8891 accumCC = curCC; 8892 if (accumCC == 0) 8893 break; 8894 } 8895 } 8896 else 8897 { 8898 // ditto here 8899 accumCC = curCC; 8900 if (accumCC == 0) 8901 break; 8902 } 8903 i++; 8904 } 8905 return i; 8906 } 8907 8908 // returns tuple of 2 indexes that delimit: 8909 // normalized text, piece that needs normalization and 8910 // the rest of input starting with stable code point 8911 private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input) 8912 { 8913 import std.typecons : tuple; 8914 ubyte lastCC = 0; 8915 8916 foreach (idx, dchar ch; input) 8917 { 8918 static if (norm == NFC) 8919 if (ch < 0x0300) 8920 { 8921 lastCC = 0; 8922 continue; 8923 } 8924 immutable ubyte CC = combiningClass(ch); 8925 if (lastCC > CC && CC != 0) 8926 { 8927 return seekStable!norm(idx, input); 8928 } 8929 8930 if (notAllowedIn!norm(ch)) 8931 { 8932 return seekStable!norm(idx, input); 8933 } 8934 lastCC = CC; 8935 } 
return tuple(input.length, input.length);
}

// Given the index `idx` of a code point that breaks normalization form `norm`,
// returns a tuple (region_start, region_end) of code unit indexes into `input`
// delimiting the piece that has to be re-normalized:
//  - region_start is the index of the last code point before `idx` that has
//    combining class 0 and is allowed in `norm` (0 if there is none);
//  - region_end is the index of the first such code point at or after `idx`
//    (input.length if there is none).
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    for (;;)
    {
        if (br.empty)// start is 0
            break;
        dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // br still ends with `ch`, so its start is length minus its width
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // walk backwards: drop the code point that was just inspected.
        // (popFront here would keep `br.back` unchanged and always degrade
        // to region_start == 0.)
        br.popBack();
    }
    ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}

/**
    Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization
    form `norm`.
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    return !notAllowedIn!norm(ch);
}

///
@safe unittest
{
    // e.g.
// Cyrillic is always allowed, so is ASCII
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}

// not user friendly name but more direct
// Looks `ch` up in the quick-check trie that belongs to `norm`.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // the condition must be false for the assert to fire; a bare string
        // is non-null and would always pass
        static assert(0, "Unknown normalization form " ~ norm.stringof);
    return qcTrie[ch];
}

@safe unittest
{
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}

}

version (std_uni_bootstrap)
{
    // old version used for bootstrapping of gen_uni.d that generates
    // up to date optimal versions of all of isXXX functions
    @safe pure nothrow @nogc public bool isWhite(dchar c)
    {
        import std.ascii : isWhite;
        return isWhite(c) ||
               c == lineSep || c == paraSep ||
               c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
               (c >= '\u2000' && c <= '\u200A') ||
               c == '\u202F' || c == '\u205F' || c == '\u3000';
    }
}
else
{

// trusted -> avoid bounds check
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions
    // https://issues.dlang.org/show_bug.cgi?id=13232
    ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
    ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
    dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }

    ushort
toTitleIndex(dchar c) { return toTitleIndexTrie[c]; } 9048 ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; } 9049 dchar toTitleTab(size_t idx) { return toTitleTable[idx]; } 9050 9051 ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; } 9052 ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; } 9053 dchar toUpperTab(size_t idx) { return toUpperTable[idx]; } 9054 } 9055 9056 public: 9057 9058 /++ 9059 Whether or not `c` is a Unicode whitespace $(CHARACTER). 9060 (general Unicode category: Part of C0(tab, vertical tab, form feed, 9061 carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085)) 9062 +/ 9063 @safe pure nothrow @nogc 9064 public bool isWhite(dchar c) 9065 { 9066 import std.internal.unicode_tables : isWhiteGen; // generated file 9067 return isWhiteGen(c); // call pregenerated binary search 9068 } 9069 9070 /++ 9071 Return whether `c` is a Unicode lowercase $(CHARACTER). 9072 +/ 9073 @safe pure nothrow @nogc 9074 bool isLower(dchar c) 9075 { 9076 import std.ascii : isLower, isASCII; 9077 if (isASCII(c)) 9078 return isLower(c); 9079 return lowerCaseTrie[c]; 9080 } 9081 9082 @safe unittest 9083 { 9084 import std.ascii : isLower; 9085 foreach (v; 0 .. 0x80) 9086 assert(isLower(v) == .isLower(v)); 9087 assert(.isLower('я')); 9088 assert(.isLower('й')); 9089 assert(!.isLower('Ж')); 9090 // Greek HETA 9091 assert(!.isLower('\u0370')); 9092 assert(.isLower('\u0371')); 9093 assert(!.isLower('\u039C')); // capital MU 9094 assert(.isLower('\u03B2')); // beta 9095 // from extended Greek 9096 assert(!.isLower('\u1F18')); 9097 assert(.isLower('\u1F00')); 9098 foreach (v; unicode.lowerCase.byCodepoint) 9099 assert(.isLower(v) && !isUpper(v)); 9100 } 9101 9102 9103 /++ 9104 Return whether `c` is a Unicode uppercase $(CHARACTER). 
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;
    // ASCII is answered by std.ascii, everything else by the trie
    if (isASCII(c))
        return isUpper(c);
    return upperCaseTrie[c];
}

@safe unittest
{
    // inside this test `isUpper` is std.ascii's, `.isUpper` is this module's
    import std.ascii : isUpper;
    // both must agree on the whole ASCII range
    foreach (v; 0 .. 0x80)
        assert(isUpper(v) == .isUpper(v));
    assert(!.isUpper('й'));
    assert(.isUpper('Ж'));
    // Greek HETA
    assert(.isUpper('\u0370'));
    assert(!.isUpper('\u0371'));
    assert(.isUpper('\u039C')); // capital MU
    assert(!.isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!.isUpper('\u1F00'));
    assert(.isUpper('\u1F18'));
    foreach (v; unicode.upperCase.byCodepoint)
        assert(.isUpper(v) && !.isLower(v));
}


//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Maps `c` through the simple title-case table; returns `c` itself when the
// table has no entry for it.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // optimize ASCII case
    if (c < 0xAA)
    {
        if (c < 'a')
            return c;
        if (c <= 'z')
            return c - 32;
        return c;
    }
    size_t idx = toTitleSimpleIndex(c);
    // ushort.max is the "no mapping" marker
    if (idx != ushort.max)
    {
        return toTitleTab(idx);
    }
    return c;
}

private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);

// generic toUpper/toLower on whole string, creates new or returns as is
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    auto r = s.byDchar;
    for (size_t i; !r.empty; i += r.front.codeLength!C ,
r.popFront()) 9172 { 9173 auto cOuter = r.front; 9174 ushort idx = indexFn(cOuter); 9175 if (idx == ushort.max) 9176 continue; 9177 auto result = appender!(C[])(); 9178 result.reserve(s.length); 9179 result.put(s[0 .. i]); 9180 foreach (dchar c; s[i .. $].byDchar) 9181 { 9182 if (c.isASCII) 9183 { 9184 result.put(asciiConvert(c)); 9185 } 9186 else 9187 { 9188 idx = indexFn(c); 9189 if (idx == ushort.max) 9190 result.put(c); 9191 else if (idx < maxIdx) 9192 { 9193 c = tableFn(idx); 9194 result.put(c); 9195 } 9196 else 9197 { 9198 auto val = tableFn(idx); 9199 // unpack length + codepoint 9200 immutable uint len = val >> 24; 9201 result.put(cast(dchar)(val & 0xFF_FFFF)); 9202 foreach (j; idx+1 .. idx+len) 9203 result.put(tableFn(j)); 9204 } 9205 } 9206 } 9207 return result.data; 9208 } 9209 9210 static if (isSomeString!S) 9211 return s; 9212 else 9213 return s.array; 9214 } 9215 9216 // https://issues.dlang.org/show_bug.cgi?id=12428 9217 @safe unittest 9218 { 9219 import std.array : replicate; 9220 auto s = "abcdefghij".replicate(300); 9221 s = s[0 .. 
10]; 9222 9223 toUpper(s); 9224 9225 assert(s == "abcdefghij"); 9226 } 9227 9228 // https://issues.dlang.org/show_bug.cgi?id=18993 9229 @safe unittest 9230 { 9231 static assert(`몬스터/A`.toLower.length == `몬스터/a`.toLower.length); 9232 } 9233 9234 9235 // generic toUpper/toLower on whole range, returns range 9236 private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str) 9237 // Accept range of dchar's 9238 if (isInputRange!Range && 9239 isSomeChar!(ElementEncodingType!Range) && 9240 ElementEncodingType!Range.sizeof == dchar.sizeof) 9241 { 9242 static struct ToCaserImpl 9243 { 9244 @property bool empty() 9245 { 9246 return !nLeft && r.empty; 9247 } 9248 9249 @property auto front() 9250 { 9251 import std.ascii : isASCII; 9252 9253 if (!nLeft) 9254 { 9255 dchar c = r.front; 9256 if (c.isASCII) 9257 { 9258 buf[0] = asciiConvert(c); 9259 nLeft = 1; 9260 } 9261 else 9262 { 9263 const idx = indexFn(c); 9264 if (idx == ushort.max) 9265 { 9266 buf[0] = c; 9267 nLeft = 1; 9268 } 9269 else if (idx < maxIdx) 9270 { 9271 buf[0] = tableFn(idx); 9272 nLeft = 1; 9273 } 9274 else 9275 { 9276 immutable val = tableFn(idx); 9277 // unpack length + codepoint 9278 nLeft = val >> 24; 9279 if (nLeft == 0) 9280 nLeft = 1; 9281 assert(nLeft <= buf.length); 9282 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9283 foreach (j; 1 .. 
nLeft) 9284 buf[nLeft - j - 1] = tableFn(idx + j); 9285 } 9286 } 9287 } 9288 return buf[nLeft - 1]; 9289 } 9290 9291 void popFront() 9292 { 9293 if (!nLeft) 9294 front; 9295 assert(nLeft); 9296 --nLeft; 9297 if (!nLeft) 9298 r.popFront(); 9299 } 9300 9301 static if (isForwardRange!Range) 9302 { 9303 @property auto save() 9304 { 9305 auto ret = this; 9306 ret.r = r.save; 9307 return ret; 9308 } 9309 } 9310 9311 private: 9312 Range r; 9313 uint nLeft; 9314 dchar[3] buf = void; 9315 } 9316 9317 return ToCaserImpl(str); 9318 } 9319 9320 /********************* 9321 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9322 * or a string to upper or lower case. 9323 * 9324 * Does not allocate memory. 9325 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9326 * are treated as $(REF replacementDchar, std,utf). 9327 * 9328 * Params: 9329 * str = string or range of characters 9330 * 9331 * Returns: 9332 * an input range of `dchar`s 9333 * 9334 * See_Also: 9335 * $(LREF toUpper), $(LREF toLower) 9336 */ 9337 9338 auto asLowerCase(Range)(Range str) 9339 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9340 !isConvertibleToString!Range) 9341 { 9342 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9343 { 9344 import std.utf : byDchar; 9345 9346 // Decode first 9347 return asLowerCase(str.byDchar); 9348 } 9349 else 9350 { 9351 static import std.ascii; 9352 return toCaser!(LowerTriple, std.ascii.toLower)(str); 9353 } 9354 } 9355 9356 /// ditto 9357 auto asUpperCase(Range)(Range str) 9358 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9359 !isConvertibleToString!Range) 9360 { 9361 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9362 { 9363 import std.utf : byDchar; 9364 9365 // Decode first 9366 return asUpperCase(str.byDchar); 9367 } 9368 else 9369 { 9370 static import std.ascii; 9371 return toCaser!(UpperTriple, std.ascii.toUpper)(str); 9372 } 9373 } 9374 9375 /// 9376 @safe pure 
unittest 9377 { 9378 import std.algorithm.comparison : equal; 9379 9380 assert("hEllo".asUpperCase.equal("HELLO")); 9381 } 9382 9383 // explicitly undocumented 9384 auto asLowerCase(Range)(auto ref Range str) 9385 if (isConvertibleToString!Range) 9386 { 9387 import std.traits : StringTypeOf; 9388 return asLowerCase!(StringTypeOf!Range)(str); 9389 } 9390 9391 // explicitly undocumented 9392 auto asUpperCase(Range)(auto ref Range str) 9393 if (isConvertibleToString!Range) 9394 { 9395 import std.traits : StringTypeOf; 9396 return asUpperCase!(StringTypeOf!Range)(str); 9397 } 9398 9399 @safe unittest 9400 { 9401 static struct TestAliasedString 9402 { 9403 string get() @safe @nogc pure nothrow { return _s; } 9404 alias get this; 9405 @disable this(this); 9406 string _s; 9407 } 9408 9409 static bool testAliasedString(alias func, Args...)(string s, Args args) 9410 { 9411 import std.algorithm.comparison : equal; 9412 auto a = func(TestAliasedString(s), args); 9413 auto b = func(s, args); 9414 static if (is(typeof(equal(a, b)))) 9415 { 9416 // For ranges, compare contents instead of object identity. 
return equal(a, b);
        }
        else
        {
            return a == b;
        }
    }
    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}

@safe unittest
{
    import std.array : array;

    // a saved copy must replay the same sequence
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // wstring and dstring inputs; the results are actually checked now
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}

// generic capitalizer on whole range, returns range
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
// Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            return lower ?
lwr.empty : !nLeft && r.empty; 9482 } 9483 9484 @property auto front() 9485 { 9486 if (lower) 9487 return lwr.front; 9488 9489 if (!nLeft) 9490 { 9491 immutable dchar c = r.front; 9492 const idx = indexFnUpper(c); 9493 if (idx == ushort.max) 9494 { 9495 buf[0] = c; 9496 nLeft = 1; 9497 } 9498 else if (idx < maxIdxUpper) 9499 { 9500 buf[0] = tableFnUpper(idx); 9501 nLeft = 1; 9502 } 9503 else 9504 { 9505 immutable val = tableFnUpper(idx); 9506 // unpack length + codepoint 9507 nLeft = val >> 24; 9508 if (nLeft == 0) 9509 nLeft = 1; 9510 assert(nLeft <= buf.length); 9511 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9512 foreach (j; 1 .. nLeft) 9513 buf[nLeft - j - 1] = tableFnUpper(idx + j); 9514 } 9515 } 9516 return buf[nLeft - 1]; 9517 } 9518 9519 void popFront() 9520 { 9521 if (lower) 9522 lwr.popFront(); 9523 else 9524 { 9525 if (!nLeft) 9526 front; 9527 assert(nLeft); 9528 --nLeft; 9529 if (!nLeft) 9530 { 9531 r.popFront(); 9532 lwr = r.asLowerCase(); 9533 lower = true; 9534 } 9535 } 9536 } 9537 9538 static if (isForwardRange!Range) 9539 { 9540 @property auto save() 9541 { 9542 auto ret = this; 9543 ret.r = r.save; 9544 ret.lwr = lwr.save; 9545 return ret; 9546 } 9547 } 9548 9549 private: 9550 Range r; 9551 typeof(r.asLowerCase) lwr; // range representing the lower case rest of string 9552 bool lower = false; // false for first character, true for rest of string 9553 dchar[3] buf = void; 9554 uint nLeft = 0; 9555 } 9556 9557 return ToCapitalizerImpl(str); 9558 } 9559 9560 /********************* 9561 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9562 * or string, meaning convert the first 9563 * character to upper case and subsequent characters to lower case. 9564 * 9565 * Does not allocate memory. 9566 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9567 * are treated as $(REF replacementDchar, std,utf). 
// Exercises asCapitalized on UTF-8, UTF-16 and UTF-32 input: save(),
// a table of input/expected pairs (including a one-to-many mapping),
// iteration that never touches `front`, and agreement with asUpperCase
// on a single-character string.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.utf : byChar;

    // A saved copy must replay the same sequence as the range it came from.
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // Each entry is [input, expected capitalized form].
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"], // one code point expands to three
    ];

    foreach (i; 0 .. cases.length)
    {
        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front: popFront alone has to be able to drain the range.
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    // These comparisons used to be bare expression statements whose result
    // was thrown away, so they could never fail; assert them.
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    // For a one-character string, capitalizing and upper-casing agree.
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
// UTF-32 flavour of the encodeTo helpers: a code point is stored as a single
// code unit, so `c` is written to `buf[idx]` as is.
// Returns: the index of the slot following the one just written.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
to]) 9760 str[dest++] = c; 9761 return dest; 9762 } 9763 while (curIdx != s.length) 9764 { 9765 size_t startIdx = curIdx; 9766 immutable ch = decode(s, curIdx); 9767 // TODO: special case for ASCII 9768 immutable caseIndex = indexFn(ch); 9769 if (caseIndex == ushort.max) // unchanged, skip over 9770 { 9771 continue; 9772 } 9773 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9774 { 9775 // previous cased chars had the same length as uncased ones 9776 // thus can just adjust pointer 9777 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9778 lastUnchanged = curIdx; 9779 immutable cased = tableFn(caseIndex); 9780 immutable casedLen = codeLength!C(cased); 9781 if (casedLen + destIdx > curIdx) // no place to fit cased char 9782 { 9783 // switch to slow codepath, where we allocate 9784 return slowToCase(s, startIdx, destIdx); 9785 } 9786 else 9787 { 9788 destIdx = encodeTo(s, destIdx, cased); 9789 } 9790 } 9791 else // 1:m codepoint mapping, slow codepath 9792 { 9793 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9794 lastUnchanged = curIdx; 9795 return slowToCase(s, startIdx, destIdx); 9796 } 9797 assert(destIdx <= curIdx); 9798 } 9799 if (lastUnchanged != s.length) 9800 { 9801 destIdx = moveTo(s, destIdx, lastUnchanged, s.length); 9802 } 9803 s = s[0 .. 
// Computes, without allocating, how many code units of type `C` the
// case-converted form of a string will occupy.
//
// indexFn maps a code point to a table index, or to ushort.max when the code
//         point has no mapping;
// maxIdx  is the first index that denotes a one-to-many mapping (indices
//         below it are one-to-one);
// tableFn returns the table entry for an index. For one-to-many mappings the
//         first entry packs the sequence length in its top 8 bits and the
//         first code point in the low 24 bits; the remaining code points
//         follow at the next indices.
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;

        size_t total = 0;   // code units counted so far
        size_t pending = 0; // start of the unchanged run not yet added to total

        for (size_t pos = 0; pos != str.length; )
        {
            immutable cpStart = pos;
            immutable cp = decode(str, pos); // moves pos past this code point
            immutable ushort slot = indexFn(cp);
            if (slot == ushort.max)
                continue; // unchanged: stays part of the pending run

            // Flush the unchanged run that precedes this code point.
            total += cpStart - pending;
            pending = pos;

            immutable entry = tableFn(slot);
            if (slot < maxIdx)
            {
                // one-to-one mapping
                total += codeLength!C(entry);
            }
            else
            {
                // one-to-many mapping: unpack length + first code point
                immutable count = entry >> 24;
                immutable dchar head = entry & 0xFF_FFFF;
                total += codeLength!C(head);
                foreach (k; slot+1 .. slot+count)
                    total += codeLength!C(tableFn(k));
            }
        }

        // Trailing unchanged run (zero when the last code point was mapped).
        return total + (str.length - pending);
    }
}
destIdx]; 9867 size_t lastUnchanged = curIdx; 9868 while (curIdx != s.length) 9869 { 9870 immutable startIdx = curIdx; // start of current codepoint 9871 immutable ch = decode(s, curIdx); 9872 immutable caseIndex = indexFn(ch); 9873 if (caseIndex == ushort.max) // skip over 9874 { 9875 continue; 9876 } 9877 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9878 { 9879 immutable cased = tableFn(caseIndex); 9880 auto toCopy = startIdx - lastUnchanged; 9881 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9882 lastUnchanged = curIdx; 9883 destIdx += toCopy; 9884 destIdx = encodeTo(ns, destIdx, cased); 9885 } 9886 else // 1:m codepoint mapping, slow codepath 9887 { 9888 auto toCopy = startIdx - lastUnchanged; 9889 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9890 lastUnchanged = curIdx; 9891 destIdx += toCopy; 9892 auto val = tableFn(caseIndex); 9893 // unpack length + codepoint 9894 immutable uint len = val >> 24; 9895 destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF)); 9896 foreach (j; caseIndex+1 .. caseIndex+len) 9897 destIdx = encodeTo(ns, destIdx, tableFn(j)); 9898 } 9899 } 9900 if (lastUnchanged != s.length) 9901 { 9902 auto toCopy = s.length - lastUnchanged; 9903 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$]; 9904 destIdx += toCopy; 9905 } 9906 assert(ns.length == destIdx); 9907 s = ns; 9908 } 9909 } 9910 9911 /++ 9912 Converts `s` to lowercase (by performing Unicode lowercase mapping) in place. 9913 For a few characters string length may increase after the transformation, 9914 in such a case the function reallocates exactly once. 9915 If `s` does not have any uppercase characters, then `s` is unaltered. 
/++
    Returns the lowercase counterpart of `c` when `c` is a Unicode uppercase
    $(CHARACTER) with a simple (single code point) lowercase mapping;
    any other `c` is returned unchanged.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    if (c >= 0xAA)
    {
        // General case: consult the simple lowercase mapping tables.
        size_t idx = toLowerSimpleIndex(c);
        if (idx == ushort.max)
            return c; // no mapping for this code point
        return toLowerTab(idx);
    }

    // Fast path: below U+00AA only ASCII 'A' .. 'Z' are converted.
    if ('A' <= c && c <= 'Z')
        return c + 32;
    return c;
}
// https://issues.dlang.org/show_bug.cgi?id=9629
// Upper-casing a slice in place must write through to the array the slice
// was taken from: the single code unit holding 'þ' becomes 'Þ' in `test`.
@safe unittest
{
    wchar[] test = "hello þ world"w.dup;
    auto piece = test[6 .. 7]; // the one wchar that holds 'þ'
    toUpperInPlace(piece);
    assert(test == "hello Þ world");
}
// toLower on random access ranges of code units (not plain strings):
// ASCII-only input and input containing two-byte UTF-8 sequences.
@safe pure unittest
{
    // `equal` was imported here but never used; only `cmp` is needed.
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;

    auto r1 = "FoL".byCodeUnit;
    assert(r1.toLower.cmp("fol") == 0);

    // U+0460 is lowercased to U+0461; the existing U+0461 stays as it is.
    auto r2 = "A\u0460B\u0461d".byCodeUnit;
    assert(r2.toLower.cmp("a\u0461b\u0461d") == 0);
}
/++
    Allocates a new array which is identical to `s` except that all of its
    characters are converted to uppercase (by performing Unicode uppercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s`
    is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    // Both the Unicode uppercase tables and the ASCII-only converter are
    // handed to toCase. NOTE(review): presumably the latter serves an ASCII
    // fast path - confirm in toCase, which is not visible here.
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    // Same delegation as the string overload, for sliceable random access
    // ranges of characters that are not themselves strings.
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}
diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)"; 10279 auto low = s.toLower() , up = s.toUpper(); 10280 auto lowInp = s.dup, upInp = s.dup; 10281 lowInp.toLowerInPlace(); 10282 upInp.toUpperInPlace(); 10283 assert(low == trueLow, format(diff, low, trueLow)); 10284 assert(up == trueUp, format(diff, up, trueUp)); 10285 assert(lowInp == trueLow, 10286 format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow)); 10287 assert(upInp == trueUp, 10288 format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp)); 10289 } 10290 static foreach (S; AliasSeq!(dstring, wstring, string)) 10291 {{ 10292 10293 S easy = "123"; 10294 S good = "abCФеж"; 10295 S awful = "\u0131\u023f\u2126"; 10296 S wicked = "\u0130\u1FE2"; 10297 auto options = [easy, good, awful, wicked]; 10298 S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"]; 10299 S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"]; 10300 10301 foreach (val; [easy, good]) 10302 { 10303 auto e = val.dup; 10304 auto g = e; 10305 e.toUpperInPlace(); 10306 assert(e is g); 10307 e.toLowerInPlace(); 10308 assert(e is g); 10309 } 10310 foreach (i, v; options) 10311 { 10312 doTest(v, upper[i], lower[i]); 10313 } 10314 10315 // a few combinatorial runs 10316 foreach (i; 0 .. options.length) 10317 foreach (j; i .. options.length) 10318 foreach (k; j .. 
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).

    Params:
        c = the code point to classify
    Returns:
        `true` if `c` is alphabetic, `false` otherwise
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // From U+00AA upwards the answer comes from the lookup trie.
    if (c >= 0xAA)
        return alphaTrie[c];

    // Shortcut for everything below: only the ASCII letters qualify.
    immutable isUpperLetter = 'A' <= c && c <= 'Z';
    immutable isLowerLetter = 'a' <= c && c <= 'z';
    return isUpperLetter || isLowerLetter;
}
/++
    Tells whether `c` is a Unicode alphabetic $(CHARACTER) or a number
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // Outside ASCII, combine the two Unicode predicates.
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // ASCII shortcut: std.ascii answers directly.
    return std.ascii.isAlphaNum(c);
}
/++
    Returns whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).

    Params:
        c = the code point to classify
    Returns:
        the result of the generated predicate `isSpaceGen` for `c`
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    // The actual test lives in the pre-generated tables module; this
    // function only forwards to it.
    import std.internal.unicode_tables : isSpaceGen; // generated file
    return isSpaceGen(c);
}
/++
    Returns whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).

    Params:
        c = the code point to classify
    Returns:
        the result of the generated predicate `isControlGen` for `c`
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    // The actual test lives in the pre-generated tables module; this
    // function only forwards to it.
    import std.internal.unicode_tables : isControlGen; // generated file
    return isControlGen(c);
}
/++
    Tells whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs), i.e. lies in U+D800 .. U+DFFF.
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    // U+D800 .. U+DFFF is a block of 0x800 code points that starts on a
    // multiple of 0x800, so clearing the low 11 bits yields 0xD800 for
    // exactly the values in that range and for no others.
    return (c & 0xFFFF_F800) == 0xD800;
}
    // Accessor for the NFC entry of the normalization quick-check tables.
    // The pre-generated entries are imported locally and wrapped by asTrie;
    // the result is kept in a function-local `static immutable`, and a copy
    // of it is returned.
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;
        static immutable res = asTrie(nfcQCTrieEntries);
        return res;
    }
    // Accessor for the grapheme-extend table, one of the tables used by the
    // grapheme breaking algorithm. The pre-generated entries are imported
    // locally and wrapped by asTrie; the result is kept in a function-local
    // `static immutable`, and a copy of it is returned.
    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
        static immutable res = asTrie(graphemeExtendTrieEntries);
        return res;
    }
    // Accessor for the composition jump table, one of the tables used for
    // composition/decomposition. The pre-generated entries are imported
    // locally and wrapped by asTrie; the result is kept in a function-local
    // `static immutable`, and a copy of it is returned.
    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;
        static immutable res = asTrie(compositionJumpTrieEntries);
        return res;
    }