1 // Written in the D programming language. 2 3 /++ 4 $(P The `std.uni` module provides an implementation 5 of fundamental Unicode algorithms and data structures. 6 This doesn't include UTF encoding and decoding primitives, 7 see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) 8 for this functionality. ) 9 10 $(SCRIPT inhibitQuickIndex = 1;) 11 $(DIVC quickindex, 12 $(BOOKTABLE, 13 $(TR $(TH Category) $(TH Functions)) 14 $(TR $(TD Decode) $(TD 15 $(LREF byCodePoint) 16 $(LREF byGrapheme) 17 $(LREF decodeGrapheme) 18 $(LREF graphemeStride) 19 $(LREF popGrapheme) 20 )) 21 $(TR $(TD Comparison) $(TD 22 $(LREF icmp) 23 $(LREF sicmp) 24 )) 25 $(TR $(TD Classification) $(TD 26 $(LREF isAlpha) 27 $(LREF isAlphaNum) 28 $(LREF isCodepointSet) 29 $(LREF isControl) 30 $(LREF isFormat) 31 $(LREF isGraphical) 32 $(LREF isIntegralPair) 33 $(LREF isMark) 34 $(LREF isNonCharacter) 35 $(LREF isNumber) 36 $(LREF isPrivateUse) 37 $(LREF isPunctuation) 38 $(LREF isSpace) 39 $(LREF isSurrogate) 40 $(LREF isSurrogateHi) 41 $(LREF isSurrogateLo) 42 $(LREF isSymbol) 43 $(LREF isWhite) 44 )) 45 $(TR $(TD Normalization) $(TD 46 $(LREF NFC) 47 $(LREF NFD) 48 $(LREF NFKD) 49 $(LREF NormalizationForm) 50 $(LREF normalize) 51 )) 52 $(TR $(TD Decompose) $(TD 53 $(LREF decompose) 54 $(LREF decomposeHangul) 55 $(LREF UnicodeDecomposition) 56 )) 57 $(TR $(TD Compose) $(TD 58 $(LREF compose) 59 $(LREF composeJamo) 60 )) 61 $(TR $(TD Sets) $(TD 62 $(LREF CodepointInterval) 63 $(LREF CodepointSet) 64 $(LREF InversionList) 65 $(LREF unicode) 66 )) 67 $(TR $(TD Trie) $(TD 68 $(LREF codepointSetTrie) 69 $(LREF CodepointSetTrie) 70 $(LREF codepointTrie) 71 $(LREF CodepointTrie) 72 $(LREF toTrie) 73 $(LREF toDelegate) 74 )) 75 $(TR $(TD Casing) $(TD 76 $(LREF asCapitalized) 77 $(LREF asLowerCase) 78 $(LREF asUpperCase) 79 $(LREF isLower) 80 $(LREF isUpper) 81 $(LREF toLower) 82 $(LREF toLowerInPlace) 83 $(LREF toUpper) 84 $(LREF toUpperInPlace) 85 )) 86 $(TR $(TD Utf8Matcher) $(TD 
87 $(LREF isUtfMatcher) 88 $(LREF MatcherConcept) 89 $(LREF utfMatcher) 90 )) 91 $(TR $(TD Separators) $(TD 92 $(LREF lineSep) 93 $(LREF nelSep) 94 $(LREF paraSep) 95 )) 96 $(TR $(TD Building blocks) $(TD 97 $(LREF allowedIn) 98 $(LREF combiningClass) 99 $(LREF Grapheme) 100 )) 101 )) 102 103 $(P All primitives listed operate on Unicode characters and 104 sets of characters. For functions which operate on ASCII characters 105 and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). 106 For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms 107 used throughout this module see the $(S_LINK Terminology, terminology) section 108 below. 109 ) 110 $(P The focus of this module is the core needs of developing Unicode-aware 111 applications. To that effect it provides the following optimized primitives: 112 ) 113 $(UL 114 $(LI Character classification by category and common properties: 115 $(LREF isAlpha), $(LREF isWhite) and others. 116 ) 117 $(LI 118 Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). 119 ) 120 $(LI 121 Converting text to any of the four normalization forms via $(LREF normalize). 122 ) 123 $(LI 124 Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) 125 by user-perceived characters, that is by $(LREF Grapheme) clusters. 126 ) 127 $(LI 128 Decomposing and composing of individual character(s) according to canonical 129 or compatibility rules, see $(LREF compose) and $(LREF decompose), 130 including the specific version for Hangul syllables $(LREF composeJamo) 131 and $(LREF decomposeHangul). 132 ) 133 ) 134 $(P It's recognized that an application may need further enhancements 135 and extensions, such as less commonly known algorithms, 136 or tailoring existing ones for region specific needs. 
To help users 137 with building any extra functionality beyond the core primitives, 138 the module provides: 139 ) 140 $(UL 141 $(LI 142 $(LREF CodepointSet), a type for easy manipulation of sets of characters. 143 Besides the typical set algebra it provides an unusual feature: 144 a D source code generator for detection of $(CODEPOINTS) in this set. 145 This is a boon for meta-programming parser frameworks, 146 and is used internally to power classification in small 147 sets like $(LREF isWhite). 148 ) 149 $(LI 150 A way to construct optimal packed multi-stage tables also known as a 151 special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). 152 The functions $(LREF codepointTrie), $(LREF codepointSetTrie) 153 construct custom tries that map dchar to value. 154 The end result is a fast and predictable $(BIGOH 1) lookup that powers 155 functions like $(LREF isAlpha) and $(LREF combiningClass), 156 but for user-defined data sets. 157 ) 158 $(LI 159 A useful technique for Unicode-aware parsers that perform 160 character classification of encoded $(CODEPOINTS) 161 is to avoid unnecessary decoding at all costs. 162 $(LREF utfMatcher) provides an improvement over the usual workflow 163 of decode-classify-process, combining the decoding and classification 164 steps. By extracting necessary bits directly from encoded 165 $(S_LINK Code unit, code units) matchers achieve 166 significant performance improvements. See $(LREF MatcherConcept) for 167 the common interface of UTF matchers. 168 ) 169 $(LI 170 Generally useful building blocks for customized normalization: 171 $(LREF combiningClass) for querying combining class 172 and $(LREF allowedIn) for testing the Quick_Check 173 property of a given normalization form. 174 ) 175 $(LI 176 Access to a large selection of commonly used sets of $(CODEPOINTS). 177 $(S_LINK Unicode properties, Supported sets) include Script, 178 Block and General Category. 
The exact contents of a set can be 179 observed in the CLDR utility, on the 180 $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page 181 of the Unicode website. 182 See $(LREF unicode) for easy and (optionally) compile-time checked set 183 queries. 184 ) 185 ) 186 $(SECTION Synopsis) 187 --- 188 import std.uni; 189 void main() 190 { 191 // initialize code point sets using script/block or property name 192 // now 'set' contains code points from both scripts. 193 auto set = unicode("Cyrillic") | unicode("Armenian"); 194 // same thing but simpler and checked at compile-time 195 auto ascii = unicode.ASCII; 196 auto currency = unicode.Currency_Symbol; 197 198 // easy set ops 199 auto a = set & ascii; 200 assert(a.empty); // as it has no intersection with ascii 201 a = set | ascii; 202 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 203 204 // some properties of code point sets 205 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 206 // testing presence of a code point in a set 207 // is just fine, it is O(logN) 208 assert(!b['$']); 209 assert(!b['\u058F']); // Armenian dram sign 210 assert(b['¥']); 211 212 // building fast lookup tables, these guarantee O(1) complexity 213 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 214 auto oneTrie = toTrie!1(b); 215 // 2-level far more compact but typically slightly slower 216 auto twoTrie = toTrie!2(b); 217 // 3-level even smaller, and a bit slower yet 218 auto threeTrie = toTrie!3(b); 219 assert(oneTrie['£']); 220 assert(twoTrie['£']); 221 assert(threeTrie['£']); 222 223 // build the trie with the most sensible trie level 224 // and bind it as a functor 225 auto cyrillicOrArmenian = toDelegate(set); 226 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 227 assert(balance == "ընկեր!"); 228 // compatible with bool delegate(dchar) 229 bool delegate(dchar) bindIt = cyrillicOrArmenian; 230 231 // Normalization 232 string s = "Plain ascii (and not only), 
is always normalized!"; 233 assert(s is normalize(s));// is the same string 234 235 string nonS = "A\u0308ffin"; // A ligature 236 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard 237 assert(nS == "Äffin"); 238 assert(nS != nonS); 239 string composed = "Äffin"; 240 241 assert(normalize!NFD(composed) == "A\u0308ffin"); 242 // to NFKD, compatibility decomposition useful for fuzzy matching/searching 243 assert(normalize!NFKD("2¹⁰") == "210"); 244 } 245 --- 246 $(SECTION Terminology) 247 $(P The following is a list of important Unicode notions 248 and definitions. Any conventions used specifically in this 249 module alone are marked as such. The descriptions are based on the formal 250 definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, 251 chapter three of The Unicode Standard Core Specification.) 252 ) 253 $(P $(DEF Abstract character) A unit of information used for the organization, 254 control, or representation of textual data. 255 Note that: 256 $(UL 257 $(LI When representing data, the nature of that data 258 is generally symbolic as opposed to some other 259 kind of data (for example, visual). 260 ) 261 $(LI An abstract character has no concrete form 262 and should not be confused with a $(S_LINK Glyph, glyph). 263 ) 264 $(LI An abstract character does not necessarily 265 correspond to what a user thinks of as a “character” 266 and should not be confused with a $(LREF Grapheme). 267 ) 268 $(LI The abstract characters encoded (see Encoded character) 269 are known as Unicode abstract characters. 270 ) 271 $(LI Abstract characters not directly 272 encoded by the Unicode Standard can often be 273 represented by the use of combining character sequences. 
274 ) 275 ) 276 ) 277 $(P $(DEF Canonical decomposition) 278 The decomposition of a character or character sequence 279 that results from recursively applying the canonical 280 mappings found in the Unicode Character Database 281 and those described in Conjoining Jamo Behavior 282 (section 12 of 283 $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). 284 ) 285 $(P $(DEF Canonical composition) 286 The precise definition of the Canonical composition 287 is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, 288 Unicode Conformance) section 11. 289 Informally it's the process that does the reverse of the canonical 290 decomposition with the addition of certain rules 291 that e.g. prevent legacy characters from appearing in the composed result. 292 ) 293 $(P $(DEF Canonical equivalent) 294 Two character sequences are said to be canonical equivalents if 295 their full canonical decompositions are identical. 296 ) 297 $(P $(DEF Character) Typically differs by context. 298 For the purpose of this documentation the term $(I character) 299 implies $(I encoded character), that is, a code point having 300 an assigned abstract character (a symbolic meaning). 301 ) 302 $(P $(DEF Code point) Any value in the Unicode codespace; 303 that is, the range of integers from 0 to 10FFFF (hex). 304 Not all code points are assigned to encoded characters. 305 ) 306 $(P $(DEF Code unit) The minimal bit combination that can represent 307 a unit of encoded text for processing or interchange. 308 Depending on the encoding this could be: 309 8-bit code units in the UTF-8 (`char`), 310 16-bit code units in the UTF-16 (`wchar`), 311 and 32-bit code units in the UTF-32 (`dchar`). 312 $(I Note that in UTF-32, a code unit is a code point 313 and is represented by the D `dchar` type.) 314 ) 315 $(P $(DEF Combining character) A character with the General Category 316 of Combining Mark(M). 
317 $(UL 318 $(LI All characters with non-zero canonical combining class 319 are combining characters, but the reverse is not the case: 320 there are combining characters with a zero combining class. 321 ) 322 $(LI These characters are not normally used in isolation 323 unless they are being described. They include such characters 324 as accents, diacritics, Hebrew points, Arabic vowel signs, 325 and Indic matras. 326 ) 327 ) 328 ) 329 $(P $(DEF Combining class) 330 A numerical value used by the Unicode Canonical Ordering Algorithm 331 to determine which sequences of combining marks are to be 332 considered canonically equivalent and which are not. 333 ) 334 $(P $(DEF Compatibility decomposition) 335 The decomposition of a character or character sequence that results 336 from recursively applying both the compatibility mappings and 337 the canonical mappings found in the Unicode Character Database, and those 338 described in Conjoining Jamo Behavior until no characters 339 can be further decomposed. 340 ) 341 $(P $(DEF Compatibility equivalent) 342 Two character sequences are said to be compatibility 343 equivalents if their full compatibility decompositions are identical. 344 ) 345 $(P $(DEF Encoded character) An association (or mapping) 346 between an abstract character and a code point. 347 ) 348 $(P $(DEF Glyph) The actual, concrete image of a glyph representation 349 having been rasterized or otherwise imaged onto some display surface. 350 ) 351 $(P $(DEF Grapheme base) A character with the property 352 Grapheme_Base, or any standard Korean syllable block. 353 ) 354 $(P $(DEF Grapheme cluster) Defined as the text between 355 grapheme boundaries as specified by Unicode Standard Annex #29, 356 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). 
357 Important general properties of a grapheme: 358 $(UL 359 $(LI The grapheme cluster represents a horizontally segmentable 360 unit of text, consisting of some grapheme base (which may 361 consist of a Korean syllable) together with any number of 362 nonspacing marks applied to it. 363 ) 364 $(LI A grapheme cluster typically starts with a grapheme base 365 and then extends across any subsequent sequence of nonspacing marks. 366 A grapheme cluster is most directly relevant to text rendering and 367 processes such as cursor placement and text selection in editing, 368 but may also be relevant to comparison and searching. 369 ) 370 $(LI For many processes, a grapheme cluster behaves as if it was a 371 single character with the same properties as its grapheme base. 372 Effectively, nonspacing marks apply $(I graphically) to the base, 373 but do not change its properties. 374 ) 375 ) 376 $(P This module defines a number of primitives that work with graphemes: 377 $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). 378 All of them are using $(I extended grapheme) boundaries 379 as defined in the aforementioned standard annex. 380 ) 381 ) 382 $(P $(DEF Nonspacing mark) A combining character with the 383 General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). 384 ) 385 $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. 386 ) 387 $(SECTION Normalization) 388 $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) 389 or $(S_LINK Compatibility equivalent, compatibility equivalent) 390 characters in the Unicode Standard make it necessary to have a full, formal 391 definition of equivalence for Unicode strings. 392 String equivalence is determined by a process called normalization, 393 whereby strings are converted into forms which are compared 394 directly for identity. 
This is the primary goal of the normalization process, 395 see the function $(LREF normalize) to convert into any of 396 the four defined forms. 397 ) 398 $(P A very important attribute of the Unicode Normalization Forms 399 is that they must remain stable between versions of the Unicode Standard. 400 A Unicode string normalized to a particular Unicode Normalization Form 401 in one version of the standard is guaranteed to remain in that Normalization 402 Form for implementations of future versions of the standard. 403 ) 404 $(P The Unicode Standard specifies four normalization forms. 405 Informally, two of these forms are defined by maximal decomposition 406 of equivalent sequences, and two of these forms are defined 407 by maximal $(I composition) of equivalent sequences. 408 $(UL 409 $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, 410 canonical decomposition) of a character sequence.) 411 $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, 412 compatibility decomposition) of a character sequence.) 413 $(LI Normalization Form C (NFC): The canonical composition of the 414 $(S_LINK Canonical decomposition, canonical decomposition) 415 of a coded character sequence.) 416 $(LI Normalization Form KC (NFKC): The canonical composition 417 of the $(S_LINK Compatibility decomposition, 418 compatibility decomposition) of a character sequence) 419 ) 420 ) 421 $(P The choice of the normalization form depends on the particular use case. 422 NFC is the best form for general text, since it's more compatible with 423 strings converted from legacy encodings. NFKC is the preferred form for 424 identifiers, especially where there are security concerns. NFD and NFKD 425 are the most useful for internal processing. 426 ) 427 $(SECTION Construction of lookup tables) 428 $(P The Unicode standard describes a set of algorithms that 429 depend on having the ability to quickly look up various properties 430 of a code point. 
Given the codespace of about 1 million $(CODEPOINTS), 431 it is not a trivial task to provide a space-efficient solution for 432 the multitude of properties. 433 ) 434 $(P Common approaches such as hash-tables or binary search over 435 sorted code point intervals (as in $(LREF InversionList)) are insufficient. 436 Hash-tables have enormous memory footprint and binary search 437 over intervals is not fast enough for some heavy-duty algorithms. 438 ) 439 $(P The recommended solution (see Unicode Implementation Guidelines) 440 is using multi-stage tables that are an implementation of the 441 $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer 442 keys and a fixed number of stages. For the remainder of the section 443 this will be called a fixed trie. The following describes a particular 444 implementation that is aimed for the speed of access at the expense 445 of ideal size savings. 446 ) 447 $(P Taking a 2-level Trie as an example the principle of operation is as follows. 448 Split the number of bits in a key (code point, 21 bits) into 2 components 449 (e.g. 15 and 8). The first is the number of bits in the index of the trie 450 and the other is the number of bits in each page of the trie. 451 The layout of the trie is then an array of size 2^^bits-of-index followed by 452 an array of memory chunks of size 2^^bits-of-page/bits-per-element. 453 ) 454 $(P The number of pages is variable (but not less than 1) 455 unlike the number of entries in the index. The slots of the index 456 all have to contain a number of a page that is present. The lookup is then 457 just a couple of operations - slice the upper bits, 458 lookup an index for these, take a page at this index and use 459 the lower bits as an offset within this page. 
460 461 Assuming that pages are laid out consecutively 462 in one array at `pages`, the pseudo-code is: 463 ) 464 --- 465 auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; 466 pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; 467 --- 468 $(P Where if `elemsPerPage` is a power of 2 the whole process is 469 a handful of simple instructions and 2 array reads. Subsequent levels 470 of the trie are introduced by recursing on this notion - the index array 471 is treated as values. The number of bits in index is then again 472 split into 2 parts, with pages over 'current-index' and the new 'upper-index'. 473 ) 474 475 $(P For completeness a level 1 trie is simply an array. 476 The current implementation takes advantage of bit-packing values 477 when the range is known to be limited in advance (such as `bool`). 478 See also $(LREF BitPacked) for enforcing it manually. 479 The major size advantage however comes from the fact 480 that multiple $(B identical pages on every level are merged) by construction. 481 ) 482 $(P The process of constructing a trie is more involved and is hidden from 483 the user in the form of the convenience functions $(LREF codepointTrie), 484 $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). 485 In general a set or built-in AA with `dchar` type 486 can be turned into a trie. The trie object in this module 487 is read-only (immutable); it's effectively frozen after construction. 488 ) 489 $(SECTION Unicode properties) 490 $(P This is a full list of Unicode properties accessible through $(LREF unicode) 491 with specific helpers per category nested within. Consult the 492 $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) 493 when in doubt about the contents of a particular set. 494 ) 495 $(P General category sets listed below are only accessible with the 496 $(LREF unicode) shorthand accessor.) 497 $(BOOKTABLE $(B General category ), 498 $(TR $(TH Abb.) $(TH Long form) 499 $(TH Abb.) 
$(TH Long form)$(TH Abb.) $(TH Long form)) 500 $(TR $(TD L) $(TD Letter) 501 $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) 502 $(TR $(TD Ll) $(TD Lowercase_Letter) 503 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) 504 $(TR $(TD Lm) $(TD Modifier_Letter) 505 $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) 506 $(TR $(TD Lo) $(TD Other_Letter) 507 $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) 508 $(TR $(TD Lt) $(TD Titlecase_Letter) 509 $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) 510 $(TR $(TD Lu) $(TD Uppercase_Letter) 511 $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) 512 $(TR $(TD M) $(TD Mark) 513 $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) 514 $(TR $(TD Mc) $(TD Spacing_Mark) 515 $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) 516 $(TR $(TD Me) $(TD Enclosing_Mark) 517 $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) 518 $(TR $(TD Mn) $(TD Nonspacing_Mark) 519 $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) 520 $(TR $(TD C) $(TD Other) 521 $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) 522 $(TR $(TD Cc) $(TD Control) $(TD Pf) 523 $(TD Final_Punctuation) $(TD -) $(TD Any)) 524 $(TR $(TD Cf) $(TD Format) 525 $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) 526 ) 527 $(P Sets for other commonly useful properties that are 528 accessible with $(LREF unicode):) 529 $(BOOKTABLE $(B Common binary properties), 530 $(TR $(TH Name) $(TH Name) $(TH Name)) 531 $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) 532 $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) 533 $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) 534 $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) 535 $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) 536 $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) 537 $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD 
STerm)) 538 $(TR $(TD Deprecated) $(TD Math) $(TD Terminal_Punctuation)) 539 $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) 540 $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) 541 $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) 542 $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) 543 $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) 544 $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) 545 $(TR $(TD Hyphen) $(TD Other_Lowercase) ) 546 $(TR $(TD ID_Continue) $(TD Other_Math) ) 547 ) 548 $(P Below is the table with block names accepted by $(LREF unicode.block). 549 Note that the shorthand version $(LREF unicode) requires "In" 550 to be prepended to the names of blocks so as to disambiguate 551 scripts and blocks. 552 ) 553 $(BOOKTABLE $(B Blocks), 554 $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) 555 $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) 556 $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) 557 $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) 558 $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) 559 $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) 560 $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) 561 $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) 562 $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) 563 $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) 564 $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) 565 $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) 566 $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) 567 $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) 568 $(TR $(TD Avestan) 
$(TD Hangul Compatibility Jamo) $(TD Oriya)) 569 $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) 570 $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) 571 $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) 572 $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) 573 $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) 574 $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) 575 $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) 576 $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) 577 $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) 578 $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) 579 $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) 580 $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) 581 $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) 582 $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) 583 $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) 584 $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) 585 $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) 586 $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) 587 $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) 588 $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) 589 $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) 590 $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) 591 $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) 592 $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) 593 $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) 594 $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) 595 $(TR $(TD CJK Unified Ideographs) 
$(TD Lao) $(TD Supplemental Punctuation)) 596 $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) 597 $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) 598 $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) 599 $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) 600 $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) 601 $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa)) 602 $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) 603 $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) 604 $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) 605 $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) 606 $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) 607 $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) 608 $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) 609 $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) 610 $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) 611 $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) 612 $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) 613 $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) 614 $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) 615 $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) 616 $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) 617 $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) 618 $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) 619 $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) 620 $(TR $(TD Domino 
Tiles) $(TD Miao) $(TD Variation Selectors Supplement)) 621 $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) 622 $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) 623 $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) 624 $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) 625 $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) 626 $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) 627 $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) 628 ) 629 $(P Below is the table with script names accepted by $(LREF unicode.script) 630 and by the shorthand version $(LREF unicode):) 631 $(BOOKTABLE $(B Scripts), 632 $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) 633 $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) 634 $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) 635 $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) 636 $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) 637 $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) 638 $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) 639 $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) 640 $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) 641 $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) 642 $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) 643 $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) 644 $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) 645 $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) 646 $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) 647 $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) 648 $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) 649 $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) 650 $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) 651 $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) 652 $(TR $(TD Cypriot) 
$(TD Lycian) $(TD Tagbanwa)) 653 $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) 654 $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) 655 $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) 656 $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) 657 $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) 658 $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) 659 $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana)) 660 $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) 661 $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) 662 $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) 663 $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) 664 $(TR $(TD Han) $(TD Ogham) $(TD Vai)) 665 $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) 666 ) 667 $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) 668 $(BOOKTABLE $(B Hangul syllable type), 669 $(TR $(TH Abb.) $(TH Long form)) 670 $(TR $(TD L) $(TD Leading_Jamo)) 671 $(TR $(TD LV) $(TD LV_Syllable)) 672 $(TR $(TD LVT) $(TD LVT_Syllable) ) 673 $(TR $(TD T) $(TD Trailing_Jamo)) 674 $(TR $(TD V) $(TD Vowel_Jamo)) 675 ) 676 References: 677 $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), 678 $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), 679 $(HTTP www.unicode.org, The Unicode Consortium), 680 $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), 681 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) 682 $(HTTP www.unicode.org/uni2book/ch05.pdf, 683 Unicode Implementation Guidelines) 684 $(HTTP www.unicode.org/uni2book/ch03.pdf, 685 Unicode Conformance) 686 Trademarks: 687 Unicode(tm) is a trademark of Unicode, Inc. 688 689 Copyright: Copyright 2013 - 690 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 
// Copies `src` into `dest` element by element, starting at the last index and
// finishing at index 0. Both slices must have the same length.
// Callers in this module use it to shift data towards higher indices
// inside a single buffer, where the visiting order matters.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (pos; 0 .. src.length)
        dest[pos] = src[pos];
}

// Copies `src` into `dest` element by element, starting at index 0 and
// finishing at the last index. Both slices must have the same length.
// Callers in this module use it to shift data towards lower indices
// inside a single buffer.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (pos; 0 .. src.length)
        dest[pos] = src[pos];
}
749 public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line. 750 751 // test the intro example 752 @safe unittest 753 { 754 import std.algorithm.searching : find; 755 // initialize code point sets using script/block or property name 756 // set contains code points from both scripts. 757 auto set = unicode("Cyrillic") | unicode("Armenian"); 758 // or simpler and statically-checked look 759 auto ascii = unicode.ASCII; 760 auto currency = unicode.Currency_Symbol; 761 762 // easy set ops 763 auto a = set & ascii; 764 assert(a.empty); // as it has no intersection with ascii 765 a = set | ascii; 766 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 767 768 // some properties of code point sets 769 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 770 // testing presence of a code point in a set 771 // is just fine, it is O(logN) 772 assert(!b['$']); 773 assert(!b['\u058F']); // Armenian dram sign 774 assert(b['¥']); 775 776 // building fast lookup tables, these guarantee O(1) complexity 777 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 778 auto oneTrie = toTrie!1(b); 779 // 2-level far more compact but typically slightly slower 780 auto twoTrie = toTrie!2(b); 781 // 3-level even smaller, and a bit slower yet 782 auto threeTrie = toTrie!3(b); 783 assert(oneTrie['£']); 784 assert(twoTrie['£']); 785 assert(threeTrie['£']); 786 787 // build the trie with the most sensible trie level 788 // and bind it as a functor 789 auto cyrillicOrArmenian = toDelegate(set); 790 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 791 assert(balance == "ընկեր!"); 792 // compatible with bool delegate(dchar) 793 bool delegate(dchar) bindIt = cyrillicOrArmenian; 794 795 // Normalization 796 string s = "Plain ascii (and not only), is always normalized!"; 797 assert(s is normalize(s));// is the same string 798 799 string nonS = "A\u0308ffin"; // A ligature 800 auto nS = normalize(nonS); // to NFC, the W3C 
// Repeats `times` times the bit pattern stored in the low `bits` bits of `val`
// and returns the copies packed next to each other, lowest copy first.
// `val` must not have any bit set above its low `bits` bits.
// For single-bit patterns the result is simply a mask of `times` ones
// (or zero when `val` is zero).
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
            // shift a size_t: an int literal `1` only supports shifts below 32,
            // which breaks times in 32 .. 63 on 64-bit targets
            return val ? (size_t(1) << times) - 1 : 0;
    }
    else static if (times % 2)
        // odd count: replicate times-1 copies, then append one more
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width and halve the repetitions
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
// Per-level length accessors of MultiArray.
// `length!n` reads the number of items stored in level `n`; assigning to it
// grows or shrinks that level inside the shared `storage` block and moves the
// levels that follow it so that `offsets` stay consistent.
template length(size_t n)
{
    @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

    @property void length(size_t new_size)
    {
        if (new_size > sz[n])
        {// extend
            size_t delta = (new_size - sz[n]);
            sz[n] += delta;
            // from here on delta is measured in storage words
            delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
            storage.length += delta;// extend space at end
            // raw_ptr!x must follow the resize as storage could be moved!
            // next stmts move all data past this array, last-one-goes-first
            static if (n != dim-1)
            {
                auto start = raw_ptr!(n+1);
                // len includes delta
                size_t len = (storage.ptr+storage.length-start);

                copyBackwards(start[0 .. len-delta], start[delta .. len]);

                // the words opened up for this level start out zeroed
                start[0 .. delta] = 0;
                // offsets are used for raw_ptr, slice, ptr etc.
                foreach (i; n+1 .. dim)
                    offsets[i] += delta;
            }
        }
        else if (new_size < sz[n])
        {// shrink
            alias words = spaceFor!(bitSizeOf!(Types[n]));
            // Release only the whole words the shorter array no longer touches.
            // Rounding up the *difference* of item counts instead could hand
            // back a word that the remaining items still occupy.
            immutable size_t delta = words(sz[n]) - words(new_size);
            sz[n] = new_size;
            static if (n != dim-1)
            {
                if (delta != 0)
                {
                    // old position of the level that follows this one
                    auto start = raw_ptr!(n+1);
                    immutable size_t len = (storage.ptr+storage.length-start);
                    // slide everything past this array down by delta words;
                    // destination precedes source, so a forward copy is safe
                    copyForward(start[0 .. len], (start - delta)[0 .. len]);

                    // adjust offsets last, they affect raw_ptr
                    foreach (i; n+1 .. dim)
                        offsets[i] -= delta;
                }
            }
            storage.length -= delta;
        }
        // else - NOP
    }
}
n) 1002 assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n])); 1003 } 1004 1005 static void checkB(size_t k, T)(ref T m, int n) 1006 { 1007 foreach (i; 0 .. n) 1008 assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n])); 1009 } 1010 1011 static void fill(size_t k, T)(ref T m, int n) 1012 { 1013 foreach (i; 0 .. n) 1014 m.slice!(k)[i] = force!ubyte(i+1); 1015 } 1016 1017 static void fillB(size_t k, T)(ref T m, int n) 1018 { 1019 foreach (i; 0 .. n) 1020 m.slice!(k)[i] = force!ubyte(n-i); 1021 } 1022 1023 m.length!1 = 100; 1024 fill!1(m, 100); 1025 check!1(m, 100); 1026 1027 m.length!0 = 220; 1028 fill!0(m, 220); 1029 check!1(m, 100); 1030 check!0(m, 220); 1031 1032 m.length!2 = 17; 1033 fillB!2(m, 17); 1034 checkB!2(m, 17); 1035 check!0(m, 220); 1036 check!1(m, 100); 1037 1038 m.length!2 = 33; 1039 checkB!2(m, 17); 1040 fillB!2(m, 33); 1041 checkB!2(m, 33); 1042 check!0(m, 220); 1043 check!1(m, 100); 1044 1045 m.length!1 = 195; 1046 fillB!1(m, 195); 1047 checkB!1(m, 195); 1048 checkB!2(m, 33); 1049 check!0(m, 220); 1050 1051 auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10); 1052 marr.length!0 = 15; 1053 marr.length!1 = 30; 1054 fill!1(marr, 30); 1055 fill!0(marr, 15); 1056 check!1(marr, 30); 1057 check!0(marr, 15); 1058 return 0; 1059 }; 1060 enum ct = dg(); 1061 auto rt = dg(); 1062 } 1063 1064 @system unittest 1065 {// more bitpacking tests 1066 import std.conv : text; 1067 1068 alias Bitty = 1069 MultiArray!(BitPacked!(size_t, 3) 1070 , BitPacked!(size_t, 4) 1071 , BitPacked!(size_t, 3) 1072 , BitPacked!(size_t, 6) 1073 , bool); 1074 alias fn1 = sliceBits!(13, 16); 1075 alias fn2 = sliceBits!( 9, 13); 1076 alias fn3 = sliceBits!( 6, 9); 1077 alias fn4 = sliceBits!( 0, 6); 1078 static void check(size_t lvl, MA)(ref MA arr){ 1079 for (size_t i = 0; i< arr.length!lvl; i++) 1080 assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i])); 1081 } 1082 1083 
// Number of size_t words needed to hold `new_len` items of `_bits` bits each.
// The item width is first rounded up to a power of two (see PackedArrayView);
// items narrower than a word are packed several per word, rounding the word
// count up, while wider items take a whole number of words each.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum size_t wordBits = size_t.sizeof*8;

    static if (_bits == 1)
        enum size_t bits = 1;
    else
        enum size_t bits = nextPow2(_bits - 1);

    static if (bits <= wordBits)
    {
        enum perWord = wordBits/bits; // items stored in one word
        return (new_len + perWord - 1)/perWord; // rounded up
    }
    else
    {
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
}
nextPow2(bits - 1) : 1); 1169 } 1170 1171 struct PackedPtrImpl(T, size_t bits) 1172 { 1173 pure nothrow: 1174 static assert(isPow2OrZero(bits)); 1175 1176 this(inout(size_t)* ptr)inout @safe @nogc 1177 { 1178 origin = ptr; 1179 } 1180 1181 private T simpleIndex(size_t n) inout 1182 { 1183 immutable q = n / factor; 1184 immutable r = n % factor; 1185 return cast(T)((origin[q] >> bits*r) & mask); 1186 } 1187 1188 private void simpleWrite(TypeOfBitPacked!T val, size_t n) 1189 in 1190 { 1191 static if (isIntegral!T) 1192 assert(val <= mask); 1193 } 1194 do 1195 { 1196 immutable q = n / factor; 1197 immutable r = n % factor; 1198 immutable tgt_shift = bits*r; 1199 immutable word = origin[q]; 1200 origin[q] = (word & ~(mask << tgt_shift)) 1201 | (cast(size_t) val << tgt_shift); 1202 } 1203 1204 static if (factor == bytesPerWord// can safely pack by byte 1205 || factor == 1 // a whole word at a time 1206 || ((factor == bytesPerWord/2 || factor == bytesPerWord/4) 1207 && hasUnalignedReads)) // this needs unaligned reads 1208 { 1209 static if (factor == bytesPerWord) 1210 alias U = ubyte; 1211 else static if (factor == bytesPerWord/2) 1212 alias U = ushort; 1213 else static if (factor == bytesPerWord/4) 1214 alias U = uint; 1215 else static if (size_t.sizeof == 8 && factor == bytesPerWord/8) 1216 alias U = ulong; 1217 1218 T opIndex(size_t idx) inout 1219 { 1220 T ret; 1221 version (LittleEndian) 1222 ret = __ctfe ? 
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` items that start `ofs` items past `ptr.origin`;
// every index taken by the public members is relative to the view.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // Views `items` packed values located `offset` items past `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns true when every item in [s, e) is zero.
    // Whole words in the middle of the range are tested with one comparison.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // The range holds no complete word: test item by item.
        // Comparing the rounded-up start (not `s`) matters, otherwise the
        // head loop below would run up to pad_s and read items past `e`.
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Reads item `idx` of the view.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Writes `val` to item `idx` of the view.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Fills items [start, end) of the view with `val`; complete words in the
    // middle are written at once using the replicated bit pattern.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for the offset of the view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Returns a view over items [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `limit` is an item count relative to `ofs`, so the bound must not
        // add `ofs` again (that rejected valid slices of offset views)
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    // Item-wise equality; when both views are word-aligned and span whole
    // words the underlying words are compared directly.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
            return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // round an item index up / down to a multiple of `factor`
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
// Wraps the half-open index window [a, b) of the read-only container that
// `x` points to into a SliceOverIndexed.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    return typeof(return)(a, b, x);
}

// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
// Same as the overload taking const(T)*, but for a mutable container.
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    return typeof(return)(a, b, x);
}
//============================================================================
// Partially unrolled binary search using Shar's method
//============================================================================

// Builds the source text of a `switch` statement that finishes a uniform
// binary search whose remaining step `m` is below `size`.
// The text is meant to be mixed into a scope that provides `range`, `needle`,
// `pred`, `idx` and `m`; every `case` probes one halving step and falls
// through to the next smaller one, `case 0` performs the final probe.
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // bsr(0) is undefined, so a zero step (range of length 1) is mapped
    // to `case 0` explicitly instead of being passed to bsr
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    size_t i = bsr(size);
    foreach_reverse (val; 0 .. bsr(size))
    {
        auto v = 2^^val;
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx += m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}

// True when `sz` is zero or a power of two.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    return (sz & (sz-1)) == 0;
}

// Lower bound by uniform binary search: returns the number of leading
// elements of `range` for which `pred(element, needle)` holds.
// `range.length` must be zero or a power of two.
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    if (range.length == 0) // nothing to probe, range[0] would be out of bounds
        return 0;
    size_t idx = 0, m = range.length/2;
    while (m != 0)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    if (pred(range[idx], needle))
        idx += 1;
    return idx;
}

// Same contract as uniformLowerBound; the last halving steps (below 2^^10)
// are executed by the unrolled switch from genUnrolledSwitchSearch.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    if (range.length == 0) // nothing to probe, range[0] would be out of bounds
        return 0;
    size_t idx = 0, m = range.length/2;
    enum max = 1 << 10;
    while (m >= max)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}

// Extends a power-of-two-only lower bound search `uniLowerBound` to ranges of
// any length by searching one of two power-of-two sized windows.
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
    if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            // The tail holds range.length - n items (between 1 and n-1).
            // nextPow2 yields a power of two strictly greater than its
            // argument, so k covers the tail and never exceeds n; passing
            // range.length - n + 1 overshot the range for lengths 2^^j-1.
            size_t k = nextPow2(range.length - n);
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}

alias sharLowerBound = sharMethod!uniformLowerBound;
alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;

@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (i; 0 .. MAX+5)
    {
        auto st = stdLowerBound(arr, i);
        assert(st == sharLowerBound(arr, i));
        assert(st == sharSwitchLowerBound(arr, i));
    }
    arr = [];
    auto st = stdLowerBound(arr, 33);
    assert(st == sharLowerBound(arr, 33));
    assert(st == sharSwitchLowerBound(arr, 33));

    // every small length, including 1 and the 2^^k-1 lengths
    foreach (len; 1 .. 40)
    {
        auto small = array(iota(0, len * 2, 2));
        foreach (needle; -1 .. len * 2 + 2)
        {
            immutable expected = stdLowerBound(small, needle);
            assert(expected == sharLowerBound(small, needle));
            assert(expected == sharSwitchLowerBound(small, needle));
        }
    }
}
once Allocators are out 1707 //@@@BUG moveFront and friends? dunno, for now it's POD-only 1708 1709 @trusted size_t genericReplace(Policy=void, T, Range) 1710 (ref T dest, size_t from, size_t to, Range stuff) 1711 { 1712 import std.algorithm.mutation : copy; 1713 size_t delta = to - from; 1714 size_t stuff_end = from+stuff.length; 1715 if (stuff.length > delta) 1716 {// replace increases length 1717 delta = stuff.length - delta;// now, new is > old by delta 1718 static if (is(Policy == void)) 1719 dest.length = dest.length+delta;//@@@BUG lame @property 1720 else 1721 dest = Policy.realloc(dest, dest.length+delta); 1722 copyBackwards(dest[to .. dest.length-delta], 1723 dest[to+delta .. dest.length]); 1724 copyForward(stuff, dest[from .. stuff_end]); 1725 } 1726 else if (stuff.length == delta) 1727 { 1728 copy(stuff, dest[from .. to]); 1729 } 1730 else 1731 {// replace decreases length by delta 1732 delta = delta - stuff.length; 1733 copy(stuff, dest[from .. stuff_end]); 1734 copyForward(dest[to .. dest.length], 1735 dest[stuff_end .. 
// Storage manipulation policy backed by the C heap: arrays are obtained
// through enforceMalloc / enforceRealloc and released with pureFree.
// An empty array is always represented by `null`.
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Returns a freshly allocated copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates an uninitialized array of `size` elements.
    // Aborts if the byte count overflows size_t.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        // malloc(0) may legitimately return null, which enforceMalloc would
        // treat as out-of-memory; hand out the empty array directly instead
        // (this mirrors what realloc does for a zero size)
        if (!size)
            return null;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    pure @safe unittest
    {
        assert(ReallocPolicy.alloc!int(0) is null);
        const(int)[] nothing;
        assert(ReallocPolicy.dup(nothing).length == 0);
    }

    // Resizes `arr` to `size` elements and returns the resulting array;
    // a zero `size` frees the memory and yields null.
    // Aborts if the byte count overflows size_t.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // Replaces dest[from .. to] with `stuff`, resizing `dest` through this policy.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Appends a single value converted to T with `force`.
    static void append(T, V)(ref T[] arr, V value)
    if (!isInputRange!V)
    {
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Appends all elements of a range with known length.
    // Aborts if the combined length overflows size_t.
    static void append(T, V)(ref T[] arr, V value)
    if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Frees the memory behind `arr` (if any) and resets it to null.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }
}
with(ReallocPolicy)
    {
        // Applies replaceImpl to `orig[from .. to]` with `toReplace` and reports
        // whether the outcome equals `result`. `orig` is released through
        // ReallocPolicy.destroy when the inner scope exits.
        // `file` and `line` are accepted but not used in the body.
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
            string file = __FILE__, size_t line = __LINE__)
        {
            {
                replaceImpl(orig, from, to, toReplace);
                scope(exit) destroy(orig);
                if (!equal(orig, result))
                    return false;
            }
            return true;
        }
        // Copies the arguments into storage obtained from ReallocPolicy.dup.
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        // insert at the front, erase a prefix, replace everything,
        // replace a prefix with a longer run, replace one element in the middle
        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}

/**
    Tests if `T` is some kind of a set of code points, that is an instance
    of $(LREF InversionList). Intended for template constraints.
*/
public template isCodepointSet(T)
{
    static if (is(T dummy == InversionList!(Args), Args...))
        enum isCodepointSet = true;
    else
        enum isCodepointSet = false;
}

/**
    Tests if `T` is a pair of integers that implicitly convert to `V`.
    The following code must compile for any pair `T`:
    ---
    (T x){ V a = x[0]; V b = x[1];}
    ---
    The following must not compile:
    ---
    (T x){ V c = x[2];}
    ---
*/
public template isIntegralPair(T, V=uint)
{
    enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];}))
        && !is(typeof((T x){ V c = x[2]; }));
}


/**
    The recommended default type for set of $(CODEPOINTS).
    For details, see the current implementation: $(LREF InversionList).
*/
public alias CodepointSet = InversionList!GcPolicy;


//@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
// which relies on std.uni.isGraphical and this chain blows up with Forward reference error
// hence below doesn't seem to work
// public alias CodepointInterval = Tuple!(uint, "a", uint, "b");

/**
    The recommended type of $(REF Tuple, std,_typecons)
    to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
    Any interval type should pass $(LREF isIntegralPair) trait.
*/
public struct CodepointInterval
{
pure:
    // storage for both bounds: [0] is the lower bound, [1] the upper bound
    uint[2] _tuple;
    // forwards indexing (`iv[0]`, `iv[1]`) to the underlying array
    alias _tuple this;

@safe pure nothrow @nogc:

    // builds the pair from its lower and upper bound
    this(uint low, uint high)
    {
        _tuple[0] = low;
        _tuple[1] = high;
    }
    // element-wise comparison against anything indexable with [0] and [1]
    bool opEquals(T)(T val) const
    {
        return this[0] == val[0] && this[1] == val[1];
    }
    // named access to the lower (`a`) and upper (`b`) bound, by reference
    @property ref inout(uint) a() return inout { return _tuple[0]; }
    @property ref inout(uint) b() return inout { return _tuple[1]; }
}

/**
    $(P
    `InversionList` is a set of $(CODEPOINTS)
    represented as an array of open-right [a, b$(RPAREN)
    intervals (see $(LREF CodepointInterval) above).
    The name comes from the way the representation reads left to right.
    For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
    plus a singular value 60 looks like this:
    )
    ---
    10, 50, 60, 61, 80, 90
    ---
    $(P
    The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
    the contrary). Then switch positive/negative after each
    number passed from left to right.
    )
    $(P This way negative spans until 10, then positive until 50,
    then negative until 60, then positive until 61, and so on.
As seen this provides a space-efficient storage of highly redundant data
    that comes in long runs. A description which Unicode $(CHARACTER)
    properties fit nicely. The technique itself could be seen as a variation
    on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
    )

    $(P Sets are value types (just like `int` is) thus they
    are never aliased.
    )
        Example:
        ---
        auto a = CodepointSet('a', 'z'+1);
        auto b = CodepointSet('A', 'Z'+1);
        auto c = a;
        a = a | b;
        assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
        assert(a != c);
        ---
    $(P See also $(LREF unicode) for simpler construction of sets
        from predefined ones.
    )

    $(P Memory usage is 8 bytes per each contiguous interval in a set.
    The value semantics are achieved by using the
    $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
    and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
    )

    Note:
    $(P It's not recommended to rely on the template parameters
    or the exact type of a current $(CODEPOINT) set in `std.uni`.
    The type and parameters may change when the standard
    allocators design is finalized.
    Use $(LREF isCodepointSet) with templates or just stick with the default
    alias $(LREF CodepointSet) throughout the whole code base.
    )
*/
public struct InversionList(SP=GcPolicy)
{
    import std.range : assumeSorted;

    /**
        Construct from another code point set of any type.
    */
    this(Set)(Set set) pure
    if (isCodepointSet!Set)
    {
        uint[] arr;
        // Grow the buffer through the storage policy (as the range constructor
        // does): CowArray.reuse extends it with SP.append, so it must have been
        // allocated by SP and not by the GC.
        foreach (v; set.byInterval)
        {
            SP.append(arr, v.a);
            SP.append(arr, v.b);
        }
        data = CowArray!(SP).reuse(arr);
    }

    /**
        Construct a set from a forward range of code point intervals.
*/
    this(Range)(Range intervals) pure
    if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
    {
        uint[] arr;
        foreach (v; intervals)
        {
            // isIntegralPair only promises that v[0] and v[1] convert to uint,
            // so index the pair instead of relying on `.a`/`.b` members.
            uint low = v[0];
            uint high = v[1];
            SP.append(arr, low);
            SP.append(arr, high);
        }
        data = CowArray!(SP).reuse(arr);
        sanitize(); //enforce invariant: sort intervals etc.
    }

    //helper function that avoids sanity check to be CTFE-friendly
    private static fromIntervals(Range)(Range intervals) pure
    {
        import std.algorithm.iteration : map;
        import std.range : roundRobin;
        // interleave lower and upper bounds into the flat a0, b0, a1, b1, ... layout
        auto flattened = roundRobin(intervals.save.map!"a[0]"(),
            intervals.save.map!"a[1]"());
        InversionList set;
        set.data = CowArray!(SP)(flattened);
        return set;
    }
    //ditto until sort is CTFE-able
    private static fromIntervals()(uint[] intervals...) pure
    in
    {
        import std.conv : text;
        assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
        for (uint i = 0; i < intervals.length; i += 2)
        {
            auto a = intervals[i], b = intervals[i+1];
            // empty (a == b) and reversed (a > b) intervals are both rejected
            assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
        }
    }
    do
    {
        InversionList set;
        set.data = CowArray!(SP)(intervals);
        return set;
    }

    /**
        Construct a set from plain values of code point intervals.
    */
    this()(uint[] intervals...)
    in
    {
        import std.conv : text;
        assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
        for (uint i = 0; i < intervals.length; i += 2)
        {
            auto a = intervals[i], b = intervals[i+1];
            // empty (a == b) and reversed (a > b) intervals are both rejected
            assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
        }
    }
    do
    {
        data = CowArray!(SP)(intervals);
        sanitize(); //enforce invariant: sort intervals etc.
}

    ///
    pure @safe unittest
    {
        import std.algorithm.comparison : equal;

        auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
        foreach (v; 'a'..'z'+1)
            assert(set[v]);
        // Cyrillic lowercase interval
        foreach (v; 'а'..'я'+1)
            assert(set[v]);
        //specific order is not required, intervals may intersect
        auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
        //the same end result
        assert(set2.byInterval.equal(set.byInterval));
        // test constructor this(Range)(Range intervals)
        auto chessPiecesWhite = CodepointInterval(9812, 9818);
        auto chessPiecesBlack = CodepointInterval(9818, 9824);
        auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
        foreach (v; '♔'..'♟'+1)
            assert(set3[v]);
    }

    /**
        Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
    */
    @property auto byInterval() scope
    {
        // TODO: change this to data[] once the -dip1000 errors have been fixed
        // see e.g. https://github.com/dlang/phobos/pull/6638
        import std.array : array;
        return Intervals!(typeof(data.array))(data.array);
    }

    @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.typecons : tuple;

        auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);

        // upper bounds are exclusive, hence 'E' and 'e'
        assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
    }

    // Same intervals as byInterval, but collected eagerly into an array;
    // usable on a const set.
    package(std) @property const(CodepointInterval)[] intervals() const
    {
        import std.array : array;
        return Intervals!(typeof(data[]))(data[]).array;
    }

    /**
        Tests the presence of code point `val` in this set.
2191 */ 2192 bool opIndex(uint val) const 2193 { 2194 // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1 2195 // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1; 2196 return sharSwitchLowerBound!"a <= b"(data[], val) & 1; 2197 } 2198 2199 /// 2200 pure @safe unittest 2201 { 2202 auto gothic = unicode.Gothic; 2203 // Gothic letter ahsa 2204 assert(gothic['\U00010330']); 2205 // no ascii in Gothic obviously 2206 assert(!gothic['$']); 2207 } 2208 2209 2210 // Linear scan for `ch`. Useful only for small sets. 2211 // TODO: 2212 // used internally in std.regex 2213 // should be properly exposed in a public API ? 2214 package(std) auto scanFor()(dchar ch) const 2215 { 2216 immutable len = data.length; 2217 for (size_t i = 0; i < len; i++) 2218 if (ch < data[i]) 2219 return i & 1; 2220 return 0; 2221 } 2222 2223 /// Number of $(CODEPOINTS) in this set 2224 @property size_t length() 2225 { 2226 size_t sum = 0; 2227 foreach (iv; byInterval) 2228 { 2229 sum += iv.b - iv.a; 2230 } 2231 return sum; 2232 } 2233 2234 // bootstrap full set operations from 4 primitives (suitable as a template mixin): 2235 // addInterval, skipUpTo, dropUpTo & byInterval iteration 2236 //============================================================================ 2237 public: 2238 /** 2239 $(P Sets support natural syntax for set algebra, namely: ) 2240 $(BOOKTABLE , 2241 $(TR $(TH Operator) $(TH Math notation) $(TH Description) ) 2242 $(TR $(TD &) $(TD a ∩ b) $(TD intersection) ) 2243 $(TR $(TD |) $(TD a ∪ b) $(TD union) ) 2244 $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) ) 2245 $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. 
(a ∪ b) \ (a ∩ b)) ) 2246 ) 2247 */ 2248 This opBinary(string op, U)(U rhs) 2249 if (isCodepointSet!U || is(U:dchar)) 2250 { 2251 static if (op == "&" || op == "|" || op == "~") 2252 {// symmetric ops thus can swap arguments to reuse r-value 2253 static if (is(U:dchar)) 2254 { 2255 auto tmp = this; 2256 mixin("tmp "~op~"= rhs; "); 2257 return tmp; 2258 } 2259 else 2260 { 2261 static if (is(Unqual!U == U)) 2262 { 2263 // try hard to reuse r-value 2264 mixin("rhs "~op~"= this;"); 2265 return rhs; 2266 } 2267 else 2268 { 2269 auto tmp = this; 2270 mixin("tmp "~op~"= rhs;"); 2271 return tmp; 2272 } 2273 } 2274 } 2275 else static if (op == "-") // anti-symmetric 2276 { 2277 auto tmp = this; 2278 tmp -= rhs; 2279 return tmp; 2280 } 2281 else 2282 static assert(0, "no operator "~op~" defined for Set"); 2283 } 2284 2285 /// 2286 pure @safe unittest 2287 { 2288 import std.algorithm.comparison : equal; 2289 import std.range : iota; 2290 2291 auto lower = unicode.LowerCase; 2292 auto upper = unicode.UpperCase; 2293 auto ascii = unicode.ASCII; 2294 2295 assert((lower & upper).empty); // no intersection 2296 auto lowerASCII = lower & ascii; 2297 assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1))); 2298 // throw away all of the lowercase ASCII 2299 assert((ascii - lower).length == 128 - 26); 2300 2301 auto onlyOneOf = lower ~ ascii; 2302 assert(!onlyOneOf['Δ']); // not ASCII and not lowercase 2303 assert(onlyOneOf['$']); // ASCII and not lowercase 2304 assert(!onlyOneOf['a']); // ASCII and lowercase 2305 assert(onlyOneOf['я']); // not ASCII but lowercase 2306 2307 // throw away all cased letters from ASCII 2308 auto noLetters = ascii - (lower | upper); 2309 assert(noLetters.length == 128 - 26*2); 2310 } 2311 2312 /// The 'op=' versions of the above overloaded operators. 
2313 ref This opOpAssign(string op, U)(U rhs) 2314 if (isCodepointSet!U || is(U:dchar)) 2315 { 2316 static if (op == "|") // union 2317 { 2318 static if (is(U:dchar)) 2319 { 2320 this.addInterval(rhs, rhs+1); 2321 return this; 2322 } 2323 else 2324 return this.add(rhs); 2325 } 2326 else static if (op == "&") // intersection 2327 return this.intersect(rhs);// overloaded 2328 else static if (op == "-") // set difference 2329 return this.sub(rhs);// overloaded 2330 else static if (op == "~") // symmetric set difference 2331 { 2332 auto copy = this & rhs; 2333 this |= rhs; 2334 this -= copy; 2335 return this; 2336 } 2337 else 2338 static assert(0, "no operator "~op~" defined for Set"); 2339 } 2340 2341 /** 2342 Tests the presence of codepoint `ch` in this set, 2343 the same as $(LREF opIndex). 2344 */ 2345 bool opBinaryRight(string op: "in", U)(U ch) const 2346 if (is(U : dchar)) 2347 { 2348 return this[ch]; 2349 } 2350 2351 /// 2352 pure @safe unittest 2353 { 2354 assert('я' in unicode.Cyrillic); 2355 assert(!('z' in unicode.Cyrillic)); 2356 } 2357 2358 2359 2360 /** 2361 * Obtains a set that is the inversion of this set. 2362 * 2363 * See_Also: $(LREF inverted) 2364 */ 2365 auto opUnary(string op: "!")() 2366 { 2367 return this.inverted; 2368 } 2369 2370 /** 2371 A range that spans each $(CODEPOINT) in this set. 
*/
    @property auto byCodepoint()
    {
        // Input range that walks the intervals and yields every code point
        // of each [a, b) in ascending order.
        static struct CodepointRange
        {
            this(This set)
            {
                r = set.byInterval;
                if (!r.empty)
                    cur = r.front.a;
            }

            @property dchar front() const
            {
                return cast(dchar) cur;
            }

            @property bool empty() const
            {
                return r.empty;
            }

            void popFront()
            {
                cur++;
                // ran off the current interval: move on to the next one, if any
                while (cur >= r.front.b)
                {
                    r.popFront();
                    if (r.empty)
                        break;
                    cur = r.front.a;
                }
            }
        private:
            uint cur;
            typeof(This.init.byInterval) r;
        }

        return CodepointRange(this);
    }

    ///
    pure @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.range : iota;

        auto set = unicode.ASCII;
        // the comparison result has to be asserted, otherwise nothing is tested
        assert(set.byCodepoint.equal(iota(0, 0x80)));
    }

    /**
        $(P Obtain textual representation of this set in form of
        open-right intervals and feed it to `sink`.
        )
        $(P Used by various standard formatting facilities such as
        $(REF formattedWrite, std,format), $(REF write, std,stdio),
        $(REF writef, std,stdio), $(REF to, std,conv) and others.
        )
        Example:
        ---
        import std.conv;
        assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
        ---
    */

    private import std.format.spec : FormatSpec;

    /***************************************
     * Obtain a textual representation of this InversionList
     * in form of open-right intervals.
     *
     * The formatting flag is applied individually to each value, for example:
     * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
     * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
     * $(LI $(B %X) formats the intervals as a [low .. 
high$(RPAREN) range of uppercase hex characters) 2448 */ 2449 void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */ 2450 { 2451 import std.format.write : formatValue; 2452 auto range = byInterval; 2453 if (range.empty) 2454 return; 2455 2456 while (1) 2457 { 2458 auto i = range.front; 2459 range.popFront(); 2460 2461 put(sink, "["); 2462 formatValue(sink, i.a, fmt); 2463 put(sink, ".."); 2464 formatValue(sink, i.b, fmt); 2465 put(sink, ")"); 2466 if (range.empty) return; 2467 put(sink, " "); 2468 } 2469 } 2470 2471 /// 2472 pure @safe unittest 2473 { 2474 import std.conv : to; 2475 import std.format : format; 2476 import std.uni : unicode; 2477 2478 // This was originally using Cyrillic script. 2479 // Unfortunately this is a pretty active range for changes, 2480 // and hence broke in an update. 2481 // Therefore the range Basic latin was used instead as it 2482 // unlikely to ever change. 2483 2484 assert(unicode.InBasic_latin.to!string == "[0..128)"); 2485 2486 // The specs '%s' and '%d' are equivalent to the to!string call above. 2487 assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string); 2488 2489 assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)"); 2490 assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)"); 2491 } 2492 2493 pure @safe unittest 2494 { 2495 import std.exception : assertThrown; 2496 import std.format : format, FormatException; 2497 assertThrown!FormatException(format("%z", unicode.ASCII)); 2498 } 2499 2500 2501 /** 2502 Add an interval [a, b$(RPAREN) to this set. 
2503 */ 2504 ref add()(uint a, uint b) 2505 { 2506 addInterval(a, b); 2507 return this; 2508 } 2509 2510 /// 2511 pure @safe unittest 2512 { 2513 CodepointSet someSet; 2514 someSet.add('0', '5').add('A','Z'+1); 2515 someSet.add('5', '9'+1); 2516 assert(someSet['0']); 2517 assert(someSet['5']); 2518 assert(someSet['9']); 2519 assert(someSet['Z']); 2520 } 2521 2522 private: 2523 2524 package(std) // used from: std.regex.internal.parser 2525 ref intersect(U)(U rhs) 2526 if (isCodepointSet!U) 2527 { 2528 Marker mark; 2529 foreach ( i; rhs.byInterval) 2530 { 2531 mark = this.dropUpTo(i.a, mark); 2532 mark = this.skipUpTo(i.b, mark); 2533 } 2534 this.dropUpTo(uint.max, mark); 2535 return this; 2536 } 2537 2538 ref intersect()(dchar ch) 2539 { 2540 foreach (i; byInterval) 2541 if (i.a <= ch && ch < i.b) 2542 return this = This.init.add(ch, ch+1); 2543 this = This.init; 2544 return this; 2545 } 2546 2547 pure @safe unittest 2548 { 2549 assert(unicode.Cyrillic.intersect('-').byInterval.empty); 2550 } 2551 2552 ref sub()(dchar ch) 2553 { 2554 return subChar(ch); 2555 } 2556 2557 // same as the above except that skip & drop parts are swapped 2558 package(std) // used from: std.regex.internal.parser 2559 ref sub(U)(U rhs) 2560 if (isCodepointSet!U) 2561 { 2562 Marker mark; 2563 foreach (i; rhs.byInterval) 2564 { 2565 mark = this.skipUpTo(i.a, mark); 2566 mark = this.dropUpTo(i.b, mark); 2567 } 2568 return this; 2569 } 2570 2571 package(std) // used from: std.regex.internal.parse 2572 ref add(U)(U rhs) 2573 if (isCodepointSet!U) 2574 { 2575 Marker start; 2576 foreach (i; rhs.byInterval) 2577 { 2578 start = addInterval(i.a, i.b, start); 2579 } 2580 return this; 2581 } 2582 2583 // end of mixin-able part 2584 //============================================================================ 2585 public: 2586 /** 2587 Obtains a set that is the inversion of this set. 2588 2589 See the '!' $(LREF opUnary) for the same but using operators. 
2590 */ 2591 @property auto inverted() 2592 { 2593 InversionList inversion = this; 2594 if (inversion.data.length == 0) 2595 { 2596 inversion.addInterval(0, lastDchar+1); 2597 return inversion; 2598 } 2599 if (inversion.data[0] != 0) 2600 genericReplace(inversion.data, 0, 0, [0]); 2601 else 2602 genericReplace(inversion.data, 0, 1, cast(uint[]) null); 2603 if (data[data.length-1] != lastDchar+1) 2604 genericReplace(inversion.data, 2605 inversion.data.length, inversion.data.length, [lastDchar+1]); 2606 else 2607 genericReplace(inversion.data, 2608 inversion.data.length-1, inversion.data.length, cast(uint[]) null); 2609 2610 return inversion; 2611 } 2612 2613 /// 2614 pure @safe unittest 2615 { 2616 auto set = unicode.ASCII; 2617 // union with the inverse gets all of the code points in the Unicode 2618 assert((set | set.inverted).length == 0x110000); 2619 // no intersection with the inverse 2620 assert((set & set.inverted).empty); 2621 } 2622 2623 package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName) 2624 { 2625 import std.algorithm.searching : countUntil; 2626 import std.format : format; 2627 enum maxBinary = 3; 2628 static string linearScope(R)(R ivals, string indent) 2629 { 2630 string result = indent~"{\n"; 2631 string deeper = indent~" "; 2632 foreach (ival; ivals) 2633 { 2634 immutable span = ival[1] - ival[0]; 2635 assert(span != 0); 2636 if (span == 1) 2637 { 2638 result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]); 2639 } 2640 else if (span == 2) 2641 { 2642 result ~= format("%sif (ch == %s || ch == %s) return true;\n", 2643 deeper, ival[0], ival[0]+1); 2644 } 2645 else 2646 { 2647 if (ival[0] != 0) // dchar is unsigned and < 0 is useless 2648 result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]); 2649 result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]); 2650 } 2651 } 2652 result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals 2653 
return result; 2654 } 2655 2656 static string binaryScope(R)(R ivals, string indent) @safe 2657 { 2658 // time to do unrolled comparisons? 2659 if (ivals.length < maxBinary) 2660 return linearScope(ivals, indent); 2661 else 2662 return bisect(ivals, ivals.length/2, indent); 2663 } 2664 2665 // not used yet if/elsebinary search is far better with DMD as of 2.061 2666 // and GDC is doing fine job either way 2667 static string switchScope(R)(R ivals, string indent) 2668 { 2669 string result = indent~"switch (ch){\n"; 2670 string deeper = indent~" "; 2671 foreach (ival; ivals) 2672 { 2673 if (ival[0]+1 == ival[1]) 2674 { 2675 result ~= format("%scase %s: return true;\n", 2676 deeper, ival[0]); 2677 } 2678 else 2679 { 2680 result ~= format("%scase %s: .. case %s: return true;\n", 2681 deeper, ival[0], ival[1]-1); 2682 } 2683 } 2684 result ~= deeper~"default: return false;\n"~indent~"}\n"; 2685 return result; 2686 } 2687 2688 static string bisect(R)(R range, size_t idx, string indent) 2689 { 2690 string deeper = indent ~ " "; 2691 // bisect on one [a, b) interval at idx 2692 string result = indent~"{\n"; 2693 // less branch, < a 2694 result ~= format("%sif (ch < %s)\n%s", 2695 deeper, range[idx][0], binaryScope(range[0 .. idx], deeper)); 2696 // middle point, >= a && < b 2697 result ~= format("%selse if (ch < %s) return true;\n", 2698 deeper, range[idx][1]); 2699 // greater or equal branch, >= b 2700 result ~= format("%selse\n%s", 2701 deeper, binaryScope(range[idx+1..$], deeper)); 2702 return result~indent~"}\n"; 2703 } 2704 2705 string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n", 2706 funcName.empty ? 
"function" : funcName); 2707 // special case first bisection to be on ASCII vs beyond 2708 auto tillAscii = countUntil!"a[0] > 0x80"(range); 2709 if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0) 2710 code ~= binaryScope(range, ""); 2711 else 2712 code ~= bisect(range, tillAscii, ""); 2713 return code; 2714 } 2715 2716 /** 2717 Generates string with D source code of unary function with name of 2718 `funcName` taking a single `dchar` argument. If `funcName` is empty 2719 the code is adjusted to be a lambda function. 2720 2721 The function generated tests if the $(CODEPOINT) passed 2722 belongs to this set or not. The result is to be used with string mixin. 2723 The intended usage area is aggressive optimization via meta programming 2724 in parser generators and the like. 2725 2726 Note: Use with care for relatively small or regular sets. It 2727 could end up being slower then just using multi-staged tables. 2728 2729 Example: 2730 --- 2731 import std.stdio; 2732 2733 // construct set directly from [a, b$RPAREN intervals 2734 auto set = CodepointSet(10, 12, 45, 65, 100, 200); 2735 writeln(set); 2736 writeln(set.toSourceCode("func")); 2737 --- 2738 2739 The above outputs something along the lines of: 2740 --- 2741 bool func(dchar ch) @safe pure nothrow @nogc 2742 { 2743 if (ch < 45) 2744 { 2745 if (ch == 10 || ch == 11) return true; 2746 return false; 2747 } 2748 else if (ch < 65) return true; 2749 else 2750 { 2751 if (ch < 100) return false; 2752 if (ch < 200) return true; 2753 return false; 2754 } 2755 } 2756 --- 2757 */ 2758 string toSourceCode(string funcName="") 2759 { 2760 import std.array : array; 2761 auto range = byInterval.array(); 2762 return toSourceCode(range, funcName); 2763 } 2764 2765 /** 2766 True if this set doesn't contain any $(CODEPOINTS). 
*/
    @property bool empty() const
    {
        return data.length == 0;
    }

    ///
    pure @safe unittest
    {
        CodepointSet emptySet;
        assert(emptySet.length == 0);
        assert(emptySet.empty);
    }

private:
    alias This = typeof(this);
    alias Marker = size_t;

    // A random-access range of integral pairs laid over a flat array of
    // bounds (a0, b0, a1, b1, ...). `start` and `end` are offsets into
    // `slice`, counted in bounds (not pairs), delimiting the live window.
    static struct Intervals(Range)
    {
        import std.range.primitives : hasAssignableElements;

        // view over the whole of `sp`
        this(Range sp) scope
        {
            slice = sp;
            start = 0;
            end = sp.length;
        }

        // view over the bounds sp[s .. e]
        this(Range sp, size_t s, size_t e) scope
        {
            slice = sp;
            start = s;
            end = e;
        }

        @property auto front()const
        {
            immutable a = slice[start];
            immutable b = slice[start+1];
            return CodepointInterval(a, b);
        }

        //may break sorted property - but we need std.sort to access it
        //hence package(std) protection attribute
        static if (hasAssignableElements!Range)
        package(std) @property void front(CodepointInterval val)
        {
            slice[start] = val.a;
            slice[start+1] = val.b;
        }

        @property auto back()const
        {
            immutable a = slice[end-2];
            immutable b = slice[end-1];
            return CodepointInterval(a, b);
        }

        //ditto about package
        static if (hasAssignableElements!Range)
        package(std) @property void back(CodepointInterval val)
        {
            slice[end-2] = val.a;
            slice[end-1] = val.b;
        }

        void popFront()
        {
            start += 2;
        }

        void popBack()
        {
            end -= 2;
        }

        // idx is relative to the current front
        auto opIndex(size_t idx) const
        {
            immutable a = slice[start+idx*2];
            immutable b = slice[start+idx*2+1];
            return CodepointInterval(a, b);
        }

        //ditto about package
        static if (hasAssignableElements!Range)
        package(std) void opIndexAssign(CodepointInterval val, size_t idx)
        {
            slice[start+idx*2] = val.a;
            slice[start+idx*2+1] = val.b;
        }

        // sub-range of pairs [s, e), relative to the current front;
        // shares the underlying storage
        auto opSlice(size_t s, size_t e)
        {
            return Intervals(slice, s*2+start, e*2+start);
        }

        // Number of pairs in the live window. Must follow start/end rather
        // than the backing slice, otherwise sub-slices and partially
        // consumed ranges report the length of the whole array.
        @property size_t length()const {  return (end - start)/2; }

        @property bool empty()const { return start == end; }

        @property auto save(){ return this; }
    private:
        size_t start, end;
        Range slice;
    }

    // called after construction from intervals
    // to make sure invariants hold
    void sanitize()
    {
        import std.algorithm.comparison : max;
        import std.algorithm.mutation : SwapStrategy;
        import std.algorithm.sorting : sort;
        if (data.length == 0)
            return;
        alias Ival = CodepointInterval;
        //intervals wrapper for a _range_ over packed array
        auto ivals = Intervals!(typeof(data[]))(data[]);
        //@@@BUG@@@ can't use "a.a < b.a" see
        // https://issues.dlang.org/show_bug.cgi?id=12265
        sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
        // what follows is a variation on stable remove
        // differences:
        // - predicate is binary, and is tested against
        //   the last kept element (at 'i').
// - predicate mutates lhs (merges rhs into lhs)
        size_t len = ivals.length;
        size_t i = 0;
        size_t j = 1;
        while (j < len)
        {
            if (ivals[i].b >= ivals[j].a)
            {
                ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
                j++;
            }
            else //unmergable
            {
                // check if there is a hole after merges
                // (in the best case we do 0 writes to ivals)
                if (j != i+1)
                    ivals[i+1] = ivals[j]; //copy over
                i++;
                j++;
            }
        }
        len = i + 1;
        for (size_t k=0; k + 1 < len; k++)
        {
            assert(ivals[k].a < ivals[k].b);
            assert(ivals[k].b < ivals[k+1].a);
        }
        data.length = len * 2;
    }

    // Removes the single code point `ch` from this set (no-op if absent).
    // Implemented as subtraction of the interval [ch, ch+1) with the same
    // skipUpTo/dropUpTo pair that set subtraction uses. This covers `ch`
    // being the first code point of an interval (including the very first
    // one), the last one, a whole single-element interval, or strictly inside.
    ref subChar(dchar ch)
    {
        // keep everything below ch; mark is the start of what may be dropped
        immutable mark = skipUpTo(ch);
        // drop [ch, ch+1) starting from mark
        dropUpTo(ch + 1, mark);
        return this;
    }

    //
    Marker addInterval(int a, int b, Marker hint=Marker.init) scope
    in
    {
        assert(a <= b);
    }
    do
    {
        import std.range : assumeSorted, SearchPolicy;
        auto range = assumeSorted(data[]);
        size_t pos;
        size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
        if (a_idx == range.length)
        {
            //  [---+++----++++----++++++]
            //  [                         a  b]
            data.append(a, b);
            return data.length-1;
        }
        size_t b_idx = range[a_idx .. 
range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx; 2957 uint[3] buf = void; 2958 uint to_insert; 2959 debug(std_uni) 2960 { 2961 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2962 } 2963 if (b_idx == range.length) 2964 { 2965 // [-------++++++++----++++++-] 2966 // [ s a b] 2967 if (a_idx & 1)// a in positive 2968 { 2969 buf[0] = b; 2970 to_insert = 1; 2971 } 2972 else// a in negative 2973 { 2974 buf[0] = a; 2975 buf[1] = b; 2976 to_insert = 2; 2977 } 2978 pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]); 2979 return pos - 1; 2980 } 2981 2982 uint top = data[b_idx]; 2983 2984 debug(std_uni) 2985 { 2986 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2987 writefln("a=%s; b=%s; top=%s;", a, b, top); 2988 } 2989 if (a_idx & 1) 2990 {// a in positive 2991 if (b_idx & 1)// b in positive 2992 { 2993 // [-------++++++++----++++++-] 2994 // [ s a b ] 2995 buf[0] = top; 2996 to_insert = 1; 2997 } 2998 else // b in negative 2999 { 3000 // [-------++++++++----++++++-] 3001 // [ s a b ] 3002 if (top == b) 3003 { 3004 assert(b_idx+1 < data.length); 3005 buf[0] = data[b_idx+1]; 3006 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]); 3007 return pos - 1; 3008 } 3009 buf[0] = b; 3010 buf[1] = top; 3011 to_insert = 2; 3012 } 3013 } 3014 else 3015 { // a in negative 3016 if (b_idx & 1) // b in positive 3017 { 3018 // [----------+++++----++++++-] 3019 // [ a b ] 3020 buf[0] = a; 3021 buf[1] = top; 3022 to_insert = 2; 3023 } 3024 else// b in negative 3025 { 3026 // [----------+++++----++++++-] 3027 // [ a s b ] 3028 if (top == b) 3029 { 3030 assert(b_idx+1 < data.length); 3031 buf[0] = a; 3032 buf[1] = data[b_idx+1]; 3033 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]); 3034 return pos - 1; 3035 } 3036 buf[0] = a; 3037 buf[1] = b; 3038 buf[2] = top; 3039 to_insert = 3; 3040 } 3041 } 3042 pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. 
to_insert]); 3043 debug(std_uni) 3044 { 3045 writefln("marker idx: %d; length=%d", pos, data[pos], data.length); 3046 writeln("inserting ", buf[0 .. to_insert]); 3047 } 3048 return pos - 1; 3049 } 3050 3051 // 3052 Marker dropUpTo(uint a, Marker pos=Marker.init) 3053 in 3054 { 3055 assert(pos % 2 == 0); // at start of interval 3056 } 3057 do 3058 { 3059 auto range = assumeSorted!"a <= b"(data[pos .. data.length]); 3060 if (range.empty) 3061 return pos; 3062 size_t idx = pos; 3063 idx += range.lowerBound(a).length; 3064 3065 debug(std_uni) 3066 { 3067 writeln("dropUpTo full length=", data.length); 3068 writeln(pos,"~~~", idx); 3069 } 3070 if (idx == data.length) 3071 return genericReplace(data, pos, idx, cast(uint[])[]); 3072 if (idx & 1) 3073 { // a in positive 3074 //[--+++----++++++----+++++++------...] 3075 // |<---si s a t 3076 genericReplace(data, pos, idx, [a]); 3077 } 3078 else 3079 { // a in negative 3080 //[--+++----++++++----+++++++-------+++...] 3081 // |<---si s a t 3082 genericReplace(data, pos, idx, cast(uint[])[]); 3083 } 3084 return pos; 3085 } 3086 3087 // 3088 Marker skipUpTo(uint a, Marker pos=Marker.init) 3089 out(result) 3090 { 3091 assert(result % 2 == 0);// always start of interval 3092 //(may be 0-width after-split) 3093 } 3094 do 3095 { 3096 assert(data.length % 2 == 0); 3097 auto range = assumeSorted!"a <= b"(data[pos .. 
data.length]);
        size_t idx = pos+range.lowerBound(a).length;

        if (idx >= data.length) // could have Marker point to recently removed stuff
            return data.length;

        if (idx & 1)// inside of interval, check for split
        {

            immutable top = data[idx];
            if (top == a)// no need to split, it's end
                return idx+1;
            immutable start = data[idx-1];
            if (a == start)
                return idx-1;
            // split it up
            genericReplace(data, idx, idx+1, [a, a, top]);
            return idx+1;        // avoid odd index
        }
        return idx;
    }

    CowArray!SP data;
}

pure @safe unittest
{
    import std.conv : to;
    assert(unicode.ASCII.to!string() == "[0..128)");
}

// Byte-by-byte read of the idx-th 24-bit value stored at `ptr`
// (3 bytes per value, byte order selected by version (LittleEndian)).
// Pedantic version for ctfe, and aligned-access only architectures.
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable size_t base = idx * 3;
    immutable uint first = ptr[base];
    immutable uint second = ptr[base+1];
    immutable uint third = ptr[base+2];
    version (LittleEndian)
        return first | (second << 8) | (third << 16);
    else
        return (first << 16) | (second << 8) | third;
}

// Byte-by-byte store of the low 24 bits of `val` into the idx-th slot;
// the counterpart of safeRead24.
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    immutable size_t base = idx * 3;
    immutable ubyte low = val & 0xFF;
    immutable ubyte middle = (val >> 8) & 0xFF;
    immutable ubyte high = (val >> 16) & 0xFF;
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base+1] = middle;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base+1] = middle;
        ptr[base+2] = low;
    }
}

// unaligned x86-like read/write functions
// Loads a whole uint starting at the slot and keeps only the 3 bytes that
// belong to it, so one byte beyond the slot is read as well.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    uint* word = cast(uint*)(ptr+3*idx);
    version (LittleEndian)
        return *word & 0xFF_FFFF;
    else
        return *word >> 8;
}

// ditto
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = 
cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
        *dest = val | (*dest & 0xFF00_0000);
    else
        *dest = (val << 8) | (*dest & 0xFF);
}

// Reads the idx-th 24-bit value: the byte-wise version during CTFE or when
// hasUnalignedReads is false, the unaligned word access otherwise.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx);
    else
        return safeRead24(ptr, idx);
}

// Writes the idx-th 24-bit value; same dispatch as read24.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx);
    else
        return safeWrite24(ptr, val, idx);
}

/*
    Copy-on-write array of `uint` whose storage is obtained through the
    policy `SP` (alloc / realloc / destroy / append).

    The reference count lives in one extra slot right after the payload, so
    `data.length` is always `length + 1` for a non-empty array, while an empty
    array has no storage and no counter at all. Copies share the storage
    (postblit bumps the counter); a mutating access through opIndexAssign, the
    mutable opSlice or the length setter first takes a private copy whenever
    the counter is not 1.
*/
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

@safe:
    // Takes over `arr` as the payload; the ref-count slot (value 1) is
    // appended to it through the policy.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds the array from a range that knows its length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        // an empty range leaves `data` without the ref-count slot,
        // so `$-1` must not be evaluated in that case
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // Builds the array from a forward range; its length is found by walking a saved copy.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // same guard as above: nothing to copy into when the range was empty
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // a copy is one more reference to the same storage
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // the last reference releases the storage, any other just drops its count
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    // Resizes the payload to `len` elements. Zero releases this reference;
    // shared storage is replaced by a private copy of the new size;
    // exclusively owned storage is resized through SP.realloc.
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // un-shares the storage before writing
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // mutable slice: un-shares the storage first, as the caller may write through it
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // read-only slice of the shared storage
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally like in every other member that copies
        import std.algorithm.mutation : copy;
        immutable added = range.length;
        immutable size_t nl = length + added;
        length = nl;
        copy(range, this[nl-added .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        // nothing to add; also keeps `$-1` below from being evaluated
        // on an array that has no ref-count slot
        if (val.length == 0)
            return;
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // payload comparison, the ref-count slot is excluded
    bool opEquals()(auto ref const CowArray rhs) const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Detaches this instance from its storage, destroying the storage
    // if this was the only reference.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Replaces the shared storage with a private copy of the same size.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    uint[] data;
}

pure @safe unittest// Uint24 tests
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
assert(arr[1] == 0xFE_FEFE); 3474 assert(arr[2] == 100); 3475 U24A arr2 = arr; 3476 assert(arr2[0] == 72); 3477 arr2[0] = 11; 3478 // test COW-ness 3479 assert(arr[0] == 72); 3480 assert(arr2[0] == 11); 3481 // set this to about 100M to stress-test COW memory management 3482 foreach (v; 0 .. 10_000) 3483 func2(arr); 3484 assert(equal(arr[], [72, 0xFE_FEFE, 100])); 3485 3486 auto r2 = U24A(iota(0, 100)); 3487 assert(equal(r2[], iota(0, 100)), text(r2[])); 3488 copy(iota(10, 170, 2), r2[10 .. 90]); 3489 assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100))) 3490 , text(r2[])); 3491 }} 3492 } 3493 3494 pure @safe unittest// core set primitives test 3495 { 3496 import std.conv : text; 3497 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); 3498 foreach (CodeList; AllSets) 3499 { 3500 CodeList a; 3501 //"plug a hole" test 3502 a.add(10, 20).add(25, 30).add(15, 27); 3503 assert(a == CodeList(10, 30), text(a)); 3504 3505 auto x = CodeList.init; 3506 x.add(10, 20).add(30, 40).add(50, 60); 3507 3508 a = x; 3509 a.add(20, 49);//[10, 49) [50, 60) 3510 assert(a == CodeList(10, 49, 50 ,60)); 3511 3512 a = x; 3513 a.add(20, 50); 3514 assert(a == CodeList(10, 60), text(a)); 3515 3516 // simple unions, mostly edge effects 3517 x = CodeList.init; 3518 x.add(10, 20).add(40, 60); 3519 3520 a = x; 3521 a.add(10, 25); //[10, 25) [40, 60) 3522 assert(a == CodeList(10, 25, 40, 60)); 3523 3524 a = x; 3525 a.add(5, 15); //[5, 20) [40, 60) 3526 assert(a == CodeList(5, 20, 40, 60)); 3527 3528 a = x; 3529 a.add(0, 10); // [0, 20) [40, 60) 3530 assert(a == CodeList(0, 20, 40, 60)); 3531 3532 a = x; 3533 a.add(0, 5); // prepand 3534 assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a)); 3535 3536 a = x; 3537 a.add(5, 20); 3538 assert(a == CodeList(5, 20, 40, 60)); 3539 3540 a = x; 3541 a.add(3, 37); 3542 assert(a == CodeList(3, 37, 40, 60)); 3543 3544 a = x; 3545 a.add(37, 65); 3546 assert(a == CodeList(10, 20, 37, 65)); 3547 3548 // some tests on 
helpers for set intersection 3549 x = CodeList.init.add(10, 20).add(40, 60).add(100, 120); 3550 a = x; 3551 3552 auto m = a.skipUpTo(60); 3553 a.dropUpTo(110, m); 3554 assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[])); 3555 3556 a = x; 3557 a.dropUpTo(100); 3558 assert(a == CodeList(100, 120), text(a.data[])); 3559 3560 a = x; 3561 m = a.skipUpTo(50); 3562 a.dropUpTo(140, m); 3563 assert(a == CodeList(10, 20, 40, 50), text(a.data[])); 3564 a = x; 3565 a.dropUpTo(60); 3566 assert(a == CodeList(100, 120), text(a.data[])); 3567 } 3568 } 3569 3570 3571 //test constructor to work with any order of intervals 3572 pure @safe unittest 3573 { 3574 import std.algorithm.comparison : equal; 3575 import std.conv : text, to; 3576 import std.range : chain, iota; 3577 import std.typecons : tuple; 3578 //ensure constructor handles bad ordering and overlap 3579 auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1); 3580 foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1))) 3581 assert(ch in c1, to!string(ch)); 3582 3583 //contiguos 3584 assert(CodepointSet(1000, 1006, 1006, 1009) 3585 .byInterval.equal([tuple(1000, 1009)])); 3586 //contains 3587 assert(CodepointSet(900, 1200, 1000, 1100) 3588 .byInterval.equal([tuple(900, 1200)])); 3589 //intersect left 3590 assert(CodepointSet(900, 1100, 1000, 1200) 3591 .byInterval.equal([tuple(900, 1200)])); 3592 //intersect right 3593 assert(CodepointSet(1000, 1200, 900, 1100) 3594 .byInterval.equal([tuple(900, 1200)])); 3595 3596 //ditto with extra items at end 3597 assert(CodepointSet(1000, 1200, 900, 1100, 800, 850) 3598 .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); 3599 assert(CodepointSet(900, 1100, 1000, 1200, 800, 850) 3600 .byInterval.equal([tuple(800, 850), tuple(900, 1200)])); 3601 3602 //"plug a hole" test 3603 auto c2 = CodepointSet(20, 40, 3604 60, 80, 100, 140, 150, 200, 3605 40, 60, 80, 100, 140, 150 3606 ); 3607 assert(c2.byInterval.equal([tuple(20, 200)])); 3608 3609 auto c3 = CodepointSet( 3610 20, 40, 60, 80, 
100, 140, 150, 200, 3611 0, 10, 15, 100, 10, 20, 200, 220); 3612 assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)])); 3613 } 3614 3615 3616 pure @safe unittest 3617 { // full set operations 3618 import std.conv : text; 3619 alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy); 3620 foreach (CodeList; AllSets) 3621 { 3622 CodeList a, b, c, d; 3623 3624 //"plug a hole" 3625 a.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3626 b.add(40, 60).add(80, 100).add(140, 150); 3627 c = a | b; 3628 d = b | a; 3629 assert(c == CodeList(20, 200), text(CodeList.stringof," ", c)); 3630 assert(c == d, text(c," vs ", d)); 3631 3632 b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210); 3633 c = a | b; //[20,45) [60, 85) [95, 140) [150, 210) 3634 d = b | a; 3635 assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c)); 3636 assert(c == d, text(c," vs ", d)); 3637 3638 b = CodeList.init.add(10, 20).add(30,100).add(145,200); 3639 c = a | b;//[10, 140) [145, 200) 3640 d = b | a; 3641 assert(c == CodeList(10, 140, 145, 200)); 3642 assert(c == d, text(c," vs ", d)); 3643 3644 b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220); 3645 c = a | b;//[0, 140) [150, 220) 3646 d = b | a; 3647 assert(c == CodeList(0, 140, 150, 220)); 3648 assert(c == d, text(c," vs ", d)); 3649 3650 3651 a = CodeList.init.add(20, 40).add(60, 80); 3652 b = CodeList.init.add(25, 35).add(65, 75); 3653 c = a & b; 3654 d = b & a; 3655 assert(c == CodeList(25, 35, 65, 75), text(c)); 3656 assert(c == d, text(c," vs ", d)); 3657 3658 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3659 b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180); 3660 c = a & b; 3661 d = b & a; 3662 assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c)); 3663 assert(c == d, text(c," vs ", d)); 3664 3665 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3666 b = CodeList.init.add(10, 30).add(60, 
120).add(135, 160); 3667 c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160) 3668 d = b & a; 3669 3670 assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c)); 3671 assert(c == d, text(c, " vs ",d)); 3672 assert((c & a) == c); 3673 assert((d & b) == d); 3674 assert((c & d) == d); 3675 3676 b = CodeList.init.add(40, 60).add(80, 100).add(140, 200); 3677 c = a & b; 3678 d = b & a; 3679 assert(c == CodeList(150, 200), text(c)); 3680 assert(c == d, text(c, " vs ",d)); 3681 assert((c & a) == c); 3682 assert((d & b) == d); 3683 assert((c & d) == d); 3684 3685 assert((a & a) == a); 3686 assert((b & b) == b); 3687 3688 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3689 b = CodeList.init.add(30, 60).add(75, 120).add(190, 300); 3690 c = a - b;// [30, 40) [60, 75) [120, 140) [150, 190) 3691 d = b - a;// [40, 60) [80, 100) [200, 300) 3692 assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c)); 3693 assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d)); 3694 assert(c - d == c, text(c-d, " vs ", c)); 3695 assert(d - c == d, text(d-c, " vs ", d)); 3696 assert(c - c == CodeList.init); 3697 assert(d - d == CodeList.init); 3698 3699 a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200); 3700 b = CodeList.init.add(10, 50).add(60, 160).add(190, 300); 3701 c = a - b;// [160, 190) 3702 d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300) 3703 assert(c == CodeList(160, 190), text(c)); 3704 assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d)); 3705 assert(c - d == c, text(c-d, " vs ", c)); 3706 assert(d - c == d, text(d-c, " vs ", d)); 3707 assert(c - c == CodeList.init); 3708 assert(d - d == CodeList.init); 3709 3710 a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200); 3711 b = CodeList.init.add(10, 30).add(45, 100).add(130, 190); 3712 c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200) 3713 d = b ~ a; 3714 assert(c == CodeList(10, 
20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200), 3715 text(c)); 3716 assert(c == d, text(c, " vs ", d)); 3717 } 3718 } 3719 3720 } 3721 3722 pure @safe unittest// vs single dchar 3723 { 3724 import std.conv : text; 3725 CodepointSet a = CodepointSet(10, 100, 120, 200); 3726 assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A')); 3727 assert((a & 'B') == CodepointSet(66, 67)); 3728 } 3729 3730 pure @safe unittest// iteration & opIndex 3731 { 3732 import std.algorithm.comparison : equal; 3733 import std.conv : text; 3734 import std.typecons : tuple, Tuple; 3735 3736 static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy))) 3737 {{ 3738 auto arr = "ABCDEFGHIJKLMabcdefghijklm"d; 3739 auto a = CodeList('A','N','a', 'n'); 3740 assert(equal(a.byInterval, 3741 [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')] 3742 ), text(a.byInterval)); 3743 3744 // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ? 3745 version (bug8949) 3746 { 3747 import std.range : retro; 3748 assert(equal(retro(a.byInterval), 3749 [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')] 3750 ), text(retro(a.byInterval))); 3751 } 3752 auto achr = a.byCodepoint; 3753 assert(equal(achr, arr), text(a.byCodepoint)); 3754 foreach (ch; a.byCodepoint) 3755 assert(a[ch]); 3756 auto x = CodeList(100, 500, 600, 900, 1200, 1500); 3757 assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval)); 3758 foreach (ch; x.byCodepoint) 3759 assert(x[ch]); 3760 static if (is(CodeList == CodepointSet)) 3761 { 3762 auto y = CodeList(x.byInterval); 3763 assert(equal(x.byInterval, y.byInterval)); 3764 } 3765 assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[])); 3766 assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[])); 3767 }} 3768 } 3769 3770 //============================================================================ 3771 // Generic Trie template and various ways 
// to build it
//============================================================================

// Debug helper: textual dump of `x`, shortened to the first and the last
// 16 elements once it holds more than 32 of them.
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    enum shown = 16;
    if (x.length <= 2 * shown)
        return text(x);
    return text(x[0 .. shown],"~...~", x[x.length-shown .. x.length]);
}

/**
    Maps `Key` to a suitable integer index within the range of `size_t`.
    The mapping is constructed by applying predicates from `Prefix` left to right
    and concatenating the resulting bits.

    The first (leftmost) predicate defines the most significant bits of
    the resulting index.
 */
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        size_t index;
        // every predicate but the last one: merge its bits in, then make
        // room for the bits of the predicate that follows
        foreach (level, unused; Prefix[0..$-1])
        {
            index |= Prefix[level](key);
            index <<= Prefix[level+1].bitSize;
        }
        // the last predicate fills the least significant bits
        index |= Prefix[$-1](key);
        return index;
    }
}

/*
    `TrieBuilder` is a type used for incremental construction
    of $(LREF Trie)s.

    See $(LREF buildTrie) for generic helpers built on top of it.
*/
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;

    // Default index bound when no explicit upper key is given:
    // the product of 2^^bitSize over all predicates.
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // upper bound rounded up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    // maps a Key to its full-width index
    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    // per-level bookkeeping; size_t.max in a field means "no such page yet"
    // (see the constructor and the idx_zeros checks below)
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // Writes `numVals` copies of `val` at the cursor of `level` and advances
    // the cursor, spilling to the next page every time a page gets full.
    // this function assumes no holes in the input so
    // indices are going one by one
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level; // write cursor of this level
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            // the page just got full
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        // (pageSize is a power of two, so this is the first multiple of it above j)
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n = nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals] = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n] = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            // when an all-default page is already known, whole pages of T.init
            // are added as references to it one level up instead of being written out
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize] = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals] = val;
                j += numVals;
            }
        }
    }

    // Called when the page at `level` has just been filled.
    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        // start of the page that has just been completed
        immutable last = idx!level-pageSize;
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // look for an identical page among the ones stored before it
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // the loop ran to the end: no duplicate found, the page is kept
        if (j == last)
        {
    // NOTE: no goto in this function targets this label or L_know_index below
    L_allocate_page:
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first page recognized by ptr.zeros
            // (presumably an all-zero check - confirm in the slice type)
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
    L_know_index:
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    // message of the exception thrown by putRange / putValue on out-of-order input
    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        All slots prior to `a` are filled with
        the default filler.

        The interval is open on the right: the slot of `b` itself is not written.
        The check is done with `enforce`, so it fails with an exception if the
        index of `a` is below the current position or above the index of `b`.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.

        The check is done with `enforce`, so it fails with an exception if the
        index of `key` is below the current position.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            // pad the rest of the domain with the filler
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                // size_t.max - curIndex fillers plus one more cover the rest of
                // the size_t range; the count itself would not fit in a size_t
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}

/**
    $(P A generic Trie data-structure for a fixed number of stages.
    The design goal is optimal speed with smallest footprint size.
    )
    $(P It's intentionally read-only and doesn't provide constructors.
     To construct one use a special builder,
     see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
        || (isValidPrefixForTrie!(Key, Args[1..$])
            && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;

    // an optional leading size_t argument is the exclusive bound on mapped indices
    static if (is(typeof(Args[0]) : size_t))
    {
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // wraps a table prepared by the builder
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is only enabled in debug builds
        and results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        size_t slot = cast(size_t) Prefix[0](key);
        foreach (lvl, unused; Prefix[0..$-1])
        {
            // the entry found at this level selects a page of the next one,
            // the next predicate selects the offset inside that page
            slot = cast(size_t)((_table.ptr!lvl[slot]<<Prefix[lvl+1].bitSize)
                + Prefix[lvl+1](key));
        }
        return _table.ptr!(Prefix.length-1)[slot];
    }

    ///
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    ///
    @property size_t pages(size_t n)() const
    {
        enum halfPage = 2^^(Prefix[n].bitSize-1);
        enum fullPage = 2^^Prefix[n].bitSize;
        return (bytes!n+halfPage)/fullPage;
    }

    ///
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}

// create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes'
// left-to-right, the most significant bits first
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
        alias GetBitSlicing = AliasSeq!();
    else
    {
        // lowest bit covered by the first piece; the remaining pieces slice below it
        enum size_t low = top - sizes[0];
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(low, top),
                      GetBitSlicing!(low, sizes[1..$]));
    }
}

// callableWith!T is a predicate template: true for `Pred` if it can be called
// with a `T` and the type of the result is bit-packable
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false;
    }
}

/*
    Check if `Prefix` is a valid set of predicates
    for `Trie` template having `Key` as the type of keys.
    This requires all predicates to be callable, take
    single argument of type `Key` and return unsigned value.
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws
}

/*
    Check if `Args` is a set of maximum key value followed by valid predicates
    for `Trie` template having `Key` as the type of keys.

    `Args` is accepted when either every element is a valid predicate for
    `Key`, or the first element is a value convertible to `Key` and the
    remaining elements are valid predicates.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else
    {
        // A single argument can only be a predicate. `Key` must be passed
        // along explicitly: without it the predicate itself would be bound
        // as the key type and the check would succeed vacuously on an
        // empty prefix.
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
}

// Sum of the compile-time integers in `ints`; 0 for an empty tuple.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t count=0;
    foreach (v; ints)
        count += v;
    return count;
}

/**
    A shorthand for creating a custom multi-level fixed Trie
    from a `CodepointSet`. `sizes` are numbers of bits per level,
    with the most significant bits used first.

    Note: The sum of `sizes` must be equal to 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach (line; stdin.byLine)
        {
            int count=0;
            foreach (dchar ch; line)
                if (trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
    if (isCodepointSet!Set)
    {
        // filler is `false`; every interval of the set is marked `true`
        auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false);
        foreach (ival; set.byInterval)
            builder.putRange(ival[0], ival[1], true);
        return builder.build();
    }
}

/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build());
}

/**
    A slightly more general tool for building fixed `Trie`
    for the Unicode data.

    Specifically unlike `codepointSetTrie` it allows creating mappings
    of `dchar` to an arbitrary type `T`.

    Note: Overload taking `CodepointSet`s will naturally convert
    only to bool mapping `Trie`s.

    CodepointTrie is the type of Trie as generated by codepointTrie function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        auto codepointTrie(Set)(const scope Set set)
        if (isCodepointSet!Set)
        {
            // `sizes` has to be forwarded explicitly: `codepointSetTrie`
            // is constrained on the sum of its sizes being 21 and they
            // cannot be deduced from the function argument.
            return codepointSetTrie!sizes(set);
        }
    }

    /// Builds the trie from an associative array; unmapped code points yield `defValue`.
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    /// Builds the trie from an unsorted range of (value, code point) pairs.
    auto codepointTrie(R)(R range, T defValue=T.init)
    if (isInputRange!R
        && is(typeof(ElementType!R.init[0]) : T)
        && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}

@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // pick characters from the Greek script
    auto set = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // here we consider a character lucky
        // if its code point has a lot of identical hex-digits
        // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint value = ch;
        foreach (i; 0 .. 6)
        {
            nibbles[i] = value & 0xF;
            value >>= 4;
        }
        uint luck;
        foreach (n; nibbles)
            luck = cast(uint) max(luck, count(nibbles[], n));
        return luck;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // create a temporary associative array (AA)
    LuckFactor[dchar] map;
    foreach (ch; set.byCodepoint)
        map[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

    // from now on the AA is not needed
    foreach (ch; set.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}

/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build());
}

// Comparator over (Value, Key) tuples: orders two pairs by `Pred`
// applied to their keys (element 1 of each tuple).
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        return Pred(a[1]) < Pred(b[1]);
    }
}

/**
    The most general utility for construction of `Trie`s
    short of using `TrieBuilder` directly.

    Provides a number of convenience overloads.
    `Args` is tuple of maximum key value followed by
    predicates to construct index from key.

    Alternatively if the first argument is not a value convertible to `Key`
    then the whole tuple of `Args` is treated as predicates
    and the maximum Key is deduced from predicates.
*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
    static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
    {
        alias Prefix = Args[1..$];
    }
    else
        alias Prefix = Args;

    alias getIndex = mapTrieIndex!(Prefix);

    // for multi-sort
    // yields cmpK0!(Prefix[0]), ..., cmpK0!(Prefix[n-1])
    template GetComparators(size_t n)
    {
        static if (n > 0)
            alias GetComparators =
                AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
        else
            alias GetComparators = AliasSeq!();
    }

    /*
        Build `Trie` from a range of a Key-Value pairs,
        assuming it is sorted by Key as defined by the following lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        Exception is thrown if it's detected that the above order doesn't hold.

        In other words $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps `Key` to an integer.

        See_Also: $(REF sort, std,_algorithm),
            $(REF SortedRange, std,range),
            $(REF setUnion, std,_algorithm).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        // each element is (value, key): element 1 is the key, element 0 the value
        foreach (v; range)
            builder.putValue(v[1], v[0]);
        return builder.build();
    }

    /*
        If `Value` is bool (or BitPacked!(bool, x)) then it's possible
        to build `Trie` from a range of open-right intervals of `Key`s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Intervals denote ranges of !`filler` i.e. the opposite of filler.
        If no filler provided keys inside of the intervals map to true,
        and `filler` is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value == bool)
            && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (ival; range)
            builder.putRange(ival[0], ival[1], !filler);
        return builder.build();
    }

    /*
        Same input as the Key-Value range overload; when `unsorted` is true
        the range is first sorted in place with `multiSort`, using one
        `cmpK0` comparator per predicate of `Prefix`, and then handed to
        the sorted overload.
    */
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
        if (isInputRange!Range
            && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        import std.algorithm.sorting : multiSort;
        alias Comps = GetComparators!(Prefix.length);
        if (unsorted)
            multiSort!(Comps)(range);
        return buildTrie(range, filler);
    }

    /*
        If `Value` is bool (or BitPacked!(bool, x)) then it's possible
        to build `Trie` simply from an input range of `Key`s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Keys found in range denote !`filler` i.e. the opposite of filler.
        If no filler provided keys map to true, and `filler` is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value == bool)
            && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (v; range)
            builder.putValue(v, !filler);
        return builder.build();
    }

    /*
        If `Key` is unsigned integer `Trie` could be constructed from array
        of values where array index serves as key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
        if (isUnsigned!Key)
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (idx, v; array)
            builder.putValue(idx, v);
        return builder.build();
    }

    /*
        Builds `Trie` from associative array.
        The pairs are copied into an array of (value, key) tuples and
        passed to the overload that sorts its input.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        import std.array : array;
        import std.range : zip;
        auto range = array(zip(map.values, map.keys));
        return buildTrie(range, filler, true); // sort it
    }
}

// helper in place of assumeSize to
//reduce mangled name & help DMD inline Trie functors
// returns its argument unchanged and advertises `bits` as its bit width
struct clamp(size_t bits)
{
    static size_t opCall(T)(T arg){ return arg; }
    enum bitSize = bits;
}

// same as clamp, but returns element `idx` of its argument
struct clampIdx(size_t idx, size_t bits)
{
    static size_t opCall(T)(T arg){ return arg[idx]; }
    enum bitSize = bits;
}

/**
    Conceptual type that outlines the common properties of all UTF Matchers.

    Note: For illustration purposes only, every method
    call results in assertion failure.
    Use $(LREF utfMatcher) to obtain a concrete matcher
    for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
    /**
        $(P Perform a semantic equivalent 2 operations:
        decoding a $(CODEPOINT) at front of `inp` and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on `inp` depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range `inp`
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }
    ///
    pure @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of matcher based on a
        set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any `test`/`match`.

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(0);
        return this;
    }

    pure @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}

/**
    Test if `M` is an UTF Matcher for ranges of code units of type `C`.

    The check requires that `match` and `test` return `bool` both for a
    `C[]` and for the range returned by `decoder`; `skip` is optional, but
    when callable on the decoder range it must return `bool` for both too.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    assert(is(typeof(m.match(d)) == bool));
    assert(is(typeof(m.test(d)) == bool));
    static if (is(typeof(m.skip(d))))
    {
        assert(is(typeof(m.skip(d)) == bool));
        assert(is(typeof(m.skip(s)) == bool));
    }
    assert(is(typeof(m.match(s)) == bool));
    assert(is(typeof(m.test(s)) == bool));
});

pure @safe unittest
{
    alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
    alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
    static assert(isUtfMatcher!(CharMatcher, char));
    static assert(isUtfMatcher!(CharMatcher, immutable(char)));
    static assert(isUtfMatcher!(WcharMatcher, wchar));
    static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
}

// How a matcher primitive advances its input range
enum Mode {
    alwaysSkip,  // used by `skip`: advance regardless of the result
    neverSkip,   // used by `test`: never advance
    skipOnMatch  // used by `match`: advance only when the code point matched
}

// Mixed into matchers so that the `C[]` overloads can reuse the range versions
mixin template ForwardStrings()
{
    // Calls the method named `fn` on `str` reinterpreted in place as the
    // range type that `byCodeUnit` returns for it.
    private bool fwdStr(string fn, C)(ref C[] str) const @trusted
    {
        import std.utf : byCodeUnit;
        alias type = typeof(byCodeUnit(str));
        return mixin(fn~"(*cast(type*)&str)");
    }
}

template Utf8Matcher()
{
    enum validSize(int sz) = sz >= 1 && sz <= 4;

    // reports malformed input by throwing UTFException
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-8 sequence");
    }

    //for 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
    //for 2-stage lookup of 2 byte UTF-8 sequences
    alias Utf8Spec2 = AliasSeq!(bool, char[2],
        clampIdx!(0, 5), clampIdx!(1, 6));
    //ditto for 3 byte
    alias Utf8Spec3 = AliasSeq!(bool, char[3],
        clampIdx!(0, 4),
        clampIdx!(1, 6),
        clampIdx!(2, 6)
    );
    //ditto for 4 byte
    alias Utf8Spec4 = AliasSeq!(bool, char[4],
        clampIdx!(0, 3), clampIdx!(1, 6),
        clampIdx!(2, 6), clampIdx!(3, 6)
    );
    alias Tables = AliasSeq!(
        typeof(TrieBuilder!(AsciiSpec)(false).build()),
        typeof(TrieBuilder!(Utf8Spec2)(false).build()),
        typeof(TrieBuilder!(Utf8Spec3)(false).build()),
        typeof(TrieBuilder!(Utf8Spec4)(false).build())
    );
    // table type for sequences of `size` code units (1-based)
    alias Table(int size) = Tables[size-1];

    // payload bits of the lead byte, e.g. 0x1F for size 2, 0x0F for 3, 0x07 for 4
    enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
    // `size` one-bits at the top of the byte, e.g. 0xC0 for size 2, 0xE0 for 3, 0xF0 for 4
    enum encMask(size_t size) = ((1 << size)-1)<<(8-size);

    // Extracts the payload of a trailing byte: subtracts 0x80 and accepts
    // the result only if it is below 0x40, otherwise reports bad encoding.
    char truncate()(char ch) pure @safe
    {
        ch -= 0x80;
        if (ch < 0x40)
        {
            return ch;
        }
        else
        {
            badEncoding();
            return cast(char) 0;
        }
    }

    // Encodes `ch` as UTF-8 and strips the marker bits, leaving only the
    // payload bits in each of the first `sz` bytes (the trie key format).
    static auto encode(size_t sz)(dchar ch)
        if (sz > 1)
    {
        import std.utf : encodeUTF = encode;
        char[4] buf;
        encodeUTF(buf, ch);
        char[sz] ret;
        buf[0] &= leadMask!sz;
        foreach (n; 1 .. sz)
            buf[n] = buf[n] & 0x3f; //keep 6 lower bits
        ret[] = buf[0 .. sz];
        return ret;
    }

    // Splits `set` by encoded length (1 to 4 code units), builds one trie
    // per length and bundles them into a full matcher.
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto utf8_2 = set & CodepointSet(0x80, 0x800);
        auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
        auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
        auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
        auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
        alias Ret = Impl!(1,2,3,4);
        return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
    }

    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    mixin template DefMatcher()
    {
        import std.format : format;
        import std.meta : Erase, staticIndexOf;
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts
        // NOTE: the generated code refers to the locals `ch`, `inp` and
        // `mode` of the function it is mixed into.
        static auto genDispatch()
        {
            string code;
            foreach (size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        enum dispatch = genDispatch();

        // Advances `inp` past the code point only when it is in the set.
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    immutable r = tab!1[ch];
                    if (r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if (Sizes.length == 4) // can skip iff can detect all encodings
        {
            // Always advances `inp` past the code point; returns whether it is in the set.
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if (hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }
        }

        // Never advances `inp`; returns whether the front code point is in the set.
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];

            static if (hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        // array overloads: forwarded to the range versions via fwdStr
        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }

    struct Impl(Sizes...)
4912 { 4913 import std.meta : allSatisfy, staticMap; 4914 static assert(allSatisfy!(validSize, Sizes), 4915 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4916 private: 4917 //pick tables for chosen sizes 4918 alias OurTabs = staticMap!(Table, Sizes); 4919 OurTabs tables; 4920 mixin DefMatcher; 4921 //static disptach helper UTF size ==> table 4922 alias tab(int i) = tables[i - 1]; 4923 4924 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 4925 { 4926 return CherryPick!(Impl, SizesToPick)(&this); 4927 } 4928 4929 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4930 { 4931 import std.range : popFrontN; 4932 if (inp.length < size) 4933 { 4934 badEncoding(); 4935 return false; 4936 } 4937 char[size] needle = void; 4938 needle[0] = leadMask!size & inp[0]; 4939 static foreach (i; 1 .. size) 4940 { 4941 needle[i] = truncate(inp[i]); 4942 } 4943 //overlong encoding checks 4944 static if (size == 2) 4945 { 4946 //0x80-0x7FF 4947 //got 6 bits in needle[1], must use at least 8 bits 4948 //must use at least 2 bits in needle[1] 4949 if (needle[0] < 2) badEncoding(); 4950 } 4951 else static if (size == 3) 4952 { 4953 //0x800-0xFFFF 4954 //got 6 bits in needle[2], must use at least 12bits 4955 //must use 6 bits in needle[1] or anything in needle[0] 4956 if (needle[0] == 0 && needle[1] < 0x20) badEncoding(); 4957 } 4958 else static if (size == 4) 4959 { 4960 //0x800-0xFFFF 4961 //got 2x6=12 bits in needle[2 .. 
3] must use at least 17bits 4962 //must use 5 bits (or above) in needle[1] or anything in needle[0] 4963 if (needle[0] == 0 && needle[1] < 0x10) badEncoding(); 4964 } 4965 static if (mode == Mode.alwaysSkip) 4966 { 4967 inp.popFrontN(size); 4968 return tab!size[needle]; 4969 } 4970 else static if (mode == Mode.neverSkip) 4971 { 4972 return tab!size[needle]; 4973 } 4974 else 4975 { 4976 static assert(mode == Mode.skipOnMatch); 4977 4978 if (tab!size[needle]) 4979 { 4980 inp.popFrontN(size); 4981 return true; 4982 } 4983 else 4984 return false; 4985 } 4986 } 4987 } 4988 4989 struct CherryPick(I, Sizes...) 4990 { 4991 import std.meta : allSatisfy; 4992 static assert(allSatisfy!(validSize, Sizes), 4993 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4994 private: 4995 I* m; 4996 @property auto tab(int i)() const { return m.tables[i - 1]; } 4997 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4998 { 4999 return m.lookup!(size, mode)(inp); 5000 } 5001 mixin DefMatcher; 5002 } 5003 } 5004 5005 template Utf16Matcher() 5006 { 5007 enum validSize(int sz) = sz >= 1 && sz <= 2; 5008 5009 void badEncoding() pure @safe 5010 { 5011 import std.utf : UTFException; 5012 throw new UTFException("Invalid UTF-16 sequence"); 5013 } 5014 5015 // 1-stage ASCII 5016 alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7); 5017 //2-stage BMP 5018 alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7)); 5019 //4-stage - full Unicode 5020 //assume that 0xD800 & 0xDC00 bits are cleared 5021 //thus leaving 10 bit per wchar to worry about 5022 alias UniSpec = AliasSeq!(bool, wchar[2], 5023 assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4), 5024 assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6), 5025 ); 5026 alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build()); 5027 alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build()); 5028 alias Uni = typeof(TrieBuilder!(UniSpec)(false).build()); 5029 5030 auto encode2(dchar ch) 5031 { 5032 ch 
-= 0x1_0000; 5033 assert(ch <= 0xF_FFFF); 5034 wchar[2] ret; 5035 //do not put surrogate bits, they are sliced off 5036 ret[0] = cast(wchar)(ch >> 10); 5037 ret[1] = (ch & 0xFFF); 5038 return ret; 5039 } 5040 5041 auto build(Set)(Set set) 5042 { 5043 import std.algorithm.iteration : map; 5044 auto ascii = set & unicode.ASCII; 5045 auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1)) 5046 - CodepointSet.fromIntervals(0xD800, 0xDFFF+1); 5047 auto other = set - (bmp | ascii); 5048 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); 5049 auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec); 5050 auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec); 5051 alias Ret = Impl!(1,2); 5052 return Ret(asciiT, bmpT, otherT); 5053 } 5054 5055 //bootstrap full UTF-16 matcher interace from 5056 //sizeFlags, lookupUni and ascii 5057 mixin template DefMatcher() 5058 { 5059 public bool match(Range)(ref Range inp) const 5060 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5061 !isDynamicArray!Range) 5062 { 5063 enum mode = Mode.skipOnMatch; 5064 assert(!inp.empty); 5065 immutable ch = inp[0]; 5066 static if (sizeFlags & 1) 5067 { 5068 if (ch < 0x80) 5069 { 5070 if (ascii[ch]) 5071 { 5072 inp.popFront(); 5073 return true; 5074 } 5075 else 5076 return false; 5077 } 5078 return lookupUni!mode(inp); 5079 } 5080 else 5081 return lookupUni!mode(inp); 5082 } 5083 5084 static if (Sizes.length == 2) 5085 { 5086 public bool skip(Range)(ref Range inp) const 5087 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5088 !isDynamicArray!Range) 5089 { 5090 enum mode = Mode.alwaysSkip; 5091 assert(!inp.empty); 5092 immutable ch = inp[0]; 5093 static if (sizeFlags & 1) 5094 { 5095 if (ch < 0x80) 5096 { 5097 inp.popFront(); 5098 return ascii[ch]; 5099 } 5100 else 5101 return lookupUni!mode(inp); 5102 } 5103 else 5104 return lookupUni!mode(inp); 5105 } 5106 } 5107 5108 public bool test(Range)(ref Range 
inp) const 5109 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5110 !isDynamicArray!Range) 5111 { 5112 enum mode = Mode.neverSkip; 5113 assert(!inp.empty); 5114 auto ch = inp[0]; 5115 static if (sizeFlags & 1) 5116 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp); 5117 else 5118 return lookupUni!mode(inp); 5119 } 5120 5121 bool match(C)(ref C[] str) const 5122 if (isSomeChar!C) 5123 { 5124 return fwdStr!"match"(str); 5125 } 5126 5127 bool skip(C)(ref C[] str) const 5128 if (isSomeChar!C) 5129 { 5130 return fwdStr!"skip"(str); 5131 } 5132 5133 bool test(C)(ref C[] str) const 5134 if (isSomeChar!C) 5135 { 5136 return fwdStr!"test"(str); 5137 } 5138 5139 mixin ForwardStrings; //dispatch strings to range versions 5140 } 5141 5142 struct Impl(Sizes...) 5143 if (Sizes.length >= 1 && Sizes.length <= 2) 5144 { 5145 private: 5146 import std.meta : allSatisfy; 5147 static assert(allSatisfy!(validSize, Sizes), 5148 "Only lengths of 1 and 2 code units are possible in UTF-16"); 5149 static if (Sizes.length > 1) 5150 enum sizeFlags = Sizes[0] | Sizes[1]; 5151 else 5152 enum sizeFlags = Sizes[0]; 5153 5154 static if (sizeFlags & 1) 5155 { 5156 Ascii ascii; 5157 Bmp bmp; 5158 } 5159 static if (sizeFlags & 2) 5160 { 5161 Uni uni; 5162 } 5163 mixin DefMatcher; 5164 5165 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 5166 { 5167 return CherryPick!(Impl, SizesToPick)(&this); 5168 } 5169 5170 bool lookupUni(Mode mode, Range)(ref Range inp) const 5171 { 5172 wchar x = cast(wchar)(inp[0] - 0xD800); 5173 //not a high surrogate 5174 if (x > 0x3FF) 5175 { 5176 //low surrogate 5177 if (x <= 0x7FF) badEncoding(); 5178 static if (sizeFlags & 1) 5179 { 5180 auto ch = inp[0]; 5181 static if (mode == Mode.alwaysSkip) 5182 inp.popFront(); 5183 static if (mode == Mode.skipOnMatch) 5184 { 5185 if (bmp[ch]) 5186 { 5187 inp.popFront(); 5188 return true; 5189 } 5190 else 5191 return false; 5192 } 5193 else 5194 return bmp[ch]; 5195 } 5196 else 
//skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                import std.range : popFrontN;
                static if (sizeFlags & 2)
                {
                    // a high surrogate must be followed by a low surrogate
                    if (inp.length < 2)
                        badEncoding();
                    wchar y = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if (y > 0x3FF)
                        badEncoding();
                    // trie key: the 10 payload bits of each surrogate
                    wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (uni[needle])
                        {
                            inp.popFrontN(2);
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return uni[needle];
                }
                else //ditto
                    return false;
            }
        }
    }

    // Sub-matcher view over an existing matcher `I`, held by pointer.
    // NOTE(review): sizeFlags is copied from `I`, so `Sizes` only feeds the
    // static assert below and the `Sizes.length == 2` check in DefMatcher -
    // confirm that a picked view is not meant to narrow what is matched.
    struct CherryPick(I, Sizes...)
    if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        I* m;
        enum sizeFlags = I.sizeFlags;

        static if (sizeFlags & 1)
        {
            // DefMatcher indexes `ascii` directly, so expose the parent's table
            @property auto ascii()() const { return m.ascii; }
        }

        // plain forward to the parent matcher
        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            return m.lookupUni!mode(inp);
        }
        mixin DefMatcher;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
    }
}

// Builds the UTF-8 flavour of the matcher for `set`.
private auto utf8Matcher(Set)(Set set)
{
    return Utf8Matcher!().build(set);
}

// Builds the UTF-16 flavour of the matcher for `set`.
private auto utf16Matcher(Set)(Set set)
{
    return Utf16Matcher!().build(set);
}

/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    // dispatch on the code unit type; anything but char/wchar is rejected
    // at compile time
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        static assert(false, "UTF-32 needs no decoding,
            and thus not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}


//a range of code units, packed with index to speed up forward iteration
// `s` is the underlying array, `offset` the position iteration starts from.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // underlying code units; only ever shrinks from the back
        size_t idx; // current front position inside `str`
        @property C front(){ return str[idx]; }
        @property C back(){ return str[$-1]; }
        // advancing moves the index and leaves the slice alone
        void popFront(){ idx++; }
        void popBack(){ str = str[0..$-1]; }
        void popFrontN(size_t n){ idx += n; }
        // equality test: an index beyond str.length does not count as empty
        @property bool empty(){ return idx == str.length; }
        @property auto save(){ return this; }
        // indexing, length and slicing are all relative to `idx`
        auto opIndex(size_t i){ return str[idx+i]; }
        @property size_t length(){ return str.length - idx; }
        alias opDollar = length;
        auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}

// Walks a mixed ASCII / non-ASCII string with the full UTF-8 matcher and two
// of its sub-matchers, checking how far each primitive moves the decoder.
pure @safe unittest
{
    string rs = "hi! ネемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 = utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);

    // h
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // i
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);

    // !
assert(!asc.match(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec)); // skip advances even though '!' is no letter
    assert(codec.idx == 3);

    // space
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(codec.idx == 4);

    // the next seven code points are all multi-byte letters
    assert(utf8.test(codec));
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    // a failed match must leave the position untouched
    assert(!utf8.match(codec));
    assert(codec.idx == 2);
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}

// Cross-checks test/match/skip against each other for every third letter
// code point, in both UTF-8 and UTF-16 form.
pure @system unittest
{
    import std.range : stride;
    // Runs all available primitives of `m` on `r`, restoring r.idx between
    // the calls, and returns the common verdict.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        // sub-matchers may lack a range-based skip, so probe for it
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // NOTE(review): same pick as `bmp`; presumably subMatcher!2 was intended
    // for the two-code-unit case - confirm.
    auto nonBmp = utf16.subMatcher!1;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len =
encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        // each sub-matcher only has to succeed for its own encoded length
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}

// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): "\0x00" is the escape \0 followed by the literal text
    // "x00", presumably a slip for "\x00"; every input still begins with the
    // byte under test - confirm before changing the data.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // collectException yields the thrown exception, so the assert fails
        // when nothing was thrown; the message then dumps the input as hex
        assert(collectException((){
            auto s = msg;
            size_t idx = 0;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    // a lone high surrogate and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}

/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint.
) 5467 5468 See the $(S_LINK Synopsis, Synopsis) section for example. 5469 5470 Note: 5471 Level 4 stays very practical (being faster and more predictable) 5472 compared to using direct lookup on the `set` itself. 5473 5474 5475 +/ 5476 public auto toTrie(size_t level, Set)(Set set) 5477 if (isCodepointSet!Set) 5478 { 5479 static if (level == 1) 5480 return codepointSetTrie!(21)(set); 5481 else static if (level == 2) 5482 return codepointSetTrie!(10, 11)(set); 5483 else static if (level == 3) 5484 return codepointSetTrie!(8, 5, 8)(set); 5485 else static if (level == 4) 5486 return codepointSetTrie!(6, 4, 4, 7)(set); 5487 else 5488 static assert(false, 5489 "Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly"); 5490 } 5491 5492 /** 5493 $(P Builds a `Trie` with typically optimal speed-size trade-off 5494 and wraps it into a delegate of the following type: 5495 $(D bool delegate(dchar ch)). ) 5496 5497 $(P Effectively this creates a 'tester' lambda suitable 5498 for algorithms like std.algorithm.find that take unary predicates. ) 5499 5500 See the $(S_LINK Synopsis, Synopsis) section for example. 5501 */ 5502 public auto toDelegate(Set)(Set set) 5503 if (isCodepointSet!Set) 5504 { 5505 // 3 is very small and is almost as fast as 2-level (due to CPU caches?) 5506 auto t = toTrie!3(set); 5507 return (dchar ch) => t[ch]; 5508 } 5509 5510 /** 5511 $(P Opaque wrapper around unsigned built-in integers and 5512 code unit (char/wchar/dchar) types. 5513 Parameter `sz` indicates that the value is confined 5514 to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be 5515 packed more tightly when stored in certain 5516 data-structures like trie. ) 5517 5518 Note: 5519 $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T` 5520 but not vise-versa. Users have to ensure the value fits in 5521 the range required and use the `cast` 5522 operator to perform the conversion.) 
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    enum bitSize = sz;
    T _value;
    alias _value this;
}

/*
    `bitSizeOf` yields the number of bits needed for its single argument:
    the argument's own `bitSize` member when it has one, the `bits` of a
    `BitPacked` return type when the argument is a functor returning one,
    and `sizeof*8` of the argument otherwise.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias Arg = Args[0];
    static if (__traits(compiles, { size_t val = Arg.bitSize; })) //(is(typeof(Arg.bitSize) : size_t))
        enum bitSizeOf = Arg.bitSize;
    else static if (is(ReturnType!Arg == BitPacked!(U, bits), U, size_t bits))
        enum bitSizeOf = bits; // BitPacked!(U, bits).bitSize is `bits` by definition
    else
        enum bitSizeOf = Arg.sizeof*8;
}

/**
    Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}

/**
    Gives the type `U` from $(LREF BitPacked)!(U, x)
    or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
    static if (is(T == BitPacked!(U, bits), U, size_t bits))
        alias TypeOfBitPacked = U;
    else
        alias TypeOfBitPacked = T;
}

/*
    Tags the callable `Fn` with a `bitSize` of `bits`, stating that its
    result always fits in that many bits. Used when describing custom data
    structures with the `Trie` template; calling the wrapper calls `Fn`.
*/
struct assumeSize(alias Fn, size_t bits)
{
    enum bitSize = bits;

    static auto ref opCall(T)(auto ref T arg)
    {
        return Fn(arg);
    }
}

/*
    A helper for defining lambda function that yields a slice
    of certain bits from an unsigned integral value.
    The resulting lambda is wrapped in assumeSize and can be used directly
    with `Trie` template.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    // Returns bits [from, to) of `x`, shifted down so the slice starts at bit 0.
    // NOTE(review): the masks are built from the int literal 1, so this
    // presumably only works for `to` below 32 - confirm no caller needs more.
    static auto opCall(T)(T x)
    out(result)
    {
        // the result must fit in `bitSize` bits
        assert(result < (1 << to-from));
    }
    do
    {
        static assert(from < to);
        static if (from == 0)
            return x & ((1 << to)-1);
        else
        return (x >> from) & ((1<<(to-from))-1);
    }
}

// Lowest byte of `x`, and the byte above it; wrapped below so the trie
// machinery can read their 8-bit size.
@safe pure nothrow @nogc uint low_8(uint x) { return x&0xFF; }
@safe pure nothrow @nogc uint midlow_8(uint x){ return (x&0xFF00)>>8; }
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);

@safe pure nothrow @nogc unittest
{
    static assert(bitSizeOf!lo8 == 8);
    static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
    static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
}

// Compile-time sequence of the integers start, start+1, ..., end-1
// (empty when start >= end).
template Sequence(size_t start, size_t end)
{
    static if (start < end)
        alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
    else
        alias Sequence = AliasSeq!();
}

//---- TRIE TESTS ----
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;
    // Prints per-level footprint of `t`; does nothing unless compiled with
    // version std_uni_stats.
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            static foreach (i; 0 .. t.table.dim)
            {
                writefln("lvl%s = %s bytes; %s pages"
                    , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                static foreach (i; 0 .. t.table.dim-1)
                    writeln(t.table.slice!(i)[0 ..
t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
    // the loops below stop short of 'z' and 'Z' and expect 'Z' itself to be
    // absent
    for (int a='a'; a<'z';a++)
        assert(trie[a]);
    for (int a='A'; a<'Z';a++)
        assert(trie[a]);
    for (int a=0; a<'A'; a++)
        assert(!trie[a]);
    for (int a ='Z'; a<'a'; a++)
        assert(!trie[a]);
    trieStats(trie);

    // two-level trie keyed by (second byte, low byte)
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    // exhaustive comparison against the source set
    foreach (i; 0 .. 1024)
    {
        assert(trie2[i] == (i in redundant2));
    }


    auto redundant3 = Set(
        2, 4, 6, 8, 16,
        2+16, 4+16, 16+6, 16+8, 16+16,
        2+32, 4+32, 32+6, 32+8,
        );

    enum max3 = 256;
    // sliceBits
    // three levels covering bits [6,8), [4,6) and [0,4) of the key
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    // four levels over a 16-bit key space
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    // unlike the earlier loops, this one only checks members of the set
    foreach (i; 0 ..
max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

    // string keys, indexed by their first character only
    alias mapToS = mapTrieIndex!(useItemAt!(0, char));
    string[] redundantS = ["tea", "start", "orange"];
    redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
    auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
    // using first char only
    assert(redundantS == ["orange", "start", "tea"]);
    // "test" shares its first character with "tea", so it is found as well
    assert(strie["test"], text(strie["test"]));
    assert(!strie["aea"]);
    assert(strie["s"]);

    // a bit size test
    // all 256 ubyte values go in, so every lookup must succeed
    auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}

// Key-mapping functor: takes element `idx` of an array of `T`, tagged via
// assumeSize with the full bit width of `T`.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t impl(const scope T[] arr){ return arr[idx]; }
    alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}

// Same as useItemAt, but takes the last element of the array.
template useLastItem(T)
{
    size_t impl(const scope T[] arr){ return arr[$-1]; }
    alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}

// Sum of bitSizeOf over every entry of `Prefix` (0 for an empty list).
template fullBitSize(Prefix...)
{
    static if (Prefix.length > 0)
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
    else
        enum fullBitSize = 0;
}

// Computes the BitPacked index types for every trie level except the last.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough of bits to address the next one
        // The bottom level is known to hold full bit width
        // thus its size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        alias idxTypes =
            AliasSeq!(
                idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
            );
    }
}

//============================================================================

// Three-way comparison of two property names in the manner of `cmp`
// (negative / zero / positive). Before comparing, both names are mapped
// through std.ascii.toLower and stripped of white space, '-' and '_'.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;
    // keeps only the characters that take part in the comparison
    static bool pred(dchar c) {return !c.isWhite && c != '-' && c != '_';}
    return cmp(
        a.map!toLower.filter!pred,
        b.map!toLower.filter!pred);
}

@safe pure unittest
{
    // zero means "equal": case and the dash are both ignored
    assert(!comparePropertyName("foo-bar", "fooBar"));
}

// Strict "less than" predicate built on comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    return comparePropertyName(a, b) < 0;
}

//============================================================================
// Utilities for compression of Unicode code point sets
//============================================================================

// Appends `val` to `arr` in a variable-length form:
//   val < 128    -> 1 byte, the value itself
//   val < 2^^13  -> 2 bytes, lead byte 0b100x_xxxx carrying the top 5 bits
//   val < 2^^21  -> 3 bytes, lead byte 0b101x_xxxx carrying the top 5 bits
// Larger values are rejected by an assert only.
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    if (val < 128)
        arr ~= cast(ubyte) val;
    else if (val < (1 << 13))
    {
        arr ~= (0b1_00 << 5)
| cast(ubyte)(val >> 8);
        arr ~= val & 0xFF;
    }
    else
    {
        // 3-byte form: lead byte 0b101x_xxxx, then two bytes, most significant first
        assert(val < (1 << 21));
        arr ~= (0b1_01 << 5) | cast(ubyte)(val >> 16);
        arr ~= (val >> 8) & 0xFF;
        arr ~= val & 0xFF;
    }
}

// Decodes one value written by compressTo, starting at arr[idx].
//
// Params:
//   arr = the compressed byte stream
//   idx = read position; advanced past the decoded value on success
// Returns: the decoded value.
// Throws: Exception ("bad code point interval encoding") when the stream
//   ends before the value is complete, including when `idx` is already at
//   or past the end of `arr`.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    // The lead byte gets the same truncation check as the continuation
    // bytes, so a short stream is reported uniformly instead of tripping
    // the array bounds check.
    enforce(idx < arr.length, "bad code point interval encoding");
    immutable first = arr[idx++];
    if (!(first & 0x80)) // no top bit -> [0 .. 127]
        return first;
    // bit 5 of the lead byte selects one or two continuation bytes
    immutable extra = ((first >> 5) & 1) + 1; // [1, 2]
    // the low 5 bits of the lead byte are the most significant payload bits
    uint val = (first & 0x1F);
    enforce(idx + extra <= arr.length, "bad code point interval encoding");
    foreach (j; 0 .. extra)
        val = (val << 8) | arr[idx+j];
    idx += extra;
    return val;
}


// Encodes a range of (start, end) pairs as a byte stream of deltas.
// Each boundary is stored via compressTo as its distance from the previous
// boundary (starting from 0). An end equal to lastDchar+1 is not stored at
// all, which leaves the stream with an odd number of values.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] storage;
    uint base = 0;
    // RLE encode
    foreach (val; intervals)
    {
        compressTo(val[0]-base, storage);
        base = val[0];
        if (val[1] != lastDchar+1) // till the end of the domain so don't store it
        {
            compressTo(val[1]-base, storage);
            base = val[1];
        }
    }
    return storage;
}

@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd number of values
    assert(compressIntervals(run2) == enc2);
    size_t idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
assert(decompressFrom(enc2, idx) == 0);
    assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
    // round trip through the lazy decoder
    assert(equal(decompressIntervals(compressIntervals(run)), run));
    assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}

// Creates a range of `CodepointInterval` that lazily decodes compressed data.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
    return DecompressedIntervals(data);
}

// Forward range over the intervals stored in a stream produced by
// compressIntervals. One interval is decoded ahead of time and cached.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // compressed input
    size_t _idx;              // read position; size_t.max marks exhaustion
    CodepointInterval _front; // the interval currently at the front

    this(const(ubyte)[] stream)
    {
        _stream = stream;
        // decode the first interval (or become empty right away)
        popFront();
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        // nothing left to decode: switch to the "empty" sentinel
        if (_idx == _stream.length)
        {
            _idx = size_t.max;
            return;
        }
        // values are deltas: the start is relative to the previous end,
        // the end is relative to the start
        uint base = _front[1];
        _front[0] = base + decompressFrom(_stream, _idx);
        if (_idx == _stream.length)// odd length ---> till the end
            _front[1] = lastDchar+1;
        else
        {
            base = _front[0];
            _front[1] = base + decompressFrom(_stream, _idx);
        }
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property DecompressedIntervals save() return scope { return this; }
}

@safe pure nothrow @nogc unittest
{
    static assert(isInputRange!DecompressedIntervals);
    static assert(isForwardRange!DecompressedIntervals);
}

//============================================================================

version (std_uni_bootstrap){}
else
{

// helper for looking up code point sets
// Returns the index of the entry of `table` whose name equals `name` under
// comparePropertyName, or -1 when there is none. The lookup treats the
// table as sorted by propertyNameLess.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto range = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
size_t idx = range.lowerBound(name).length;
    // lowerBound gives the entries strictly less than `name`, so a match,
    // if any, sits right after them
    if (idx < range.length && comparePropertyName(range[idx], name) == 0)
        return idx;
    return -1;
}

// another one that loads it
// Looks `name` up in `table`; on success stores the set built from the
// entry's compressed data in `dest` and returns true. Returns false and
// leaves `dest` alone when the name is not found.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    auto idx = findUnicodeSet!table(name);
    if (idx >= 0)
    {
        dest = Set(asSet(table[idx].compressed));
        return true;
    }
    return false;
}

// Loads the property called `name` into `target`. Names are matched with
// comparePropertyName. Cumulative categories and a few special names are
// assembled here by hand; any other name is looked up in uniProps.tab.
// Returns true when `target` was set.
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
    }
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
    }
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
    }
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
    }
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
    }
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        target = asSet(uniProps.Cc);
        target |= asSet(uniProps.Cf);
        target |= asSet(uniProps.Cs);
        target |= asSet(uniProps.Co);
        target |= asSet(uniProps.Cn);
    }
    else if (ucmp(name, "graphical") == 0)
    {
        // union of Alphabetic, all marks, numbers and punctuation,
        // space separators and all symbols
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        // not a hand-made name: fall back to the generated table
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}

// CTFE-only helper for checking property names at compile-time
// Returns true when `name` is one of the names loadProperty assembles by
// hand. Matching uses comparePropertyName.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    import std.algorithm.searching : find;
    // Keep this list in step with the hand-written branches of loadProperty:
    // "C" / "Other" are handled there as well, so they are listed here too.
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "C", "Other",
        "Graphical",
        "any",
        "ascii"
    ];
    auto x = find!(x => comparePropertyName(x, name) ==
0)(names);
    return !x.empty;
}

// ditto, CTFE-only, not optimized
// True when `table` has an entry named `name`.
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    return findUnicodeSet!table(name) >= 0;
}

// Name-based access to the sets of one generated `table`; `kind` is only
// used to word the error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    // Throws an Exception when the table has no set called `name`.
    static auto opCall(C)(const scope C[] name)
    if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet set;
        if (loadUnicodeSet!table(name, set))
            return set;
        throw new Exception("No unicode set for "~kind~" by name "
            ~name.to!string()~" was found.");
    }
    /// Compile-time checked search.
    // An unknown name is a compile-time error instead of an exception.
    static @property auto opDispatch(string name)()
    {
        static if (findSetName!table(name))
        {
            CodepointSet set;
            loadUnicodeSet!table(name, set);
            return set;
        }
        else
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
    }
}

// Characters that need escaping in string posed as regular expressions
package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
    ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}', '~');

// Evaluates the D expression `expr` (mixed in as code). During CTFE it is
// evaluated on every call; at run time it is evaluated on the first call
// and the stored result is returned from then on.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (__ctfe)
        return mixin(expr);
    alias T = typeof(mixin(expr));
    // one slot and flag per distinct `expr`, as these statics belong to
    // the template instance
    static T slot;
    static bool initialized;
    if (!initialized)
    {
        slot = mixin(expr);
        initialized = true;
    }
    return slot;
}

//property for \w character class
package(std) @property CodepointSet wordCharacter() @safe
{
    return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc")();
}

//basic stack, just in case it gets used anywhere else than Parser
package(std) struct Stack(T)
{
@safe:
    T[] data; // elements, with the top of the stack at the end
    @property bool empty(){ return data.empty; }

@property size_t length(){ return data.length; } 6167 6168 void push(T val){ data ~= val; } 6169 6170 @trusted T pop() 6171 { 6172 assert(!empty); 6173 auto val = data[$ - 1]; 6174 data = data[0 .. $ - 1]; 6175 if (!__ctfe) 6176 cast(void) data.assumeSafeAppend(); 6177 return val; 6178 } 6179 6180 @property ref T top() 6181 { 6182 assert(!empty); 6183 return data[$ - 1]; 6184 } 6185 } 6186 6187 //test if a given string starts with hex number of maxDigit that's a valid codepoint 6188 //returns it's value and skips these maxDigit chars on success, throws on failure 6189 package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit) 6190 { 6191 import std.exception : enforce; 6192 //std.conv.parse is both @system and bogus 6193 uint val; 6194 for (int k = 0; k < maxDigit; k++) 6195 { 6196 enforce(!str.empty, "incomplete escape sequence"); 6197 //accepts ascii only, so it's OK to index directly 6198 immutable current = str.front; 6199 if ('0' <= current && current <= '9') 6200 val = val * 16 + current - '0'; 6201 else if ('a' <= current && current <= 'f') 6202 val = val * 16 + current -'a' + 10; 6203 else if ('A' <= current && current <= 'F') 6204 val = val * 16 + current - 'A' + 10; 6205 else 6206 throw new Exception("invalid escape sequence"); 6207 str.popFront(); 6208 } 6209 enforce(val <= 0x10FFFF, "invalid codepoint"); 6210 return val; 6211 } 6212 6213 @safe unittest 6214 { 6215 import std.algorithm.searching : canFind; 6216 import std.exception : collectException; 6217 string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; 6218 string[] hex = [ "01", "ff", "00af", "10FFFF" ]; 6219 int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; 6220 foreach (v; non_hex) 6221 assert(collectException(parseUniHex(v, v.length)).msg 6222 .canFind("invalid escape sequence")); 6223 foreach (i, v; hex) 6224 assert(parseUniHex(v, v.length) == value[i]); 6225 string over = "0011FFFF"; 6226 assert(collectException(parseUniHex(over, over.length)).msg 6227 .canFind("invalid codepoint")); 6228 
}

// Returns `set` extended with the simple case foldings of every code point
// it shares with unicode.LC.
auto caseEnclose(CodepointSet set)
{
    auto cased = set & unicode.LC;
    foreach (dchar ch; cased.byCodepoint)
    {
        foreach (c; simpleCaseFoldings(ch))
            set |= c;
    }
    return set;
}

/+
    fetch codepoint set corresponding to a name (InBlock or binary property)
    `casefold` applies caseEnclose first, `negated` then inverts the result.
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated, bool casefold) @safe
{
    CodepointSet s = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
       s = caseEnclose(s);
    if (negated)
        s = s.inverted;
    return s;
}

// Parser for regex-style '[...]' code point set expressions over `Range`.
// It exposes the input-range primitives itself so that the escape helpers
// (parseUniHex, unicode.parseControlCode, unicode.parsePropertySpec) can
// consume input through it.
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // adds `ch`, or all of its simple case foldings when case folding is on
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto foldings = simpleCaseFoldings(ch);
                foreach (v; foldings)
                    set |= v;
            }
            else
                set |= ch;
        }

        // adds the inclusive range [lo, hi], honouring case folding the same
        // way for literal and for escaped range bounds
        void addRangeWithFlags(ref CodepointSet set, uint lo, uint hi)
        {
            if (casefold_)
            {
                for (uint ch = lo; ch <= hi; ch++)
                    addWithFlags(set, ch);
            }
            else
                set.add(lo, hi + 1);
        }

        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // the pending char must go through case folding like in
                    // every other branch of this state
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    enforce(last <= front, "inverted range");
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                enforce(last <= end,"inverted range");
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses a complete '[...]' expression (nesting and set operators
    // included) and returns the resulting set. Throws on malformed input.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        // applies `op` to the operand(s) on top of `stack`;
        // returns false for operators that cannot be applied (Open, None)
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // pops and applies operators while `cond` holds for the top one;
        // returns false on a syntax error or when the operator stack runs dry
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                auto pair  = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}

/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
This includes fuzzy matching of names, so that
    'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
    and yield the same set of white space $(CHARACTERS).
*/
@safe public struct unicode
{
    import std.exception : enforce;
    /**
        Performs the lookup of set of $(CODEPOINTS)
        with compile-time correctness checking.
        This short-cut version combines 3 searches:
        across blocks, scripts, and common binary properties.

        Note that since scripts and blocks overlap the
        usual trick to disambiguate is used - to get a block use
        `unicode.InBlockName`, to search a script
        use `unicode.ScriptName`.

        See_Also: $(LREF block), $(LREF script)
        and (not included in this search) $(LREF hangulSyllableType).
    */

    static @property auto opDispatch(string name)() pure
    {
        static if (findAny(name))
            return loadAny(name);
        else
            static assert(false, "No unicode set by name "~name~" was found.");
    }

    ///
    @safe unittest
    {
        import std.exception : collectException;
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
    }

    /**
        The same lookup across blocks, scripts, or binary properties,
        but performed at run-time.
        This version is provided for cases where `name`
        is not known beforehand; otherwise compile-time
        checked $(LREF opDispatch) is typically a better choice.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        return loadAny(name);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.

        Note:
        Here block names are unambiguous as no scripts are searched
        and thus to search use simply `unicode.block.BlockName` notation.

        See $(S_LINK Unicode properties, table of properties) for available sets.
        See_Also: $(S_LINK Unicode properties, table of properties).
    */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        mixin SetSearcher!(blocks.tab, "block");
    }

    ///
    @safe unittest
    {
        // use .block for explicitness
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }

    /**
        Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        mixin SetSearcher!(scripts.tab, "script");
    }

    ///
    @safe unittest
    {
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        // (U+0601 ARABIC SIGN SANAH belongs to both)
        assert(arabicBlock['\u0601']);
        assert(arabicScript['\u0601']);
        // but they are different
        assert(arabicBlock != arabicScript);
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }

    /**
        Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.

        Other non-binary properties (once supported) follow the same
        notation - `unicode.propertyName.propertyValue` for compile-time
        checked access and `unicode.propertyName(propertyValue)`
        for run-time checked one.

        See the $(S_LINK Unicode properties, table of properties) for available
        sets.
    */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }

    ///
    @safe unittest
    {
        // L here is syllable type not Letter as in unicode.L short-cut
        auto leadingVowel = unicode.hangulSyllableType("L");
        // check that some leading vowels are present
        foreach (vowel; '\u1110'..'\u115F')
            assert(leadingVowel[vowel]);
        assert(leadingVowel == unicode.hangulSyllableType.L);
    }

    //parse control code of form \cXXX, c assumed to be the current symbol
    //returns the letter that follows masked with 0x1f; that letter is left
    //as the current symbol
    static package(std) dchar parseControlCode(Parser)(ref Parser p)
    {
        with(p)
        {
            popFront();
            enforce(!empty, "Unfinished escape sequence");
            enforce(('a' <= front && front <= 'z')
                || ('A' <= front && front <= 'Z'),
            "Only letters are allowed after \\c");
            return front & 0x1f;
        }
    }

    //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
    //\ - assumed to be processed, p - is current
    //on return the range is positioned just past the property spec
    static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
        bool negated, bool casefold)
    {
        static import std.ascii;
        with(p)
        {
            enum MAX_PROPERTY = 128;
            char[MAX_PROPERTY] result;
            uint k = 0;
            popFront();
            enforce(!empty, "eof parsing unicode property spec");
            if (front == '{')
            {
                popFront();
                while (k < MAX_PROPERTY && !empty && front !='}'
                    && front !=':')
                {
                    // '-', ' ' and '_' are insignificant in property names
                    if (front != '-' && front != ' ' && front != '_')
                        result[k++] = cast(char) std.ascii.toLower(front);
                    popFront();
                }
                enforce(k != MAX_PROPERTY, "invalid property name");
                // the loop above also stops at end of input, in which case
                // `front` must not be touched
                enforce(!empty && front == '}', "} expected ");
            }
            else
            {//single char properties e.g.: \pL, \pN ...
                enforce(front < 0x80, "invalid property name");
                result[k++] = cast(char) front;
            }
            auto s = getUnicodeSet(result[0 .. k], negated, casefold);
            enforce(!s.empty, "unrecognized unicode property spec");
            popFront();
            return s;
        }
    }

    /**
        Parse unicode codepoint set from given `range` using standard regex
        syntax '[...]'. The range is advanced skipping over regex set definition.
        `casefold` parameter determines if the set should be casefolded - that is
        include both lower and upper case versions for any letters in the set.
    */
    static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
    if (isInputRange!Range && is(ElementType!Range : dchar))
    {
        auto usParser = UnicodeSetParser!Range(range, casefold);
        auto set = usParser.parseSet();
        range = usParser.range;
        return set;
    }

    ///
    @safe unittest
    {
        import std.uni : unicode;
        string pat = "[a-zA-Z0-9]hello";
        auto set = unicode.parseSet(pat);
        // check some of the codepoints
        assert(set['a'] && set['A'] && set['9']);
        assert(pat == "hello");
    }

private:
    alias ucmp = comparePropertyName;

    // Compile-time existence check mirroring the search order of loadAny.
    static bool findAny(string name)
    {
        import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
        // the length guard keeps the "In" prefix slice in bounds for one-
        // and two-character names, same as in loadAny
        return isPrettyPropertyName(name)
            || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && findSetName!(blocks.tab)(name[2..$]));
    }

    // Tries properties, then scripts, then "In"-prefixed block names;
    // throws if nothing matches.
    static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
    {
        import std.conv : to;
        import std.internal.unicode_tables : blocks, scripts; // generated file
        Set set;
        immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
            || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
                && loadUnicodeSet!(blocks.tab)(name[2..$], set));
        if (loaded)
            return set;
        throw new Exception("No unicode set by name "~name.to!string()~" was found.");
    }

    // FIXME: re-disable once the compiler is fixed
    // Disabled to prevent the mistake of creating instances of this pseudo-struct.
    //@disable ~this();
}

@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    assert(unicode("InHebrew") == asSet(blocks.Hebrew));
    assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
    assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
}

@safe unittest
{
    // an unterminated \p{...} spec is reported as a regular exception
    import std.exception : assertThrown;
    string unterminated = `[\p{Latin`;
    assertThrown(unicode.parseSet(unterminated));
}

enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally

// TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
// Use combined trie instead of checking for '\r' | '\n' | ccTrie,
//   or extend | '\u200D' separately

// True for code points in the range U+1F1E6 .. U+1F1FF (regional indicators).
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    return ch >= '\U0001F1E6' && ch <= '\U0001F1FF';
}

// Our grapheme decoder is a state machine, this is list of all possible
// states before each code point.
private enum GraphemeState
{
    Start,
    CR,
    RI,
    L,
    V,
    LVT,
    Emoji,
    EmojiZWJ,
    Prepend,
    End
}

// Message values whether end of grapheme is reached
private enum TransformRes
{
    // No, unless the source range ends here
    // (GB2 - break at end of text, unless text is empty)
    goOn,
    redo, // Run last character again with new state
    retInclude, // Yes, after the just iterated character
    retExclude // Yes, before the just iterated character
}

// The logic of the grapheme decoding is all here
// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29
// Note, getting GB1 (break at start of text, unless text is empty) right
// relies on the user starting grapheme walking from beginning of the text, and
// not attempting to walk an empty text.
// The table is indexed by the current GraphemeState; each entry inspects one
// code point, may update the state and tells the decoder how to proceed.
private immutable TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // GB4. Break after controls.
        if (graphemeControlTrie[ch] || ch == '\n')
            return TransformRes.retInclude;

        // classify the first code point; the first matching test wins
        with (GraphemeState) state =
            ch == '\r' ? CR :
            isRegionalIndicator(ch) ? RI :
            isHangL(ch) ? L :
            hangLV[ch] || isHangV(ch) ? V :
            hangLVT[ch] || isHangT(ch) ? LVT :
            prependTrie[ch] ? Prepend :
            xpictoTrie[ch] ? Emoji :
            End;

        // No matter what we encountered, we always include the
        // first code point in the grapheme.
        return TransformRes.goOn;
    },

    // GB3, GB4. Do not break between a CR and LF.
    // Otherwise, break after controls.
    GraphemeState.CR: (ref state, ch) => ch == '\n' ?
        TransformRes.retInclude :
        TransformRes.retExclude,

    // GB12 - GB13. Do not break within emoji flag sequences.
    // That is, do not break between regional indicator (RI) symbols if
    // there is an odd number of RI characters before the break point.
    // This state applies if one and only one RI code point has been
    // encountered.
    GraphemeState.RI: (ref state, ch)
    {
        state = GraphemeState.End;

        return isRegionalIndicator(ch) ?
            TransformRes.goOn :
            TransformRes.redo;
    },

    // GB6. Do not break Hangul syllable sequences.
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;
        else if (isHangV(ch) || hangLV[ch])
        {
            state = GraphemeState.V;
            return TransformRes.goOn;
        }
        else if (hangLVT[ch])
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB7. Do not break Hangul syllable sequences.
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;
        else if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8. Do not break Hangul syllable sequences.
    GraphemeState.LVT: (ref state, ch)
    {
        if (isHangT(ch))
            return TransformRes.goOn;

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // NOT a ZWJ.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        // ZWJ is handled separately below, so it must not be matched
        // by the extend check above
        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        // There might still be spacing marks at the end,
        // which are not allowed in the middle of emoji
        // sequences
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // a ZWJ.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        state = GraphemeState.Emoji;
        if (xpictoTrie[ch])
            return TransformRes.goOn;
        return TransformRes.redo;
    },

    // GB9b. Do not break after Prepend characters.
    GraphemeState.Prepend: (ref state, ch)
    {
        // GB5. Break before controls.
        if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
            return TransformRes.retExclude;

        // anything else is classified as if it started the grapheme
        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9, GB9a. Do not break before extending characters, ZWJ
    // or SpacingMarks.
    // GB999. Otherwise, break everywhere.
    GraphemeState.End: (ref state, ch)
        => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
            TransformRes.retExclude :
            TransformRes.goOn
];

// What genericDecodeGrapheme hands back to its caller:
// nothing, the number of code points consumed, or the Grapheme itself.
enum GraphemeRet { none, step, value }

// Shared decoder behind graphemeStride, decodeGrapheme and popGrapheme.
// Drives graphemeTransforms over `range`, popping every code point that
// belongs to the first grapheme cluster. `range` must not be empty.
template genericDecodeGrapheme(GraphemeRet retType)
{   alias Ret = GraphemeRet;

    static if (retType == Ret.value)
        alias Value = Grapheme;
    else static if (retType == Ret.step)
        alias Value = size_t;
    else static if (retType == Ret.none)
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        static if (retType == Ret.value)
            Grapheme result;
        else static if (retType == Ret.step)
            size_t result = 0;

        auto state = GraphemeState.Start;
        dchar ch;

        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
    outer:
        while (!range.empty)
        {
            ch = range.front;

        rerun:
            final switch (graphemeTransforms[state](state, ch))
                with(TransformRes)
            {
            case goOn:
                // code point accepted, keep decoding
                static if (retType == Ret.value)
                    result ~= ch;
                else static if (retType == Ret.step)
                    result++;
                range.popFront();
                continue;

            case redo:
                // the handler switched state; feed it the same code point
                goto rerun;

            case retInclude:
                // code point accepted and the grapheme ends after it
                static if (retType == Ret.value)
                    result ~= ch;
                else static if (retType == Ret.step)
                    result++;
                range.popFront();
                break outer;

            case retExclude:
                // grapheme ends before this code point; leave it in the range
                break outer;
            }
        }

        static if (retType != Ret.none)
            return result;
    }
}

public: // Public API continues

/++
    Computes the length of grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).
7219 7220 Params: 7221 C = type that is implicitly convertible to `dchars` 7222 input = array of grapheme clusters 7223 index = starting index into `input[]` 7224 7225 Returns: 7226 length of grapheme cluster 7227 +/ 7228 size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure 7229 if (is(C : dchar)) 7230 { 7231 auto src = input[index..$]; 7232 auto n = src.length; 7233 genericDecodeGrapheme!(GraphemeRet.none)(src); 7234 return n - src.length; 7235 } 7236 7237 /// 7238 @safe unittest 7239 { 7240 assert(graphemeStride(" ", 1) == 1); 7241 // A + combing ring above 7242 string city = "A\u030Arhus"; 7243 size_t first = graphemeStride(city, 0); 7244 assert(first == 3); //\u030A has 2 UTF-8 code units 7245 assert(city[0 .. first] == "A\u030A"); 7246 assert(city[first..$] == "rhus"); 7247 } 7248 7249 @safe unittest 7250 { 7251 // Ensure that graphemeStride is usable from CTFE. 7252 enum c1 = graphemeStride("A", 0); 7253 static assert(c1 == 1); 7254 7255 enum c2 = graphemeStride("A\u0301", 0); 7256 static assert(c2 == 3); // \u0301 has 2 UTF-8 code units 7257 } 7258 7259 @safe pure nothrow @nogc unittest 7260 { 7261 // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face 7262 assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2); 7263 // skier ~ female sign ~ '€' 7264 assert(graphemeStride("\u26F7\u2640€"d, 0) == 1); 7265 // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€' 7266 assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2); 7267 // skier ~ zero-width joiner ~ female sign ~ '€' 7268 assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3); 7269 // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner 7270 // ~ female sign ~ '€' 7271 assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4); 7272 // skier ~ zero-width joiner ~ '€' 7273 assert(graphemeStride("\u26F7\u200D€"d, 0) == 2); 7274 //'€' ~ zero-width joiner ~ skier 7275 assert(graphemeStride("€\u200D\u26F7"d, 0) == 2); 7276 // Kaithi 
number sign ~ Devanagari digit four ~ Devanagari digit two 7277 assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2); 7278 // Kaithi number sign ~ null 7279 assert(graphemeStride("\U000110BD\0"d, 0) == 1); 7280 } 7281 7282 /++ 7283 Reads one full grapheme cluster from an 7284 $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`. 7285 7286 For examples see the $(LREF Grapheme) below. 7287 7288 Note: 7289 This function modifies `inp` and thus `inp` 7290 must be an L-value. 7291 +/ 7292 Grapheme decodeGrapheme(Input)(ref Input inp) 7293 if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar)) 7294 { 7295 return genericDecodeGrapheme!(GraphemeRet.value)(inp); 7296 } 7297 7298 @safe unittest 7299 { 7300 import std.algorithm.comparison : equal; 7301 7302 Grapheme gr; 7303 string s = " \u0020\u0308 "; 7304 gr = decodeGrapheme(s); 7305 assert(gr.length == 1 && gr[0] == ' '); 7306 gr = decodeGrapheme(s); 7307 assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308")); 7308 s = "\u0300\u0308\u1100"; 7309 assert(equal(decodeGrapheme(s)[], "\u0300\u0308")); 7310 assert(equal(decodeGrapheme(s)[], "\u1100")); 7311 s = "\u11A8\u0308\uAC01"; 7312 assert(equal(decodeGrapheme(s)[], "\u11A8\u0308")); 7313 assert(equal(decodeGrapheme(s)[], "\uAC01")); 7314 7315 // Two Union Jacks of the Great Britain 7316 s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7"; 7317 assert(equal(decodeGrapheme(s)[], "\U0001F1EC\U0001F1E7")); 7318 } 7319 7320 /++ 7321 Reads one full grapheme cluster from an 7322 $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`, 7323 but doesn't return it. Instead returns the number of code units read. 7324 This differs from number of code points read only if `input` is an 7325 autodecodable string. 7326 7327 Note: 7328 This function modifies `inp` and thus `inp` 7329 must be an L-value. 
+/
size_t popGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    static if (isAutodecodableString!Input || hasLength!Input)
    {
        // Why count each step in the decoder when you can just
        // measure the grapheme in one go?
        // (the count is the difference of `inp.length` before and after)
        auto n = inp.length;
        genericDecodeGrapheme!(GraphemeRet.none)(inp);
        return n - inp.length;
    }
    // No `length` to take a difference of: run the decoder in its
    // `GraphemeRet.step` mode and hand back whatever count it returns.
    else return genericDecodeGrapheme!(GraphemeRet.step)(inp);
}

///
@safe pure unittest
{
    // Two Union Jack flags (regional-indicator pairs) in each string
    string s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    wstring ws = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    dstring ds = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";

    // String pop length in code units, not points.
    assert(s.popGrapheme() == 8);
    assert(ws.popGrapheme() == 4);
    assert(ds.popGrapheme() == 2);

    // One flag has been consumed from each string, one is left.
    assert(s == "\U0001F1EC\U0001F1E7");
    assert(ws == "\U0001F1EC\U0001F1E7");
    assert(ds == "\U0001F1EC\U0001F1E7");

    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;

    // Also works for non-random access ranges as long as the
    // character type is 32-bit.
    auto testPiece = "\r\nhello!"d.filter!(x => !x.isAlpha);
    // Windows-style line ending is two code points in a single grapheme.
    assert(testPiece.popGrapheme() == 2);
    assert(testPiece.equal("!"d));
}

// Attribute compliance test. Should be `nothrow @nogc` when
// no autodecoding is needed.
@safe pure nothrow @nogc unittest
{
    import std.algorithm.iteration : filter;

    auto str = "abcdef"d;
    assert(str.popGrapheme() == 1);

    // also test with non-random access
    auto filtered = "abcdef"d.filter!(x => x%2);
    assert(filtered.popGrapheme() == 1);
}

/++
    $(P Iterate a string by $(LREF Grapheme).)
$(P Useful for doing string manipulation that needs to be aware
of graphemes.)

Params:
    range = an input range of `dchar` to split into grapheme clusters

Returns:
    an input range of $(LREF Grapheme); it is also a forward range
    when `Range` is one

See_Also:
    $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        // Cluster currently exposed by `front`; a zero-length Grapheme
        // doubles as the "exhausted" marker (see `empty`/`popFront`).
        private Grapheme _front;

        bool empty() @property
        {
            return _front.length == 0;
        }

        Grapheme front() @property
        {
            return _front;
        }

        void popFront()
        {
            // Decode the next cluster, or fall back to the empty sentinel
            // once the underlying range has run dry.
            _front = _range.empty ? Grapheme.init : _range.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            Result save() @property
            {
                return Result(_range.save, _front);
            }
        }
    }

    auto result = Result!(Range)(range);
    // Prime `_front` so that `front`/`empty` are valid right away.
    result.popFront();
    return result;
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}

// For testing non-forward-range input ranges:
// wraps a string but exposes only empty/front/popFront (no `save`).
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    bool empty() @property { return s.empty; }
    dchar front() @property { return s.front; }
    void popFront() { s.popFront(); }
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    auto reverse = "le\u0308on";
assert(reverse.walkLength == 5);

    auto gReverse = reverse.byGrapheme;
    assert(gReverse.walkLength == 4);

    // Same text in all three UTF encodings must behave identically.
    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(text.walkLength == 5);
        static assert(isForwardRange!(typeof(text)));

        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        // Reversing by grapheme keeps the combining mark attached to its base.
        assert(gText.array.retro.equal(gReverse));
    }}

    // A source without `save` must yield a result without `save` as well.
    auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(nonForwardRange)));
    assert(nonForwardRange.walkLength == 4);
}

// Issue 23474: a combining mark after CR must start a cluster of its own.
@safe pure unittest
{
    import std.range.primitives : walkLength;
    assert(byGrapheme("\r\u0308").walkLength == 2);
}

/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // Index of the current code point inside `_range.front`.
        private size_t i = 0;

        bool empty() @property
        {
            return _range.empty;
        }

        dchar front() @property
        {
            // NOTE(review): a zero-length Grapheme element would make this
            // index out of bounds; presumably callers never supply one.
            return _range.front[i];
        }

        void popFront()
        {
            ++i;

            // Current cluster exhausted: advance to the next one.
            if (i >= _range.front.length)
            {
                _range.popFront();
                i = 0;
            }
        }

        static if (isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}

/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (isNarrowString!Range)
    {
        // Narrow strings are wrapped so that only the range primitives
        // below are exposed (the wrapper has no `length` or indexing).
        static struct Result
        {
            private Range _range;
            @property bool empty() { return _range.empty; }
            @property dchar front(){ return _range.front; }
            void popFront(){ _range.popFront; }
            @property auto save() { return Result(_range.save); }
            @property dchar back(){ return _range.back; }
            void popBack(){ _range.popBack; }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
    else
        // Anything else is handed back unchanged.
        return range;
}

///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it and convert the result to a string
    string reverse = s.byGrapheme
        .array
        .retro
        .byCodePoint
        .text;

    assert(reverse == "le\u0308on"); // lëon
}

@safe unittest
{
    import std.algorithm.comparison : equal;
import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string text = "noe\u0308l";
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    auto gText = InputRangeString(text).byGrapheme;
    static assert(!isForwardRange!(typeof(gText)));

    auto cpText = gText.byCodePoint;
    static assert(!isForwardRange!(typeof(cpText)));

    assert(cpText.walkLength == text.walkLength);

    auto plainCp = text.byCodePoint;
    static assert(isForwardRange!(typeof(plainCp)));
    assert(equal(plainCp, text));
    assert(equal(retro(plainCp.save), retro(text.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}

/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
    always refer to distinct objects. In most actual scenarios a `Grapheme`
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    $(P Two `Grapheme`s compare equal, and hash alike, exactly when they hold
    the same sequence of $(CODEPOINTS), regardless of whether either of them
    is stored inline or on the heap.
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
        if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        return isBig ? len_ : smallLength;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                // In small mode the flag bit is clear, so `slen_` is the length.
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                // Grow by a fixed step; each code point takes 3 bytes and one
                // spare slot is always allocated. All size arithmetic is
                // overflow-checked before touching the allocator.
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    // Equality is by content: same length and the same code points.
    // A member-wise (`tupleof`) comparison is not usable here because the
    // inline buffer overlays `ptr_`: for heap-backed clusters it would compare
    // heap addresses, so a postblit copy would never equal its source.
    // Kept `@trusted` (the automatically generated opEquals used to be
    // inferred `@safe`, see https://issues.dlang.org/show_bug.cgi?id=20655).
    bool opEquals(R)(const auto ref R other) const @trusted
        if (is(immutable R == immutable Grapheme))
    {
        immutable n = length;
        if (n != other.length)
            return false;
        foreach (i; 0 .. n)
        {
            if (this[i] != other[i])
                return false;
        }
        return true;
    }

    // Define a default toHash to allow AA usage.
    // Hashes the code points (never the storage bytes) so that it always
    // agrees with `opEquals` above.
    size_t toHash() const @trusted
    {
        immutable n = length;
        size_t h = n;
        foreach (i; 0 .. n)
            h = hashOf(this[i], h);
        return h;
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // Valid iff decoding a single cluster consumes the whole content.
        auto r = this[];
        genericDecodeGrapheme!(GraphemeRet.none)(r);
        return r.length == 0;
    }

    // Postblit: give a heap-backed copy its own buffer.
    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // 3 bytes are stored per code point
    enum small_cap = small_bytes/3;
    // top bit of `slen_` marks heap ("big") storage, low 7 bits hold the inline length
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // heap ("big") representation
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // inline ("small") representation; `slen_` is the very last byte
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the inline content into a freshly allocated buffer of `grow` slots.
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        for (int i=0; i<k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}

static assert(Grapheme.sizeof == size_t.sizeof*4);


@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;
    Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    assert(byGrapheme("ЮУЗ").equal(data[]));
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
import std.range : isRandomAccessRange;

    string bold = "ku\u0308hn";

    // note that decodeGrapheme takes parameter by ref
    auto first = decodeGrapheme(bold);

    assert(first.length == 1);
    assert(first[0] == 'k');

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(bold);
    // slicing a grapheme yields a random-access range of dchar
    assert(wideOne[].equal("u\u0308"));
    assert(wideOne.length == 2);
    static assert(isRandomAccessRange!(typeof(wideOne[])));

    // all of the usual range manipulation is possible
    assert(wideOne[].filter!isMark().equal("\u0308"));

    auto g = Grapheme("A");
    assert(g.valid);
    g ~= '\u0301';
    assert(g[].equal("A\u0301"));
    assert(g.valid);
    g ~= "B";
    // not a valid grapheme cluster anymore
    assert(!g.valid);
    // still could be useful though
    assert(g[].equal("A\u0301B"));
}

@safe unittest
{
    auto g = Grapheme("A\u0302");
    assert(g[0] == 'A');
    assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
    assert(g[1] == '~');
    assert(!g.valid);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : text;
    import std.range : iota;

    // not valid clusters (but it is just a test)
    auto g = Grapheme('a', 'b', 'c', 'd', 'e');
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'd');
    assert(g[4] == 'e');
    // overwriting one slot must leave its neighbours intact
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    g ~= 'ц';
    g ~= '~';
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'Й');
    assert(g[4] == 'e');
    assert(g[5] == 'ц');
    assert(g[6] == '~');
    assert(!g.valid);

    Grapheme
copy = g;
    // writes through the copy must not be visible in the original
    copy[0] = 'X';
    copy[1] = '-';
    assert(g[0] == 'a' && copy[0] == 'X');
    assert(g[1] == 'b' && copy[1] == '-');
    assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
    copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
    copy ~= "xyz";
    assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
    assert(!copy.valid);

    // 26 appends, one code point at a time
    Grapheme h;
    foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
        h ~= v;
    assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
}

// ensure Grapheme can be used as an AA key.
@safe unittest
{
    int[Grapheme] aa;
}

/++
    $(P Does basic case-insensitive comparison of `r1` and `r2`.
    This function uses simpler comparison rule thus achieving better performance
    than $(LREF icmp). However keep in mind the warning below.)

    Params:
        r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
        r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters

    Returns:
        An `int` that is 0 if the strings match,
        <0 if `r1` is lexicographically "less" than `r2`,
        >0 if `r1` is lexicographically "greater" than `r2`

    Warning:
        This function only handles 1:1 $(CODEPOINT) mapping
        and thus is not sufficient for certain alphabets
        like German, Greek and a few others.
See_Also:
    $(LREF icmp)
    $(REF cmp, std,algorithm,comparison)
+/
int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.range.primitives : isInfinite;
    import std.utf : decodeFront;
    import std.traits : isDynamicArray;
    import std.typecons : Yes;
    static import std.ascii;

    // The fast path needs indexing, `length` on at least one side, and
    // `r = r[i .. $]` re-slicing; the lambda below only probes (at compile
    // time) that such slicing is supported. Same probe as in `icmp`.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // First non-ASCII code unit on either side: leave the fast path.
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // Common prefix is equal: the longer (or infinite) side is "greater".
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // Drop the already-compared ASCII prefix.
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    while (!r1.empty)
    {
        immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1;
        immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
        int diff = lhs - rhs;
        if (!diff)
            continue;
        if ((lhs | rhs) < 0x80)
        {
            immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (!d) continue;
            return d;
        }
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if (idx != EMPTY_CASE_TRIE)
        {
            if (idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable(idx).n;
                idx2 = idx2 - sTable(idx2).n;
                if (idx == idx2)// one bucket, equivalent chars
                    continue;
                else// not the same bucket
                    diff = sTable(idx).ch - sTable(idx2).ch;
            }
            else
                // only lhs is cased: compare its bucket's first entry with rhs
                diff = sTable(idx - sTable(idx).n).ch - rhs;
        }
        else if (idx2 != EMPTY_CASE_TRIE)
        {
            // only rhs is cased: compare lhs with its bucket's first entry
            diff = lhs - sTable(idx2 - sTable(idx2).n).ch;
        }
        // one of chars is not cased at all
        return diff;
    }
    // r1 is exhausted: equal (0) if r2 is too, otherwise r1 is "less" (-1).
    return int(r2.empty) - 1;
}

///
@safe @nogc pure nothrow unittest
{
    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);
    // things like the following won't get matched as equal
    // Greek small letter iota with dialytika and tonos
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);

    // while icmp has no problem with that
    assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}

// overloads for the most common cases to reduce compile time
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    { return sicmp!(const(char)[], const(char)[])(str1, str2); }

    int sicmp(scope const(wchar)[] str1, scope
const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}

// Helper of `icmp`: tries to match `lhs` against `rhs` followed by `rtail`
// using the entries of `lhs`'s bucket in the full case-folding table.
// Returns 0 on a match (for a multi-code-point entry the matched remainder
// is consumed from `rtail`). Otherwise returns a code point to diff with:
// `lhs` itself if it has no table entry, else the first entry of its bucket.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;
    size_t idx = fullCaseTrie[lhs];
    // fullCaseTrie is packed index table
    if (idx == EMPTY_CASE_TRIE)
        return lhs;
    // [start, end) is the bucket that `lhs` belongs to
    immutable start = idx - fTable(idx).n;
    immutable end = fTable(idx).size + start;
    assert(fTable(start).entry_len == 1);
    for (idx=start; idx<end; idx++)
    {
        const entryLen = fTable(idx).entry_len;
        if (entryLen == 1)
        {
            if (fTable(idx).seq[0] == rhs)
            {
                return 0;
            }
        }
        else
        {// OK it's a long chunk, like 'ss' for German
            dchar[3] arr = fTable(idx).seq;
            const dchar[] seq = arr[0 .. entryLen];
            if (rhs == seq[0]
                && rtail.skipOver(seq[1..$]))
            {
                // note that this path modifies rtail
                // iff we managed to get there
                return 0;
            }
        }
    }
    return fTable(start).seq[0]; // new remapped character for accurate diffs
}

/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal German ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.
Params:
    r1 = a forward range of characters
    r2 = a forward range of characters

Returns:
    An `int` that is 0 if the strings match,
    <0 if `r1` is lexicographically "less" than `r2`,
    >0 if `r1` is lexicographically "greater" than `r2`

See_Also:
    $(LREF sicmp)
    $(REF cmp, std,algorithm,comparison)
+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    // The fast path needs indexing, `length` on at least one side, and
    // `r = r[i .. $]` re-slicing; the lambda only probes at compile time
    // that such slicing is supported.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // First non-ASCII code unit on either side: leave the fast path.
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // Common prefix is equal: the longer (or infinite) side is "greater".
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // Drop the already-compared ASCII prefix.
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    auto str1 = r1.byDchar;
    auto str2 = r2.byDchar;

    for (;;)
    {
        if (str1.empty)
            return str2.empty ? 0 : -1;
        immutable lhs = str1.front;
        if (str2.empty)
            return 1;
        immutable rhs = str2.front;
        // Both fronts are consumed before folding, so fullCasedCmp sees
        // only the tails that follow them.
        str1.popFront();
        str2.popFront();
        if (!(lhs - rhs))
            continue;
        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
        if (!cmpLR)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
        if (!cmpRL)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
}

///
@safe @nogc pure nothrow unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
}

/**
 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
 */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
}

// test different character types
@safe unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}

// overloads for the most common cases to reduce compile time
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    { return icmp!(const(char)[], const(char)[])(str1, str2); }
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    { return
icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}

// Shared checks for icmp and sicmp over every pairing of string types;
// wrapped in assertCTFEable so they are exercised at compile time as well.
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    static foreach (cfunc; AliasSeq!(icmp, sicmp))
    {{
        static foreach (S1; AliasSeq!(string, wstring, dstring))
        static foreach (S2; AliasSeq!(string, wstring, dstring))
        {
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto strs = [ "Apple", "ORANGE", "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(strs);
        assert(strs == ["amp", "Apple", "banana", "orAcle", "ORANGE"]);
    }}
    // 1:M folding: "ß" matches "ss", so the order is decided by 'b' vs 'a'
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    // https://issues.dlang.org/show_bug.cgi?id=11057
    assert( icmp("K", "L") < 0 );
    });
}

// https://issues.dlang.org/show_bug.cgi?id=17372
// (icmp must accept the forward ranges produced by `joiner`)
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
}

// This is package(std) for
// the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Return a range of all $(CODEPOINTS) that casefold to
    and from this `ch`.

    If `ch` has no entry in the simple case-folding trie the range
    contains just `ch` itself.
    NOTE(review): for `ch == 0` that single-element form is
    indistinguishable from the empty state (`c == 0`), so the result is
    an empty range - confirm this is intended.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;
    // Two states share one layout:
    //  - "small": idx == uint.max, the single element is stored in `c`;
    //  - table-backed: `idx` is the current table index, `len` the
    //    number of elements left.
    static struct Range
    {
    @safe pure nothrow:
        uint idx; //if == uint.max, then read c.
        union
        {
            dchar c; // == 0 - empty range
            uint len;
        }
        @property bool isSmall() const { return idx == uint.max; }

        // single-element ("small") range holding just `ch`
        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        // range over `size` consecutive table entries starting at `start`
        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (isSmall)
            {
                return c;
            }
            auto ch = sTable(idx).ch;
            return ch;
        }

        @property bool empty() const
        {
            if (isSmall)
            {
                return c == 0;
            }
            return len == 0;
        }

        @property size_t length() const
        {
            if (isSmall)
            {
                return c == 0 ? 0 : 1;
            }
            return len;
        }

        void popFront()
        {
            if (isSmall)
                c = 0; // the only element is gone
            else
            {
                idx++;
                len--;
            }
        }
    }
    immutable idx = simpleCaseTrie[ch];
    if (idx == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable(idx);
    // `entry.n` is the offset of `ch` inside its bucket
    immutable start = idx - entry.n;
    return Range(start, entry.size);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}

/++
    $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    return combiningClassTrie[ch];
}

///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}

@safe pure nothrow @nogc unittest
{
    // every ASCII code point has combining class 0
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}

/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition.
The result is canonically equivalent sequence. 8507 Canonical, 8508 /** 8509 Compatibility decomposition. The result is compatibility equivalent sequence. 8510 Note: Compatibility decomposition is a $(B lossy) conversion, 8511 typically suitable only for fuzzy matching and internal processing. 8512 */ 8513 Compatibility 8514 } 8515 8516 /** 8517 Shorthand aliases for character decomposition type, passed as a 8518 template parameter to $(LREF decompose). 8519 */ 8520 enum { 8521 Canonical = UnicodeDecomposition.Canonical, 8522 Compatibility = UnicodeDecomposition.Compatibility 8523 } 8524 8525 /++ 8526 Try to canonically compose 2 $(CHARACTERS). 8527 Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise. 8528 8529 The assumption is that `first` comes before `second` in the original text, 8530 usually meaning that the first is a starter. 8531 8532 Note: Hangul syllables are not covered by this function. 8533 See `composeJamo` below. 8534 +/ 8535 public dchar compose(dchar first, dchar second) pure nothrow @safe 8536 { 8537 import std.algorithm.iteration : map; 8538 import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask; 8539 import std.range : assumeSorted, stride; 8540 immutable packed = compositionJumpTrie[first]; 8541 if (packed == ushort.max) 8542 return dchar.init; 8543 // unpack offset and length 8544 immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift; 8545 // TODO: optimize this micro binary search (no more then 4-5 steps) 8546 auto r = compositionTable.stride(2)[idx .. 
idx+cnt].assumeSorted(); 8547 immutable target = r.lowerBound(second).length; 8548 if (target == cnt) 8549 return dchar.init; 8550 immutable entry = compositionTable[(idx+target)*2]; 8551 if (entry != second) 8552 return dchar.init; 8553 return compositionTable[(idx+target)*2 + 1]; 8554 } 8555 8556 /// 8557 @safe unittest 8558 { 8559 assert(compose('A','\u0308') == '\u00C4'); 8560 assert(compose('A', 'B') == dchar.init); 8561 assert(compose('C', '\u0301') == '\u0106'); 8562 // note that the starter is the first one 8563 // thus the following doesn't compose 8564 assert(compose('\u0308', 'A') == dchar.init); 8565 } 8566 8567 /++ 8568 Returns a full $(S_LINK Canonical decomposition, Canonical) 8569 (by default) or $(S_LINK Compatibility decomposition, Compatibility) 8570 decomposition of $(CHARACTER) `ch`. 8571 If no decomposition is available returns a $(LREF Grapheme) 8572 with the `ch` itself. 8573 8574 Note: 8575 This function also decomposes hangul syllables 8576 as prescribed by the standard. 8577 8578 See_Also: $(LREF decomposeHangul) for a restricted version 8579 that takes into account only hangul syllables but 8580 no other decompositions. 
8581 +/ 8582 public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe 8583 { 8584 import std.algorithm.searching : until; 8585 import std.internal.unicode_decomp : decompCompatTable, decompCanonTable; 8586 static if (decompType == Canonical) 8587 { 8588 alias table = decompCanonTable; 8589 alias mapping = canonMappingTrie; 8590 } 8591 else static if (decompType == Compatibility) 8592 { 8593 alias table = decompCompatTable; 8594 alias mapping = compatMappingTrie; 8595 } 8596 immutable idx = mapping[ch]; 8597 if (!idx) // not found, check hangul arithmetic decomposition 8598 return decomposeHangul(ch); 8599 auto decomp = table[idx..$].until(0); 8600 return Grapheme(decomp); 8601 } 8602 8603 /// 8604 @safe unittest 8605 { 8606 import std.algorithm.comparison : equal; 8607 8608 assert(compose('A','\u0308') == '\u00C4'); 8609 assert(compose('A', 'B') == dchar.init); 8610 assert(compose('C', '\u0301') == '\u0106'); 8611 // note that the starter is the first one 8612 // thus the following doesn't compose 8613 assert(compose('\u0308', 'A') == dchar.init); 8614 8615 assert(decompose('Ĉ')[].equal("C\u0302")); 8616 assert(decompose('D')[].equal("D")); 8617 assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7")); 8618 assert(decompose!Compatibility('¹')[].equal("1")); 8619 } 8620 8621 //---------------------------------------------------------------------------- 8622 // Hangul specific composition/decomposition 8623 enum jamoSBase = 0xAC00; 8624 enum jamoLBase = 0x1100; 8625 enum jamoVBase = 0x1161; 8626 enum jamoTBase = 0x11A7; 8627 enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28; 8628 enum jamoNCount = jamoVCount * jamoTCount; 8629 enum jamoSCount = jamoLCount * jamoNCount; 8630 8631 // Tests if `ch` is a Hangul leading consonant jamo. 
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above leading jamo range
    return ch < jamoLBase+jamoLCount && ch >= jamoLBase;
}

// Tests if `ch` is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above trailing jamo range
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch < jamoTBase+jamoTCount && ch > jamoTBase;
}

// Tests if `ch` is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above vowel range
    return ch < jamoVBase+jamoVCount && ch >= jamoVBase;
}

// Returns the zero-based index of `ch` within the precomposed Hangul
// syllable block (jamoSBase .. jamoSBase + jamoSCount), or -1 if `ch`
// lies outside of that block.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    int idxS = cast(int) ch - jamoSBase;
    return idxS >= 0 && idxS < jamoSCount ? idxS : -1;
}

// internal helper: compose hangul syllables leaving dchar.init in holes
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    for (size_t idx = 0; idx + 1 < seq.length; )
    {
        if (isJamoL(seq[idx]) && isJamoV(seq[idx+1]))
        {
            immutable int indexL = seq[idx] - jamoLBase;
            immutable int indexV = seq[idx+1] - jamoVBase;
            immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
            if (idx + 2 < seq.length && isJamoT(seq[idx+2]))
            {
                // <L, V, T>: the syllable goes into the L slot,
                // the V and T slots become holes
                seq[idx] = jamoSBase + indexLV + seq[idx+2] - jamoTBase;
                seq[idx+1] = dchar.init;
                seq[idx+2] = dchar.init;
                idx += 3;
            }
            else
            {
                // <L, V>: the syllable goes into the L slot,
                // the V slot becomes a hole
                seq[idx] = jamoSBase + indexLV;
                seq[idx+1] = dchar.init;
                idx += 2;
            }
        }
        else
            idx++;
    }
}

//----------------------------------------------------------------------------
public:

/**
    Decomposes a Hangul syllable.
If `ch` is not a composed syllable 8693 then this function returns $(LREF Grapheme) containing only `ch` as is. 8694 */ 8695 Grapheme decomposeHangul(dchar ch) nothrow pure @safe 8696 { 8697 immutable idxS = cast(int) ch - jamoSBase; 8698 if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch); 8699 immutable idxL = idxS / jamoNCount; 8700 immutable idxV = (idxS % jamoNCount) / jamoTCount; 8701 immutable idxT = idxS % jamoTCount; 8702 8703 immutable partL = jamoLBase + idxL; 8704 immutable partV = jamoVBase + idxV; 8705 if (idxT > 0) // there is a trailling consonant (T); <L,V,T> decomposition 8706 return Grapheme(partL, partV, jamoTBase + idxT); 8707 else // <L, V> decomposition 8708 return Grapheme(partL, partV); 8709 } 8710 8711 /// 8712 @safe unittest 8713 { 8714 import std.algorithm.comparison : equal; 8715 assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); 8716 } 8717 8718 /++ 8719 Try to compose hangul syllable out of a leading consonant (`lead`), 8720 a `vowel` and optional `trailing` consonant jamos. 8721 8722 On success returns the composed LV or LVT hangul syllable. 8723 8724 If any of `lead` and `vowel` are not a valid hangul jamo 8725 of the respective $(CHARACTER) class returns dchar.init. 8726 +/ 8727 dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe 8728 { 8729 if (!isJamoL(lead)) 8730 return dchar.init; 8731 immutable indexL = lead - jamoLBase; 8732 if (!isJamoV(vowel)) 8733 return dchar.init; 8734 immutable indexV = vowel - jamoVBase; 8735 immutable indexLV = indexL * jamoNCount + indexV * jamoTCount; 8736 immutable dchar syllable = jamoSBase + indexLV; 8737 return isJamoT(trailing) ? 
syllable + (trailing - jamoTBase) : syllable; 8738 } 8739 8740 /// 8741 @safe unittest 8742 { 8743 assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); 8744 // leaving out T-vowel, or passing any codepoint 8745 // that is not trailing consonant composes an LV-syllable 8746 assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); 8747 assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); 8748 assert(composeJamo('\u1111', 'A') == dchar.init); 8749 assert(composeJamo('A', '\u1171') == dchar.init); 8750 } 8751 8752 @safe unittest 8753 { 8754 import std.algorithm.comparison : equal; 8755 import std.conv : text; 8756 8757 static void testDecomp(UnicodeDecomposition T)(dchar ch, string r) 8758 { 8759 Grapheme g = decompose!T(ch); 8760 assert(equal(g[], r), text(g[], " vs ", r)); 8761 } 8762 testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345"); 8763 testDecomp!Canonical('\uF907', "\u9F9C"); 8764 testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C"); 8765 testDecomp!Compatibility('\uA7F9', "\u0153"); 8766 8767 // check examples 8768 assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6")); 8769 assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB'); 8770 assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out T-vowel 8771 assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC'); 8772 assert(composeJamo('\u1111', 'A') == dchar.init); 8773 assert(composeJamo('A', '\u1171') == dchar.init); 8774 } 8775 8776 /** 8777 Enumeration type for normalization forms, 8778 passed as template parameter for functions like $(LREF normalize). 8779 */ 8780 enum NormalizationForm { 8781 NFC, 8782 NFD, 8783 NFKC, 8784 NFKD 8785 } 8786 8787 8788 enum { 8789 /** 8790 Shorthand aliases from values indicating normalization forms. 
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}

/++
    Returns `input` string normalized to the chosen form.
    Form C is used by default.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Note:
    In cases where the string in question is already normalized,
    it is returned unmodified and no memory allocation happens.
+/
/*
    WARNING: @trusted lambda inside - handle with same care as @trusted
        functions

    Despite being a template, the attributes do no harm since this doesn't work
    with user-defined range or character types anyway.
*/
pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C)
    (return scope inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // anchors[0] .. anchors[1] is the next piece that needs normalization;
    // everything before anchors[0] is copied to the output verbatim
    auto anchors = splitNormalized!norm(input);
    // nothing to normalize: hand back the very same slice
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers, reused for every piece: decomposed code points
    // and, index for index, their combining classes
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        app.put(input[0 .. anchors[0]]);
        // step 1: decompose every code point of the piece
        foreach (dchar ch; input[anchors[0]..anchors[1]])
            static if (norm == NFD || norm == NFC)
            {
                foreach (dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach (dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        ccc.length = decomposed.length;
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        // step 2: stable-sort each run of code points with non-zero
        // combining class by that class
        foreach (idx, dchar ch; decomposed)
        {
            immutable clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if (clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
                // makes the sort after the loop a no-op unless
                // another unstable run starts later
                firstNonStable = decomposed.length;
            }
            else if (clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // sort the trailing unstable run, if any
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        // step 3 (composing forms only): recompose in place,
        // merged code points are overwritten with dchar.init
        static if (norm == NFC || norm == NFKC)
        {
            import std.algorithm.searching : countUntil;
            auto first = countUntil(ccc, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                for (;;)
                {
                    immutable second = recompose(first, decomposed, ccc);
                    if (second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if (norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            // drop the dchar.init holes left behind by recomposition
            import std.algorithm.mutation : remove;
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        () @trusted {
            // assumeSafeAppend isn't considered pure as of writing, hence the
            // cast. It isn't pure in the sense that the elements after
            // the array in question are affected, but we don't use those
            // making the call pure for our purposes.
            (cast(void delegate() pure nothrow) {decomposed.assumeSafeAppend();})();
            ccc.length = 0;
            (cast(void delegate() pure nothrow) {ccc.assumeSafeAppend();})();
        } ();
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    } while (anchors[0] != input.length);
    // the remaining tail is already normalized
    app.put(input[0 .. anchors[0]]);
    return () @trusted inout { return cast(inout(C)[]) app.data; } ();
}

///
@safe pure unittest
{
    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}

@safe pure unittest
{
    import std.conv : text;

    assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
    assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰"));
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // test with dstring
    dstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice
}

// canonically recompose given slice of code points, works in-place and mutates data
// `start` is the index of the starter to compose onto; `ccc` holds the
// combining class of every element of `input`, index for index
private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    int accumCC = -1;// so that it's out of 0 ..
255 range 8950 // writefln("recomposing %( %04x %)", input); 8951 // first one is always a starter thus we start at i == 1 8952 size_t i = start+1; 8953 for (; ; ) 8954 { 8955 if (i == input.length) 8956 break; 8957 immutable curCC = ccc[i]; 8958 // In any character sequence beginning with a starter S 8959 // a character C is blocked from S if and only if there 8960 // is some character B between S and C, and either B 8961 // is a starter or it has the same or higher combining class as C. 8962 //------------------------ 8963 // Applying to our case: 8964 // S is input[0] 8965 // accumCC is the maximum CCC of characters between C and S, 8966 // as ccc are sorted 8967 // C is input[i] 8968 8969 if (curCC > accumCC) 8970 { 8971 immutable comp = compose(input[start], input[i]); 8972 if (comp != dchar.init) 8973 { 8974 input[start] = comp; 8975 input[i] = dchar.init;// put a sentinel 8976 // current was merged so its CCC shouldn't affect 8977 // composing with the next one 8978 } 8979 else 8980 { 8981 // if it was a starter then accumCC is now 0, end of loop 8982 accumCC = curCC; 8983 if (accumCC == 0) 8984 break; 8985 } 8986 } 8987 else 8988 { 8989 // ditto here 8990 accumCC = curCC; 8991 if (accumCC == 0) 8992 break; 8993 } 8994 i++; 8995 } 8996 return i; 8997 } 8998 8999 // returns tuple of 2 indexes that delimit: 9000 // normalized text, piece that needs normalization and 9001 // the rest of input starting with stable code point 9002 private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input) 9003 { 9004 import std.typecons : tuple; 9005 ubyte lastCC = 0; 9006 9007 foreach (idx, dchar ch; input) 9008 { 9009 static if (norm == NFC) 9010 if (ch < 0x0300) 9011 { 9012 lastCC = 0; 9013 continue; 9014 } 9015 immutable ubyte CC = combiningClass(ch); 9016 if (lastCC > CC && CC != 0) 9017 { 9018 return seekStable!norm(idx, input); 9019 } 9020 9021 if (notAllowedIn!norm(ch)) 9022 { 9023 return seekStable!norm(idx, input); 9024 } 9025 lastCC = CC; 9026 } 
9027 return tuple(input.length, input.length); 9028 } 9029 9030 private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input) 9031 { 9032 import std.typecons : tuple; 9033 import std.utf : codeLength; 9034 9035 auto br = input[0 .. idx]; 9036 size_t region_start = 0;// default 9037 for (;;) 9038 { 9039 if (br.empty)// start is 0 9040 break; 9041 dchar ch = br.back; 9042 if (combiningClass(ch) == 0 && allowedIn!norm(ch)) 9043 { 9044 region_start = br.length - codeLength!C(ch); 9045 break; 9046 } 9047 br.popFront(); 9048 } 9049 ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..." 9050 size_t region_end=input.length;// end is $ by default 9051 foreach (i, dchar ch; input[idx..$]) 9052 { 9053 if (combiningClass(ch) == 0 && allowedIn!norm(ch)) 9054 { 9055 region_end = i+idx; 9056 break; 9057 } 9058 } 9059 // writeln("Region to normalize: ", input[region_start .. region_end]); 9060 return tuple(region_start, region_end); 9061 } 9062 9063 /** 9064 Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization 9065 form `norm`. 9066 */ 9067 public bool allowedIn(NormalizationForm norm)(dchar ch) 9068 { 9069 return !notAllowedIn!norm(ch); 9070 } 9071 9072 /// 9073 @safe unittest 9074 { 9075 // e.g. 
Cyrillic is always allowed, so is ASCII 9076 assert(allowedIn!NFC('я')); 9077 assert(allowedIn!NFD('я')); 9078 assert(allowedIn!NFKC('я')); 9079 assert(allowedIn!NFKD('я')); 9080 assert(allowedIn!NFC('Z')); 9081 } 9082 9083 // not user friendly name but more direct 9084 private bool notAllowedIn(NormalizationForm norm)(dchar ch) 9085 { 9086 static if (norm == NFC) 9087 alias qcTrie = nfcQCTrie; 9088 else static if (norm == NFD) 9089 alias qcTrie = nfdQCTrie; 9090 else static if (norm == NFKC) 9091 alias qcTrie = nfkcQCTrie; 9092 else static if (norm == NFKD) 9093 alias qcTrie = nfkdQCTrie; 9094 else 9095 static assert("Unknown normalization form "~norm); 9096 return qcTrie[ch]; 9097 } 9098 9099 @safe unittest 9100 { 9101 assert(allowedIn!NFC('я')); 9102 assert(allowedIn!NFD('я')); 9103 assert(allowedIn!NFKC('я')); 9104 assert(allowedIn!NFKD('я')); 9105 assert(allowedIn!NFC('Z')); 9106 } 9107 9108 } 9109 9110 version (std_uni_bootstrap) 9111 { 9112 // old version used for bootstrapping of gen_uni.d that generates 9113 // up to date optimal versions of all of isXXX functions 9114 @safe pure nothrow @nogc public bool isWhite(dchar c) 9115 { 9116 import std.ascii : isWhite; 9117 return isWhite(c) || 9118 c == lineSep || c == paraSep || 9119 c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' || 9120 (c >= '\u2000' && c <= '\u200A') || 9121 c == '\u202F' || c == '\u205F' || c == '\u3000'; 9122 } 9123 } 9124 else 9125 { 9126 9127 // trusted -> avoid bounds check 9128 @trusted pure nothrow @nogc private 9129 { 9130 import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file 9131 9132 // hide template instances behind functions 9133 // https://issues.dlang.org/show_bug.cgi?id=13232 9134 ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; } 9135 ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; } 9136 dchar toLowerTab(size_t idx) { return toLowerTable[idx]; } 9137 9138 ushort 
toTitleIndex(dchar c) { return toTitleIndexTrie[c]; } 9139 ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; } 9140 dchar toTitleTab(size_t idx) { return toTitleTable[idx]; } 9141 9142 ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; } 9143 ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; } 9144 dchar toUpperTab(size_t idx) { return toUpperTable[idx]; } 9145 } 9146 9147 public: 9148 9149 /++ 9150 Whether or not `c` is a Unicode whitespace $(CHARACTER). 9151 (general Unicode category: Part of C0(tab, vertical tab, form feed, 9152 carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085)) 9153 +/ 9154 @safe pure nothrow @nogc 9155 public bool isWhite(dchar c) 9156 { 9157 import std.internal.unicode_tables : isWhiteGen; // generated file 9158 return isWhiteGen(c); // call pregenerated binary search 9159 } 9160 9161 /++ 9162 Return whether `c` is a Unicode lowercase $(CHARACTER). 9163 +/ 9164 @safe pure nothrow @nogc 9165 bool isLower(dchar c) 9166 { 9167 import std.ascii : isLower, isASCII; 9168 if (isASCII(c)) 9169 return isLower(c); 9170 return lowerCaseTrie[c]; 9171 } 9172 9173 @safe unittest 9174 { 9175 import std.ascii : isLower; 9176 foreach (v; 0 .. 0x80) 9177 assert(isLower(v) == .isLower(v)); 9178 assert(.isLower('я')); 9179 assert(.isLower('й')); 9180 assert(!.isLower('Ж')); 9181 // Greek HETA 9182 assert(!.isLower('\u0370')); 9183 assert(.isLower('\u0371')); 9184 assert(!.isLower('\u039C')); // capital MU 9185 assert(.isLower('\u03B2')); // beta 9186 // from extended Greek 9187 assert(!.isLower('\u1F18')); 9188 assert(.isLower('\u1F00')); 9189 foreach (v; unicode.lowerCase.byCodepoint) 9190 assert(.isLower(v) && !isUpper(v)); 9191 } 9192 9193 9194 /++ 9195 Return whether `c` is a Unicode uppercase $(CHARACTER). 
9196 +/ 9197 @safe pure nothrow @nogc 9198 bool isUpper(dchar c) 9199 { 9200 import std.ascii : isUpper, isASCII; 9201 if (isASCII(c)) 9202 return isUpper(c); 9203 return upperCaseTrie[c]; 9204 } 9205 9206 @safe unittest 9207 { 9208 import std.ascii : isLower; 9209 foreach (v; 0 .. 0x80) 9210 assert(isLower(v) == .isLower(v)); 9211 assert(!isUpper('й')); 9212 assert(isUpper('Ж')); 9213 // Greek HETA 9214 assert(isUpper('\u0370')); 9215 assert(!isUpper('\u0371')); 9216 assert(isUpper('\u039C')); // capital MU 9217 assert(!isUpper('\u03B2')); // beta 9218 // from extended Greek 9219 assert(!isUpper('\u1F00')); 9220 assert(isUpper('\u1F18')); 9221 foreach (v; unicode.upperCase.byCodepoint) 9222 assert(isUpper(v) && !.isLower(v)); 9223 } 9224 9225 9226 //TODO: Hidden for now, needs better API. 9227 //Other transforms could use better API as well, but this one is a new primitive. 9228 @safe pure nothrow @nogc 9229 private dchar toTitlecase(dchar c) 9230 { 9231 // optimize ASCII case 9232 if (c < 0xAA) 9233 { 9234 if (c < 'a') 9235 return c; 9236 if (c <= 'z') 9237 return c - 32; 9238 return c; 9239 } 9240 size_t idx = toTitleSimpleIndex(c); 9241 if (idx != ushort.max) 9242 { 9243 return toTitleTab(idx); 9244 } 9245 return c; 9246 } 9247 9248 private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab); 9249 private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab); 9250 9251 // generic toUpper/toLower on whole string, creates new or returns as is 9252 private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s) 9253 if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S))) 9254 { 9255 import std.array : appender, array; 9256 import std.ascii : isASCII; 9257 import std.utf : byDchar, codeLength; 9258 9259 alias C = ElementEncodingType!S; 9260 9261 auto r = s.byDchar; 9262 for (size_t i; !r.empty; i += r.front.codeLength!C , 
r.popFront()) 9263 { 9264 auto cOuter = r.front; 9265 ushort idx = indexFn(cOuter); 9266 if (idx == ushort.max) 9267 continue; 9268 auto result = appender!(C[])(); 9269 result.reserve(s.length); 9270 result.put(s[0 .. i]); 9271 foreach (dchar c; s[i .. $].byDchar) 9272 { 9273 if (c.isASCII) 9274 { 9275 result.put(asciiConvert(c)); 9276 } 9277 else 9278 { 9279 idx = indexFn(c); 9280 if (idx == ushort.max) 9281 result.put(c); 9282 else if (idx < maxIdx) 9283 { 9284 c = tableFn(idx); 9285 result.put(c); 9286 } 9287 else 9288 { 9289 auto val = tableFn(idx); 9290 // unpack length + codepoint 9291 immutable uint len = val >> 24; 9292 result.put(cast(dchar)(val & 0xFF_FFFF)); 9293 foreach (j; idx+1 .. idx+len) 9294 result.put(tableFn(j)); 9295 } 9296 } 9297 } 9298 return result.data; 9299 } 9300 9301 static if (isSomeString!S) 9302 return s; 9303 else 9304 return s.array; 9305 } 9306 9307 // https://issues.dlang.org/show_bug.cgi?id=12428 9308 @safe unittest 9309 { 9310 import std.array : replicate; 9311 auto s = "abcdefghij".replicate(300); 9312 s = s[0 .. 
        10];

    toUpper(s);

    assert(s == "abcdefghij");
}

// https://issues.dlang.org/show_bug.cgi?id=18993
@safe unittest
{
    static assert(`몬스터/A`.toLower.length == `몬스터/a`.toLower.length);
}


// generic toUpper/toLower on whole range, returns range
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            // exhausted only when no buffered output is pending either
            return !nLeft && r.empty;
        }

        @property auto front()
        {
            import std.ascii : isASCII;

            // refill the buffer with the mapping of the next source code point
            if (!nLeft)
            {
                dchar c = r.front;
                if (c.isASCII)
                {
                    buf[0] = asciiConvert(c);
                    nLeft = 1;
                }
                else
                {
                    const idx = indexFn(c);
                    if (idx == ushort.max)
                    {
                        // no mapping: pass through unchanged
                        buf[0] = c;
                        nLeft = 1;
                    }
                    else if (idx < maxIdx)
                    {
                        // simple 1:1 mapping
                        buf[0] = tableFn(idx);
                        nLeft = 1;
                    }
                    else
                    {
                        immutable val = tableFn(idx);
                        // unpack length + codepoint
                        nLeft = val >> 24;
                        if (nLeft == 0)
                            nLeft = 1;
                        assert(nLeft <= buf.length);
                        // buf is filled back to front because front
                        // reads buf[nLeft - 1] and popFront decrements nLeft
                        buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                        foreach (j; 1 .. nLeft)
                            buf[nLeft - j - 1] = tableFn(idx + j);
                    }
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            // make sure the buffer is filled even if front was never called
            if (!nLeft)
                front;
            assert(nLeft);
            --nLeft;
            // advance the source only once its whole mapping was consumed
            if (!nLeft)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                return ret;
            }
        }

      private:
        Range r;
        uint nLeft; // number of buffered code points not yet consumed
        dchar[3] buf = void;
    }

    return ToCaserImpl(str);
}

/*********************
 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or a string to upper or lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an input range of `dchar`s
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */

auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof < dchar.sizeof)
    {
        import std.utf : byDchar;

        // Decode first
        return asLowerCase(str.byDchar);
    }
    else
    {
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
}

/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof < dchar.sizeof)
    {
        import std.utf : byDchar;

        // Decode first
        return asUpperCase(str.byDchar);
    }
    else
    {
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
}

///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    assert("hEllo".asUpperCase.equal("HELLO"));
}

// explicitly undocumented
// overload for types that convert to a string: forwards the string form
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    return asLowerCase!(StringTypeOf!Range)(str);
}

// explicitly undocumented
// overload for types that convert to a string: forwards the string form
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    return asUpperCase!(StringTypeOf!Range)(str);
}

@safe unittest
{
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto a = func(TestAliasedString(s), args);
        auto b = func(s, args);
        static if (is(typeof(equal(a, b))))
        {
            // For ranges, compare contents instead of object identity.
9508 return equal(a, b); 9509 } 9510 else 9511 { 9512 return a == b; 9513 } 9514 } 9515 assert(testAliasedString!asLowerCase("hEllo")); 9516 assert(testAliasedString!asUpperCase("hEllo")); 9517 assert(testAliasedString!asCapitalized("hEllo")); 9518 } 9519 9520 @safe unittest 9521 { 9522 import std.array : array; 9523 9524 auto a = "HELLo".asLowerCase; 9525 auto savea = a.save; 9526 auto s = a.array; 9527 assert(s == "hello"); 9528 s = savea.array; 9529 assert(s == "hello"); 9530 9531 string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"]; 9532 string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"]; 9533 9534 foreach (i, slwr; lower) 9535 { 9536 import std.utf : byChar; 9537 9538 auto sx = slwr.asUpperCase.byChar.array; 9539 assert(sx == toUpper(slwr)); 9540 auto sy = upper[i].asLowerCase.byChar.array; 9541 assert(sy == toLower(upper[i])); 9542 } 9543 9544 // Not necessary to call r.front 9545 for (auto r = lower[3].asUpperCase; !r.empty; r.popFront()) 9546 { 9547 } 9548 9549 import std.algorithm.comparison : equal; 9550 9551 "HELLo"w.asLowerCase.equal("hello"d); 9552 "HELLo"w.asUpperCase.equal("HELLO"d); 9553 "HELLo"d.asLowerCase.equal("hello"d); 9554 "HELLo"d.asUpperCase.equal("HELLO"d); 9555 9556 import std.utf : byChar; 9557 assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array); 9558 } 9559 9560 // generic capitalizer on whole range, returns range 9561 private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper, 9562 Range)(Range str) 9563 // Accept range of dchar's 9564 if (isInputRange!Range && 9565 isSomeChar!(ElementEncodingType!Range) && 9566 ElementEncodingType!Range.sizeof == dchar.sizeof) 9567 { 9568 static struct ToCapitalizerImpl 9569 { 9570 @property bool empty() 9571 { 9572 return lower ? 
lwr.empty : !nLeft && r.empty; 9573 } 9574 9575 @property auto front() 9576 { 9577 if (lower) 9578 return lwr.front; 9579 9580 if (!nLeft) 9581 { 9582 immutable dchar c = r.front; 9583 const idx = indexFnUpper(c); 9584 if (idx == ushort.max) 9585 { 9586 buf[0] = c; 9587 nLeft = 1; 9588 } 9589 else if (idx < maxIdxUpper) 9590 { 9591 buf[0] = tableFnUpper(idx); 9592 nLeft = 1; 9593 } 9594 else 9595 { 9596 immutable val = tableFnUpper(idx); 9597 // unpack length + codepoint 9598 nLeft = val >> 24; 9599 if (nLeft == 0) 9600 nLeft = 1; 9601 assert(nLeft <= buf.length); 9602 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9603 foreach (j; 1 .. nLeft) 9604 buf[nLeft - j - 1] = tableFnUpper(idx + j); 9605 } 9606 } 9607 return buf[nLeft - 1]; 9608 } 9609 9610 void popFront() 9611 { 9612 if (lower) 9613 lwr.popFront(); 9614 else 9615 { 9616 if (!nLeft) 9617 front; 9618 assert(nLeft); 9619 --nLeft; 9620 if (!nLeft) 9621 { 9622 r.popFront(); 9623 lwr = r.asLowerCase(); 9624 lower = true; 9625 } 9626 } 9627 } 9628 9629 static if (isForwardRange!Range) 9630 { 9631 @property auto save() 9632 { 9633 auto ret = this; 9634 ret.r = r.save; 9635 ret.lwr = lwr.save; 9636 return ret; 9637 } 9638 } 9639 9640 private: 9641 Range r; 9642 typeof(r.asLowerCase) lwr; // range representing the lower case rest of string 9643 bool lower = false; // false for first character, true for rest of string 9644 dchar[3] buf = void; 9645 uint nLeft = 0; 9646 } 9647 9648 return ToCapitalizerImpl(str); 9649 } 9650 9651 /********************* 9652 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9653 * or string, meaning convert the first 9654 * character to upper case and subsequent characters to lower case. 9655 * 9656 * Does not allocate memory. 9657 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9658 * are treated as $(REF replacementDchar, std,utf). 
9659 * 9660 * Params: 9661 * str = string or range of characters 9662 * 9663 * Returns: 9664 * an InputRange of dchars 9665 * 9666 * See_Also: 9667 * $(LREF toUpper), $(LREF toLower) 9668 * $(LREF asUpperCase), $(LREF asLowerCase) 9669 */ 9670 9671 auto asCapitalized(Range)(Range str) 9672 if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) && 9673 !isConvertibleToString!Range) 9674 { 9675 static if (ElementEncodingType!Range.sizeof < dchar.sizeof) 9676 { 9677 import std.utf : byDchar; 9678 9679 // Decode first 9680 return toCapitalizer!UpperTriple(str.byDchar); 9681 } 9682 else 9683 { 9684 return toCapitalizer!UpperTriple(str); 9685 } 9686 } 9687 9688 /// 9689 @safe pure unittest 9690 { 9691 import std.algorithm.comparison : equal; 9692 9693 assert("hEllo".asCapitalized.equal("Hello")); 9694 } 9695 9696 auto asCapitalized(Range)(auto ref Range str) 9697 if (isConvertibleToString!Range) 9698 { 9699 import std.traits : StringTypeOf; 9700 return asCapitalized!(StringTypeOf!Range)(str); 9701 } 9702 9703 @safe pure nothrow @nogc unittest 9704 { 9705 auto r = "hEllo".asCapitalized(); 9706 assert(r.front == 'H'); 9707 } 9708 9709 @safe unittest 9710 { 9711 import std.array : array; 9712 9713 auto a = "hELLo".asCapitalized; 9714 auto savea = a.save; 9715 auto s = a.array; 9716 assert(s == "Hello"); 9717 s = savea.array; 9718 assert(s == "Hello"); 9719 9720 string[2][] cases = 9721 [ 9722 ["", ""], 9723 ["h", "H"], 9724 ["H", "H"], 9725 ["3", "3"], 9726 ["123", "123"], 9727 ["h123A", "H123a"], 9728 ["феж", "Феж"], 9729 ["\u1Fe2", "\u03a5\u0308\u0300"], 9730 ]; 9731 9732 foreach (i; 0 .. 
cases.length) 9733 { 9734 import std.utf : byChar; 9735 9736 auto r = cases[i][0].asCapitalized.byChar.array; 9737 auto result = cases[i][1]; 9738 assert(r == result); 9739 } 9740 9741 // Don't call r.front 9742 for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront()) 9743 { 9744 } 9745 9746 import std.algorithm.comparison : equal; 9747 9748 "HELLo"w.asCapitalized.equal("Hello"d); 9749 "hElLO"w.asCapitalized.equal("Hello"d); 9750 "hello"d.asCapitalized.equal("Hello"d); 9751 "HELLO"d.asCapitalized.equal("Hello"d); 9752 9753 import std.utf : byChar; 9754 assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array); 9755 } 9756 9757 // TODO: helper, I wish std.utf was more flexible (and stright) 9758 private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9759 { 9760 if (c <= 0x7F) 9761 { 9762 buf[idx] = cast(char) c; 9763 idx++; 9764 } 9765 else if (c <= 0x7FF) 9766 { 9767 buf[idx] = cast(char)(0xC0 | (c >> 6)); 9768 buf[idx+1] = cast(char)(0x80 | (c & 0x3F)); 9769 idx += 2; 9770 } 9771 else if (c <= 0xFFFF) 9772 { 9773 buf[idx] = cast(char)(0xE0 | (c >> 12)); 9774 buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9775 buf[idx+2] = cast(char)(0x80 | (c & 0x3F)); 9776 idx += 3; 9777 } 9778 else if (c <= 0x10FFFF) 9779 { 9780 buf[idx] = cast(char)(0xF0 | (c >> 18)); 9781 buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 9782 buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9783 buf[idx+3] = cast(char)(0x80 | (c & 0x3F)); 9784 idx += 4; 9785 } 9786 else 9787 assert(0); 9788 return idx; 9789 } 9790 9791 @safe unittest 9792 { 9793 char[] s = "abcd".dup; 9794 size_t i = 0; 9795 i = encodeTo(s, i, 'X'); 9796 assert(s == "Xbcd"); 9797 9798 i = encodeTo(s, i, cast(dchar)'\u00A9'); 9799 assert(s == "X\xC2\xA9d"); 9800 } 9801 9802 // TODO: helper, I wish std.utf was more flexible (and stright) 9803 private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure 9804 { 9805 import std.utf : 
UTFException; 9806 if (c <= 0xFFFF) 9807 { 9808 if (0xD800 <= c && c <= 0xDFFF) 9809 throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c); 9810 buf[idx] = cast(wchar) c; 9811 idx++; 9812 } 9813 else if (c <= 0x10FFFF) 9814 { 9815 buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 9816 buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); 9817 idx += 2; 9818 } 9819 else 9820 assert(0); 9821 return idx; 9822 } 9823 9824 private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9825 { 9826 buf[idx] = c; 9827 idx++; 9828 return idx; 9829 } 9830 9831 private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure 9832 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9833 { 9834 import std.utf : decode, codeLength; 9835 size_t curIdx = 0; 9836 size_t destIdx = 0; 9837 alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn); 9838 size_t lastUnchanged = 0; 9839 // in-buffer move of bytes to a new start index 9840 // the trick is that it may not need to copy at all 9841 static size_t moveTo(C[] str, size_t dest, size_t from, size_t to) 9842 { 9843 // Interestingly we may just bump pointer for a while 9844 // then have to copy if a re-cased char was smaller the original 9845 // later we may regain pace with char that got bigger 9846 // In the end it sometimes flip-flops between the 2 cases below 9847 if (dest == from) 9848 return to; 9849 // got to copy 9850 foreach (C c; str[from .. 
to]) 9851 str[dest++] = c; 9852 return dest; 9853 } 9854 while (curIdx != s.length) 9855 { 9856 size_t startIdx = curIdx; 9857 immutable ch = decode(s, curIdx); 9858 // TODO: special case for ASCII 9859 immutable caseIndex = indexFn(ch); 9860 if (caseIndex == ushort.max) // unchanged, skip over 9861 { 9862 continue; 9863 } 9864 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9865 { 9866 // previous cased chars had the same length as uncased ones 9867 // thus can just adjust pointer 9868 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9869 lastUnchanged = curIdx; 9870 immutable cased = tableFn(caseIndex); 9871 immutable casedLen = codeLength!C(cased); 9872 if (casedLen + destIdx > curIdx) // no place to fit cased char 9873 { 9874 // switch to slow codepath, where we allocate 9875 return slowToCase(s, startIdx, destIdx); 9876 } 9877 else 9878 { 9879 destIdx = encodeTo(s, destIdx, cased); 9880 } 9881 } 9882 else // 1:m codepoint mapping, slow codepath 9883 { 9884 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9885 lastUnchanged = curIdx; 9886 return slowToCase(s, startIdx, destIdx); 9887 } 9888 assert(destIdx <= curIdx); 9889 } 9890 if (lastUnchanged != s.length) 9891 { 9892 destIdx = moveTo(s, destIdx, lastUnchanged, s.length); 9893 } 9894 s = s[0 .. 
destIdx]; 9895 } 9896 9897 // helper to precalculate size of case-converted string 9898 private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn) 9899 { 9900 size_t toCaseLength(C)(const scope C[] str) 9901 { 9902 import std.utf : decode, codeLength; 9903 size_t codeLen = 0; 9904 size_t lastNonTrivial = 0; 9905 size_t curIdx = 0; 9906 while (curIdx != str.length) 9907 { 9908 immutable startIdx = curIdx; 9909 immutable ch = decode(str, curIdx); 9910 immutable ushort caseIndex = indexFn(ch); 9911 if (caseIndex == ushort.max) 9912 continue; 9913 else if (caseIndex < maxIdx) 9914 { 9915 codeLen += startIdx - lastNonTrivial; 9916 lastNonTrivial = curIdx; 9917 immutable cased = tableFn(caseIndex); 9918 codeLen += codeLength!C(cased); 9919 } 9920 else 9921 { 9922 codeLen += startIdx - lastNonTrivial; 9923 lastNonTrivial = curIdx; 9924 immutable val = tableFn(caseIndex); 9925 immutable len = val >> 24; 9926 immutable dchar cased = val & 0xFF_FFFF; 9927 codeLen += codeLength!C(cased); 9928 foreach (j; caseIndex+1 .. caseIndex+len) 9929 codeLen += codeLength!C(tableFn(j)); 9930 } 9931 } 9932 if (lastNonTrivial != str.length) 9933 codeLen += str.length - lastNonTrivial; 9934 return codeLen; 9935 } 9936 } 9937 9938 @safe unittest 9939 { 9940 alias toLowerLength = toCaseLength!(LowerTriple); 9941 assert(toLowerLength("abcd") == 4); 9942 assert(toLowerLength("аБВгд456") == 10+3); 9943 } 9944 9945 // slower code path that preallocates and then copies 9946 // case-converted stuf to the new string 9947 private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn) 9948 { 9949 void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx, 9950 size_t destIdx) @trusted pure 9951 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9952 { 9953 import std.utf : decode; 9954 alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn); 9955 auto trueLength = destIdx + caseLength(s[curIdx..$]); 9956 C[] ns = new C[trueLength]; 9957 ns[0 .. destIdx] = s[0 .. 
destIdx]; 9958 size_t lastUnchanged = curIdx; 9959 while (curIdx != s.length) 9960 { 9961 immutable startIdx = curIdx; // start of current codepoint 9962 immutable ch = decode(s, curIdx); 9963 immutable caseIndex = indexFn(ch); 9964 if (caseIndex == ushort.max) // skip over 9965 { 9966 continue; 9967 } 9968 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9969 { 9970 immutable cased = tableFn(caseIndex); 9971 auto toCopy = startIdx - lastUnchanged; 9972 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9973 lastUnchanged = curIdx; 9974 destIdx += toCopy; 9975 destIdx = encodeTo(ns, destIdx, cased); 9976 } 9977 else // 1:m codepoint mapping, slow codepath 9978 { 9979 auto toCopy = startIdx - lastUnchanged; 9980 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9981 lastUnchanged = curIdx; 9982 destIdx += toCopy; 9983 auto val = tableFn(caseIndex); 9984 // unpack length + codepoint 9985 immutable uint len = val >> 24; 9986 destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF)); 9987 foreach (j; caseIndex+1 .. caseIndex+len) 9988 destIdx = encodeTo(ns, destIdx, tableFn(j)); 9989 } 9990 } 9991 if (lastUnchanged != s.length) 9992 { 9993 auto toCopy = s.length - lastUnchanged; 9994 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$]; 9995 destIdx += toCopy; 9996 } 9997 assert(ns.length == destIdx); 9998 s = ns; 9999 } 10000 } 10001 10002 /++ 10003 Converts `s` to lowercase (by performing Unicode lowercase mapping) in place. 10004 For a few characters string length may increase after the transformation, 10005 in such a case the function reallocates exactly once. 10006 If `s` does not have any uppercase characters, then `s` is unaltered. 
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // All the work is done by the generic in-place converter,
    // parameterized with the lower-case tables.
    toCaseInPlace!LowerTriple(s);
}

// Non-template overloads for the three built-in character types;
// they exist to reduce compile time for the most common cases.
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }

    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }

    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}

/++
    Converts `s` to uppercase (by performing Unicode uppercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any lowercase characters, then `s` is unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // Same generic converter, parameterized with the upper-case tables.
    toCaseInPlace!UpperTriple(s);
}

// Non-template overloads for the three built-in character types;
// they exist to reduce compile time and code size for the most common cases.
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }

    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }

    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}

/++
    If `c` is a Unicode uppercase $(CHARACTER), then its lowercase equivalent
    is returned. Otherwise `c` is returned.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    // Everything from U+00AA upwards is answered by the simple-case table;
    // an index of ushort.max means `c` has no lowercase mapping.
    if (c >= 0xAA)
    {
        immutable size_t tableIdx = toLowerSimpleIndex(c);
        if (tableIdx == ushort.max)
            return c;
        return toLowerTab(tableIdx);
    }

    // Fast path: below U+00AA only ASCII 'A' .. 'Z' are changed.
    if ('A' <= c && c <= 'Z')
        return c + 32;
    return c;
}

/++
    Creates a new array which is identical to `s` except that all of its
    characters are converted to lowercase (by performing Unicode lowercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s` is a
    `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    // full Unicode mapping, with std.ascii.toLower supplied for ASCII
    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

// Non-template overloads for the three built-in string types;
// they exist to reduce compile time for the most common cases.
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    {
        return toLower!string(s);
    }

    wstring toLower(return scope wstring s)
    {
        return toLower!wstring(s);
    }

    dstring toLower(return scope dstring s)
    {
        return toLower!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // A type that converts to string through `alias this` must still be
        // accepted; `foo` only has to compile, it is never called.

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toLower(String(""));
        }
    }
}

// Single-code-point toLower agrees with std.ascii on ASCII and maps every
// upper-case code point either to itself or to a lower-case code point.
@safe unittest
{
    static import std.ascii;
    import std.format : format;
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toLower(ch) == toLower(ch));
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');
    foreach (ch; unicode.upperCase.byCodepoint)
    {
        dchar low = ch.toLower();
        assert(low == ch || isLower(low), format("%s -> %s", ch, low));
    }
    assert(toLower("АЯ") == "ая");

    // sharp s: the capital form lower-cases 1:1, the small form upper-cases 1:2
    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}

// https://issues.dlang.org/show_bug.cgi?id=9629
// Converting a slice in place must be visible through the enclosing array.
@safe unittest
{
    wchar[] test = "hello þ world"w.dup;
    auto piece = test[6 .. 7];
    toUpperInPlace(piece);
    assert(test == "hello Þ world");
}


// toLower and toLowerInPlace must agree with each other.
@safe unittest
{
    import std.algorithm.comparison : cmp;
    string s1 = "FoL";
    string s2 = toLower(s1);
    assert(cmp(s2, "fol") == 0, s2);
    assert(s2 != s1);

    char[] s3 = s1.dup;
    toLowerInPlace(s3);
    assert(s3 == s2);

    // two-byte UTF-8 sequences
    s1 = "A\u0100B\u0101d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0101b\u0101d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "A\u0460B\u0461d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0461b\u0461d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // 1:m mapping: a single code point lower-cases to two
    s1 = "\u0130";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(s2 == "i\u0307");
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar c = 'İ'; // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}

// toLower on random access ranges of code units
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;
    auto r1 = "FoL".byCodeUnit;
    assert(r1.toLower.cmp("fol") == 0);
    auto r2 = "A\u0460B\u0461d".byCodeUnit;
    assert(r2.toLower.cmp("a\u0461b\u0461d") == 0);
}

/++
    If `c` is a Unicode lowercase $(CHARACTER), then its uppercase equivalent
    is returned. Otherwise `c` is returned.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toUpper which takes full string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    // Everything from U+00AA upwards is answered by the simple-case table;
    // an index of ushort.max means `c` has no uppercase mapping.
    if (c >= 0xAA)
    {
        immutable size_t tableIdx = toUpperSimpleIndex(c);
        if (tableIdx == ushort.max)
            return c;
        return toUpperTab(tableIdx);
    }

    // Fast path: below U+00AA only ASCII 'a' .. 'z' are changed.
    if ('a' <= c && c <= 'z')
        return c - 32;
    return c;
}

///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    auto abuf = appender!(char[])();
    "hello".map!toUpper.copy(abuf);
    assert(abuf.data == "HELLO");
}

// Single-code-point toUpper agrees with std.ascii on ASCII and maps every
// lower-case code point to itself, to an upper-case or to a title-case one.
@safe unittest
{
    static import std.ascii;
    import std.format : format;
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toUpper(ch) == toUpper(ch));
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');
    auto title = unicode.Titlecase_Letter;
    foreach (ch; unicode.lowerCase.byCodepoint)
    {
        dchar up = ch.toUpper();
        assert(up == ch || isUpper(up) || title[up],
            format("%x -> %x", ch, up));
    }
}

/++
    Allocates a new array which is identical to `s` except that all of its
    characters are converted to uppercase (by performing Unicode uppercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s`
    is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    // full Unicode mapping, with std.ascii.toUpper supplied for ASCII
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    { return toUpper!string(s); }
    wstring toUpper(return scope wstring s)
    { return toUpper!wstring(s); }
    dstring toUpper(return scope dstring s)
    { return toUpper!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // A type that converts to string through `alias this` must still be
        // accepted; `foo` only has to compile, it is never called.

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toUpper(String(""));
        }
    }
}

// toUpper and toUpperInPlace must agree with each other.
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string s1 = "FoL";
    string s2;
    char[] s3;

    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2, s3);
    assert(cmp(s2, "FOL") == 0);
    assert(s2 !is s1);

    s1 = "a\u0100B\u0101d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0100B\u0100D") == 0);
    assert(s2 !is s1);

    s1 = "a\u0460B\u0461d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0460B\u0460D") == 0);
    assert(s2 !is s1);
}

@safe unittest
{
    // Checks the allocating and the in-place conversions of `s` against the
    // expected upper- and lower-case forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three range specifiers: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // `diff` needs all three arguments: with only two, building the
        // failure message would itself throw instead of reporting the mismatch.
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // these two convert without growing, so the array must stay the same
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
            foreach (j; i .. options.length)
                foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}

// test random access ranges
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;
    auto s1 = "FoL".byCodeUnit;
    assert(s1.toUpper.cmp("FOL") == 0);
    auto s2 = "a\u0460B\u0461d".byCodeUnit;
    assert(s2.toUpper.cmp("A\u0460B\u0460D") == 0);
}

/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER)
    (Unicode property: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // optimization: below U+00AA only the ASCII letters qualify
    if (c < 0xAA)
    {
        return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
    }

    return alphaTrie[c];
}

@safe unittest
{
    auto alpha = unicode("Alphabetic");
    foreach (ch; alpha.byCodepoint)
        assert(isAlpha(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in alpha) == isAlpha(ch));
}


/++
    Returns whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    return markTrie[c];
}

@safe unittest
{
    auto mark = unicode("Mark");
    foreach (ch; mark.byCodepoint)
        assert(isMark(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in mark) == isMark(ch));
}

/++
    Returns whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // Non-ASCII code points are answered by the generated trie.
    if (c > 0x7F)
        return numberTrie[c];

    // ASCII: only the decimal digits are numbers.
    return '0' <= c && c <= '9';
}

@safe unittest
{
    auto n = unicode("N");
    foreach (ch; n.byCodepoint)
        assert(isNumber(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in n) == isNumber(ch));
}

/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // Outside ASCII combine the two Unicode predicates ...
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // ... inside ASCII defer to std.ascii.
    return std.ascii.isAlphaNum(c);
}

@safe unittest
{
    auto n = unicode("N");
    auto alpha = unicode("Alphabetic");

    foreach (ch; n.byCodepoint)
        assert(isAlphaNum(ch));

    foreach (ch; alpha.byCodepoint)
        assert(isAlphaNum(ch));

    foreach (ch; 0 .. 0x4000)
    {
        assert(((ch in n) || (ch in alpha)) == isAlphaNum(ch));
    }
}

/++
    Returns whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // Non-ASCII code points are answered by the generated trie.
    if (c > 0x7F)
        return punctuationTrie[c];

    // ASCII: defer to std.ascii.
    return std.ascii.isPunctuation(c);
}

@safe unittest
{
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));
    foreach (ch; unicode("P").byCodepoint)
        assert(isPunctuation(ch));
}

/++
    Returns whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    return symbolTrie[c];
}

@safe unittest
{
    import std.format : format;
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));
    foreach (ch; unicode("S").byCodepoint)
        assert(isSymbol(ch), format("%04x", ch));
}

/++
    Returns whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    import std.internal.unicode_tables : isSpaceGen; // generated file
    return isSpaceGen(c);
}

@safe unittest
{
    assert(isSpace('\u0020'));
    auto space = unicode.Zs;
    foreach (ch; space.byCodepoint)
        assert(isSpace(ch));
    foreach (ch; 0 .. 0x1000)
        assert(isSpace(ch) == space[ch]);
}


/++
    Returns whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    return graphicalTrie[c];
}


@safe unittest
{
    auto set = unicode("Graphical");
    import std.format : format;
    foreach (ch; set.byCodepoint)
        assert(isGraphical(ch), format("%4x", ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in set) == isGraphical(ch));
}


/++
    Returns whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    import std.internal.unicode_tables : isControlGen; // generated file
    return isControlGen(c);
}

@safe unittest
{
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));
    auto cc = unicode.Cc;
    foreach (ch; cc.byCodepoint)
        assert(isControl(ch));
    foreach (ch; 0 .. 0x1000)
        assert(isControl(ch) == cc[ch]);
}


/++
    Returns whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    import std.internal.unicode_tables : isFormatGen; // generated file
    return isFormatGen(c);
}


@safe unittest
{
    assert(isFormat('\u00AD'));
    foreach (ch; unicode("Format").byCodepoint)
        assert(isFormat(ch));
}

// code points for private use, surrogates are not likely to change in near future
// if need be they can be generated from unicode data as well

/++
    Returns whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // Private Use Area in the BMP
    if (0x00_E000 <= c && c <= 0x00_F8FF)
        return true;
    // Supplementary Private Use Area-A (plane 15)
    if (0x0F_0000 <= c && c <= 0x0F_FFFD)
        return true;
    // Supplementary Private Use Area-B (plane 16)
    return 0x10_0000 <= c && c <= 0x10_FFFD;
}

/++
    Returns whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    return c >= 0xD800 && c <= 0xDFFF;
}

/++
    Returns whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    return c >= 0xD800 && c <= 0xDBFF;
}

/++
    Returns whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    return c >= 0xDC00 && c <= 0xDFFF;
}

/++
    Returns whether `c` is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    return nonCharacterTrie[c];
}

@safe unittest
{
    auto set = unicode("Cn");
    foreach (ch; set.byCodepoint)
        assert(isNonCharacter(ch));
}

private:
// load static data from pre-generated tables into usable datastructures


// Builds a CodepointSet from the compressed interval list `compressed`.
auto asSet(const (ubyte)[] compressed) @safe pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}

// Wraps the offsets, sizes and data of a pre-generated TrieEntry in a
// const CodepointTrie.
auto asTrie(T...)(const scope TrieEntry!T e) @safe pure nothrow
{
    alias Trie = const(CodepointTrie!T);
    return Trie(e.offsets, e.sizes, e.data);
}

@safe pure nothrow @nogc @property
{
    // It's important to use auto return here, so that the compiler
    // only runs semantic on the return type if the function gets
    // used.
Also these are functions rather than templates to not 10771 // increase the object size of the caller. 10772 auto lowerCaseTrie() { static immutable res = asTrie(lowerCaseTrieEntries); return res; } 10773 auto upperCaseTrie() { static immutable res = asTrie(upperCaseTrieEntries); return res; } 10774 auto simpleCaseTrie() { static immutable res = asTrie(simpleCaseTrieEntries); return res; } 10775 auto fullCaseTrie() { static immutable res = asTrie(fullCaseTrieEntries); return res; } 10776 auto alphaTrie() { static immutable res = asTrie(alphaTrieEntries); return res; } 10777 auto markTrie() { static immutable res = asTrie(markTrieEntries); return res; } 10778 auto numberTrie() { static immutable res = asTrie(numberTrieEntries); return res; } 10779 auto punctuationTrie() { static immutable res = asTrie(punctuationTrieEntries); return res; } 10780 auto symbolTrie() { static immutable res = asTrie(symbolTrieEntries); return res; } 10781 auto graphicalTrie() { static immutable res = asTrie(graphicalTrieEntries); return res; } 10782 auto nonCharacterTrie() { static immutable res = asTrie(nonCharacterTrieEntries); return res; } 10783 10784 //normalization quick-check tables 10785 auto nfcQCTrie() 10786 { 10787 import std.internal.unicode_norm : nfcQCTrieEntries; 10788 static immutable res = asTrie(nfcQCTrieEntries); 10789 return res; 10790 } 10791 10792 auto nfdQCTrie() 10793 { 10794 import std.internal.unicode_norm : nfdQCTrieEntries; 10795 static immutable res = asTrie(nfdQCTrieEntries); 10796 return res; 10797 } 10798 10799 auto nfkcQCTrie() 10800 { 10801 import std.internal.unicode_norm : nfkcQCTrieEntries; 10802 static immutable res = asTrie(nfkcQCTrieEntries); 10803 return res; 10804 } 10805 10806 auto nfkdQCTrie() 10807 { 10808 import std.internal.unicode_norm : nfkdQCTrieEntries; 10809 static immutable res = asTrie(nfkdQCTrieEntries); 10810 return res; 10811 } 10812 10813 //grapheme breaking algorithm tables 10814 auto spacingMarkTrie() 10815 { 10816 import 
std.internal.unicode_grapheme : spacingMarkTrieEntries; 10817 static immutable res = asTrie(spacingMarkTrieEntries); 10818 return res; 10819 } 10820 10821 auto graphemeExtendTrie() 10822 { 10823 import std.internal.unicode_grapheme : graphemeExtendTrieEntries; 10824 static immutable res = asTrie(graphemeExtendTrieEntries); 10825 return res; 10826 } 10827 10828 auto hangLV() 10829 { 10830 import std.internal.unicode_grapheme : hangulLVTrieEntries; 10831 static immutable res = asTrie(hangulLVTrieEntries); 10832 return res; 10833 } 10834 10835 auto hangLVT() 10836 { 10837 import std.internal.unicode_grapheme : hangulLVTTrieEntries; 10838 static immutable res = asTrie(hangulLVTTrieEntries); 10839 return res; 10840 } 10841 10842 auto prependTrie() 10843 { 10844 import std.internal.unicode_grapheme : prependTrieEntries; 10845 static immutable res = asTrie(prependTrieEntries); 10846 return res; 10847 } 10848 10849 auto graphemeControlTrie() 10850 { 10851 import std.internal.unicode_grapheme : controlTrieEntries; 10852 static immutable res = asTrie(controlTrieEntries); 10853 return res; 10854 } 10855 10856 auto xpictoTrie() 10857 { 10858 import std.internal.unicode_grapheme : Extended_PictographicTrieEntries; 10859 static immutable res = asTrie(Extended_PictographicTrieEntries); 10860 return res; 10861 } 10862 10863 // tables below are used for composition/decomposition 10864 auto combiningClassTrie() 10865 { 10866 import std.internal.unicode_comp : combiningClassTrieEntries; 10867 static immutable res = asTrie(combiningClassTrieEntries); 10868 return res; 10869 } 10870 10871 auto compatMappingTrie() 10872 { 10873 import std.internal.unicode_decomp : compatMappingTrieEntries; 10874 static immutable res = asTrie(compatMappingTrieEntries); 10875 return res; 10876 } 10877 10878 auto canonMappingTrie() 10879 { 10880 import std.internal.unicode_decomp : canonMappingTrieEntries; 10881 static immutable res = asTrie(canonMappingTrieEntries); 10882 return res; 10883 } 10884 10885 
auto compositionJumpTrie() 10886 { 10887 import std.internal.unicode_comp : compositionJumpTrieEntries; 10888 static immutable res = asTrie(compositionJumpTrieEntries); 10889 return res; 10890 } 10891 10892 //case conversion tables 10893 auto toUpperIndexTrie() { static immutable res = asTrie(toUpperIndexTrieEntries); return res; } 10894 auto toLowerIndexTrie() { static immutable res = asTrie(toLowerIndexTrieEntries); return res; } 10895 auto toTitleIndexTrie() { static immutable res = asTrie(toTitleIndexTrieEntries); return res; } 10896 //simple case conversion tables 10897 auto toUpperSimpleIndexTrie() { static immutable res = asTrie(toUpperSimpleIndexTrieEntries); return res; } 10898 auto toLowerSimpleIndexTrie() { static immutable res = asTrie(toLowerSimpleIndexTrieEntries); return res; } 10899 auto toTitleSimpleIndexTrie() { static immutable res = asTrie(toTitleSimpleIndexTrieEntries); return res; } 10900 10901 } 10902 10903 }// version (!std_uni_bootstrap)