1 // Written in the D programming language. 2 3 /++ 4 $(P The `std.uni` module provides an implementation 5 of fundamental Unicode algorithms and data structures. 6 This doesn't include UTF encoding and decoding primitives, 7 see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf) 8 for this functionality. ) 9 10 $(SCRIPT inhibitQuickIndex = 1;) 11 $(DIVC quickindex, 12 $(BOOKTABLE, 13 $(TR $(TH Category) $(TH Functions)) 14 $(TR $(TD Decode) $(TD 15 $(LREF byCodePoint) 16 $(LREF byGrapheme) 17 $(LREF decodeGrapheme) 18 $(LREF graphemeStride) 19 )) 20 $(TR $(TD Comparison) $(TD 21 $(LREF icmp) 22 $(LREF sicmp) 23 )) 24 $(TR $(TD Classification) $(TD 25 $(LREF isAlpha) 26 $(LREF isAlphaNum) 27 $(LREF isCodepointSet) 28 $(LREF isControl) 29 $(LREF isFormat) 30 $(LREF isGraphical) 31 $(LREF isIntegralPair) 32 $(LREF isMark) 33 $(LREF isNonCharacter) 34 $(LREF isNumber) 35 $(LREF isPrivateUse) 36 $(LREF isPunctuation) 37 $(LREF isSpace) 38 $(LREF isSurrogate) 39 $(LREF isSurrogateHi) 40 $(LREF isSurrogateLo) 41 $(LREF isSymbol) 42 $(LREF isWhite) 43 )) 44 $(TR $(TD Normalization) $(TD 45 $(LREF NFC) 46 $(LREF NFD) 47 $(LREF NFKD) 48 $(LREF NormalizationForm) 49 $(LREF normalize) 50 )) 51 $(TR $(TD Decompose) $(TD 52 $(LREF decompose) 53 $(LREF decomposeHangul) 54 $(LREF UnicodeDecomposition) 55 )) 56 $(TR $(TD Compose) $(TD 57 $(LREF compose) 58 $(LREF composeJamo) 59 )) 60 $(TR $(TD Sets) $(TD 61 $(LREF CodepointInterval) 62 $(LREF CodepointSet) 63 $(LREF InversionList) 64 $(LREF unicode) 65 )) 66 $(TR $(TD Trie) $(TD 67 $(LREF codepointSetTrie) 68 $(LREF CodepointSetTrie) 69 $(LREF codepointTrie) 70 $(LREF CodepointTrie) 71 $(LREF toTrie) 72 $(LREF toDelegate) 73 )) 74 $(TR $(TD Casing) $(TD 75 $(LREF asCapitalized) 76 $(LREF asLowerCase) 77 $(LREF asUpperCase) 78 $(LREF isLower) 79 $(LREF isUpper) 80 $(LREF toLower) 81 $(LREF toLowerInPlace) 82 $(LREF toUpper) 83 $(LREF toUpperInPlace) 84 )) 85 $(TR $(TD Utf8Matcher) $(TD 86 $(LREF isUtfMatcher) 
87 $(LREF MatcherConcept) 88 $(LREF utfMatcher) 89 )) 90 $(TR $(TD Separators) $(TD 91 $(LREF lineSep) 92 $(LREF nelSep) 93 $(LREF paraSep) 94 )) 95 $(TR $(TD Building blocks) $(TD 96 $(LREF allowedIn) 97 $(LREF combiningClass) 98 $(LREF Grapheme) 99 )) 100 )) 101 102 $(P All primitives listed operate on Unicode characters and 103 sets of characters. For functions which operate on ASCII characters 104 and ignore Unicode $(CHARACTERS), see $(MREF std, ascii). 105 For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms 106 used throughout this module see the $(S_LINK Terminology, terminology) section 107 below. 108 ) 109 $(P The focus of this module is the core needs of developing Unicode-aware 110 applications. To that effect it provides the following optimized primitives: 111 ) 112 $(UL 113 $(LI Character classification by category and common properties: 114 $(LREF isAlpha), $(LREF isWhite) and others. 115 ) 116 $(LI 117 Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)). 118 ) 119 $(LI 120 Converting text to any of the four normalization forms via $(LREF normalize). 121 ) 122 $(LI 123 Decoding ($(LREF decodeGrapheme)) and iteration ($(LREF byGrapheme), $(LREF graphemeStride)) 124 by user-perceived characters, that is by $(LREF Grapheme) clusters. 125 ) 126 $(LI 127 Decomposing and composing of individual character(s) according to canonical 128 or compatibility rules, see $(LREF compose) and $(LREF decompose), 129 including the specific version for Hangul syllables $(LREF composeJamo) 130 and $(LREF decomposeHangul). 131 ) 132 ) 133 $(P It's recognized that an application may need further enhancements 134 and extensions, such as less commonly known algorithms, 135 or tailoring existing ones for region specific needs. To help users 136 with building any extra functionality beyond the core primitives, 137 the module provides: 138 ) 139 $(UL 140 $(LI 141 $(LREF CodepointSet), a type for easy manipulation of sets of characters. 
142 Besides the typical set algebra it provides an unusual feature: 143 a D source code generator for detection of $(CODEPOINTS) in this set. 144 This is a boon for meta-programming parser frameworks, 145 and is used internally to power classification in small 146 sets like $(LREF isWhite). 147 ) 148 $(LI 149 A way to construct optimal packed multi-stage tables also known as a 150 special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie). 151 The functions $(LREF codepointTrie), $(LREF codepointSetTrie) 152 construct custom tries that map dchar to value. 153 The end result is a fast and predictable $(BIGOH 1) lookup that powers 154 functions like $(LREF isAlpha) and $(LREF combiningClass), 155 but for user-defined data sets. 156 ) 157 $(LI 158 A useful technique for Unicode-aware parsers that perform 159 character classification of encoded $(CODEPOINTS) 160 is to avoid unnecessary decoding at all costs. 161 $(LREF utfMatcher) provides an improvement over the usual workflow 162 of decode-classify-process, combining the decoding and classification 163 steps. By extracting necessary bits directly from encoded 164 $(S_LINK Code unit, code units) matchers achieve 165 significant performance improvements. See $(LREF MatcherConcept) for 166 the common interface of UTF matchers. 167 ) 168 $(LI 169 Generally useful building blocks for customized normalization: 170 $(LREF combiningClass) for querying combining class 171 and $(LREF allowedIn) for testing the Quick_Check 172 property of a given normalization form. 173 ) 174 $(LI 175 Access to a large selection of commonly used sets of $(CODEPOINTS). 176 $(S_LINK Unicode properties, Supported sets) include Script, 177 Block and General Category. The exact contents of a set can be 178 observed in the CLDR utility, on the 179 $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page 180 of the Unicode website. 181 See $(LREF unicode) for easy and (optionally) compile-time checked set 182 queries. 
183 ) 184 ) 185 $(SECTION Synopsis) 186 --- 187 import std.uni; 188 void main() 189 { 190 // initialize code point sets using script/block or property name 191 // now 'set' contains code points from both scripts. 192 auto set = unicode("Cyrillic") | unicode("Armenian"); 193 // same thing but simpler and checked at compile-time 194 auto ascii = unicode.ASCII; 195 auto currency = unicode.Currency_Symbol; 196 197 // easy set ops 198 auto a = set & ascii; 199 assert(a.empty); // as it has no intersection with ascii 200 a = set | ascii; 201 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 202 203 // some properties of code point sets 204 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 205 // testing presence of a code point in a set 206 // is just fine, it is O(logN) 207 assert(!b['$']); 208 assert(!b['\u058F']); // Armenian dram sign 209 assert(b['¥']); 210 211 // building fast lookup tables, these guarantee O(1) complexity 212 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 213 auto oneTrie = toTrie!1(b); 214 // 2-level far more compact but typically slightly slower 215 auto twoTrie = toTrie!2(b); 216 // 3-level even smaller, and a bit slower yet 217 auto threeTrie = toTrie!3(b); 218 assert(oneTrie['£']); 219 assert(twoTrie['£']); 220 assert(threeTrie['£']); 221 222 // build the trie with the most sensible trie level 223 // and bind it as a functor 224 auto cyrillicOrArmenian = toDelegate(set); 225 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 226 assert(balance == "ընկեր!"); 227 // compatible with bool delegate(dchar) 228 bool delegate(dchar) bindIt = cyrillicOrArmenian; 229 230 // Normalization 231 string s = "Plain ascii (and not only), is always normalized!"; 232 assert(s is normalize(s));// is the same string 233 234 string nonS = "A\u0308ffin"; // A ligature 235 auto nS = normalize(nonS); // to NFC, the W3C endorsed standard 236 assert(nS == "Äffin"); 237 assert(nS != nonS); 238 string composed = 
"Äffin"; 239 240 assert(normalize!NFD(composed) == "A\u0308ffin"); 241 // to NFKD, compatibility decomposition useful for fuzzy matching/searching 242 assert(normalize!NFKD("2¹⁰") == "210"); 243 } 244 --- 245 $(SECTION Terminology) 246 $(P The following is a list of important Unicode notions 247 and definitions. Any conventions used specifically in this 248 module alone are marked as such. The descriptions are based on the formal 249 definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf, 250 chapter three of The Unicode Standard Core Specification.) 251 ) 252 $(P $(DEF Abstract character) A unit of information used for the organization, 253 control, or representation of textual data. 254 Note that: 255 $(UL 256 $(LI When representing data, the nature of that data 257 is generally symbolic as opposed to some other 258 kind of data (for example, visual). 259 ) 260 $(LI An abstract character has no concrete form 261 and should not be confused with a $(S_LINK Glyph, glyph). 262 ) 263 $(LI An abstract character does not necessarily 264 correspond to what a user thinks of as a “character” 265 and should not be confused with a $(LREF Grapheme). 266 ) 267 $(LI The abstract characters encoded (see Encoded character) 268 are known as Unicode abstract characters. 269 ) 270 $(LI Abstract characters not directly 271 encoded by the Unicode Standard can often be 272 represented by the use of combining character sequences. 273 ) 274 ) 275 ) 276 $(P $(DEF Canonical decomposition) 277 The decomposition of a character or character sequence 278 that results from recursively applying the canonical 279 mappings found in the Unicode Character Database 280 and these described in Conjoining Jamo Behavior 281 (section 12 of 282 $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)). 
283 ) 284 $(P $(DEF Canonical composition) 285 The precise definition of the Canonical composition 286 is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf, 287 Unicode Conformance) section 11. 288 Informally it's the process that does the reverse of the canonical 289 decomposition with the addition of certain rules 290 that e.g. prevent legacy characters from appearing in the composed result. 291 ) 292 $(P $(DEF Canonical equivalent) 293 Two character sequences are said to be canonical equivalents if 294 their full canonical decompositions are identical. 295 ) 296 $(P $(DEF Character) Typically differs by context. 297 For the purpose of this documentation the term $(I character) 298 implies $(I encoded character), that is, a code point having 299 an assigned abstract character (a symbolic meaning). 300 ) 301 $(P $(DEF Code point) Any value in the Unicode codespace; 302 that is, the range of integers from 0 to 10FFFF (hex). 303 Not all code points are assigned to encoded characters. 304 ) 305 $(P $(DEF Code unit) The minimal bit combination that can represent 306 a unit of encoded text for processing or interchange. 307 Depending on the encoding this could be: 308 8-bit code units in the UTF-8 (`char`), 309 16-bit code units in the UTF-16 (`wchar`), 310 and 32-bit code units in the UTF-32 (`dchar`). 311 $(I Note that in UTF-32, a code unit is a code point 312 and is represented by the D `dchar` type.) 313 ) 314 $(P $(DEF Combining character) A character with the General Category 315 of Combining Mark(M). 316 $(UL 317 $(LI All characters with non-zero canonical combining class 318 are combining characters, but the reverse is not the case: 319 there are combining characters with a zero combining class. 320 ) 321 $(LI These characters are not normally used in isolation 322 unless they are being described. They include such characters 323 as accents, diacritics, Hebrew points, Arabic vowel signs, 324 and Indic matras. 
325 ) 326 ) 327 ) 328 $(P $(DEF Combining class) 329 A numerical value used by the Unicode Canonical Ordering Algorithm 330 to determine which sequences of combining marks are to be 331 considered canonically equivalent and which are not. 332 ) 333 $(P $(DEF Compatibility decomposition) 334 The decomposition of a character or character sequence that results 335 from recursively applying both the compatibility mappings and 336 the canonical mappings found in the Unicode Character Database, and those 337 described in Conjoining Jamo Behavior, until no characters 338 can be further decomposed. 339 ) 340 $(P $(DEF Compatibility equivalent) 341 Two character sequences are said to be compatibility 342 equivalents if their full compatibility decompositions are identical. 343 ) 344 $(P $(DEF Encoded character) An association (or mapping) 345 between an abstract character and a code point. 346 ) 347 $(P $(DEF Glyph) The actual, concrete image of a glyph representation 348 having been rasterized or otherwise imaged onto some display surface. 349 ) 350 $(P $(DEF Grapheme base) A character with the property 351 Grapheme_Base, or any standard Korean syllable block. 352 ) 353 $(P $(DEF Grapheme cluster) Defined as the text between 354 grapheme boundaries as specified by Unicode Standard Annex #29, 355 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation). 356 Important general properties of a grapheme: 357 $(UL 358 $(LI The grapheme cluster represents a horizontally segmentable 359 unit of text, consisting of some grapheme base (which may 360 consist of a Korean syllable) together with any number of 361 nonspacing marks applied to it. 362 ) 363 $(LI A grapheme cluster typically starts with a grapheme base 364 and then extends across any subsequent sequence of nonspacing marks. 365 A grapheme cluster is most directly relevant to text rendering and 366 processes such as cursor placement and text selection in editing, 367 but may also be relevant to comparison and searching. 
368 ) 369 $(LI For many processes, a grapheme cluster behaves as if it was a 370 single character with the same properties as its grapheme base. 371 Effectively, nonspacing marks apply $(I graphically) to the base, 372 but do not change its properties. 373 ) 374 ) 375 $(P This module defines a number of primitives that work with graphemes: 376 $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride). 377 All of them are using $(I extended grapheme) boundaries 378 as defined in the aforementioned standard annex. 379 ) 380 ) 381 $(P $(DEF Nonspacing mark) A combining character with the 382 General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me). 383 ) 384 $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark. 385 ) 386 $(SECTION Normalization) 387 $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent) 388 or $(S_LINK Compatibility equivalent, compatibility equivalent) 389 characters in the Unicode Standard make it necessary to have a full, formal 390 definition of equivalence for Unicode strings. 391 String equivalence is determined by a process called normalization, 392 whereby strings are converted into forms which are compared 393 directly for identity. This is the primary goal of the normalization process, 394 see the function $(LREF normalize) to convert into any of 395 the four defined forms. 396 ) 397 $(P A very important attribute of the Unicode Normalization Forms 398 is that they must remain stable between versions of the Unicode Standard. 399 A Unicode string normalized to a particular Unicode Normalization Form 400 in one version of the standard is guaranteed to remain in that Normalization 401 Form for implementations of future versions of the standard. 402 ) 403 $(P The Unicode Standard specifies four normalization forms. 
404 Informally, two of these forms are defined by maximal decomposition 405 of equivalent sequences, and two of these forms are defined 406 by maximal $(I composition) of equivalent sequences. 407 $(UL 408 $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition, 409 canonical decomposition) of a character sequence.) 410 $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition, 411 compatibility decomposition) of a character sequence.) 412 $(LI Normalization Form C (NFC): The canonical composition of the 413 $(S_LINK Canonical decomposition, canonical decomposition) 414 of a coded character sequence.) 415 $(LI Normalization Form KC (NFKC): The canonical composition 416 of the $(S_LINK Compatibility decomposition, 417 compatibility decomposition) of a character sequence) 418 ) 419 ) 420 $(P The choice of the normalization form depends on the particular use case. 421 NFC is the best form for general text, since it's more compatible with 422 strings converted from legacy encodings. NFKC is the preferred form for 423 identifiers, especially where there are security concerns. NFD and NFKD 424 are the most useful for internal processing. 425 ) 426 $(SECTION Construction of lookup tables) 427 $(P The Unicode standard describes a set of algorithms that 428 depend on having the ability to quickly look up various properties 429 of a code point. Given the codespace of about 1 million $(CODEPOINTS), 430 it is not a trivial task to provide a space-efficient solution for 431 the multitude of properties. 432 ) 433 $(P Common approaches such as hash-tables or binary search over 434 sorted code point intervals (as in $(LREF InversionList)) are insufficient. 435 Hash-tables have enormous memory footprint and binary search 436 over intervals is not fast enough for some heavy-duty algorithms. 
437 ) 438 $(P The recommended solution (see Unicode Implementation Guidelines) 439 is using multi-stage tables that are an implementation of the 440 $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer 441 keys and a fixed number of stages. For the remainder of the section 442 this will be called a fixed trie. The following describes a particular 443 implementation that is aimed for the speed of access at the expense 444 of ideal size savings. 445 ) 446 $(P Taking a 2-level Trie as an example the principle of operation is as follows. 447 Split the number of bits in a key (code point, 21 bits) into 2 components 448 (e.g. 13 and 8). The first is the number of bits in the index of the trie 449 and the other is the number of bits in each page of the trie. 450 The layout of the trie is then an array of size 2^^bits-of-index followed 451 by an array of memory chunks of size 2^^bits-of-page/bits-per-element. 452 ) 453 $(P The number of pages is variable (but not less than 1) 454 unlike the number of entries in the index. The slots of the index 455 all have to contain a number of a page that is present. The lookup is then 456 just a couple of operations - slice the upper bits, 457 lookup an index for these, take a page at this index and use 458 the lower bits as an offset within this page. 459 460 Assuming that pages are laid out consecutively 461 in one array at `pages`, the pseudo-code is: 462 ) 463 --- 464 auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits; 465 pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)]; 466 --- 467 $(P Where if `elemsPerPage` is a power of 2 the whole process is 468 a handful of simple instructions and 2 array reads. Subsequent levels 469 of the trie are introduced by recursing on this notion - the index array 470 is treated as values. The number of bits in index is then again 471 split into 2 parts, with pages over 'current-index' and the new 'upper-index'. 
472 ) 473 474 $(P For completeness a level 1 trie is simply an array. 475 The current implementation takes advantage of bit-packing values 476 when the range is known to be limited in advance (such as `bool`). 477 See also $(LREF BitPacked) for enforcing it manually. 478 The major size advantage however comes from the fact 479 that multiple $(B identical pages on every level are merged) by construction. 480 ) 481 $(P The process of constructing a trie is more involved and is hidden from 482 the user in a form of the convenience functions $(LREF codepointTrie), 483 $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie). 484 In general a set or built-in AA with `dchar` type 485 can be turned into a trie. The trie object in this module 486 is read-only (immutable); it's effectively frozen after construction. 487 ) 488 $(SECTION Unicode properties) 489 $(P This is a full list of Unicode properties accessible through $(LREF unicode) 490 with specific helpers per category nested within. Consult the 491 $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility) 492 when in doubt about the contents of a particular set. 493 ) 494 $(P General category sets listed below are only accessible with the 495 $(LREF unicode) shorthand accessor.) 496 $(BOOKTABLE $(B General category ), 497 $(TR $(TH Abb.) $(TH Long form) 498 $(TH Abb.) $(TH Long form)$(TH Abb.) 
$(TH Long form)) 499 $(TR $(TD L) $(TD Letter) 500 $(TD Cn) $(TD Unassigned) $(TD Po) $(TD Other_Punctuation)) 501 $(TR $(TD Ll) $(TD Lowercase_Letter) 502 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation)) 503 $(TR $(TD Lm) $(TD Modifier_Letter) 504 $(TD Cs) $(TD Surrogate) $(TD S) $(TD Symbol)) 505 $(TR $(TD Lo) $(TD Other_Letter) 506 $(TD N) $(TD Number) $(TD Sc) $(TD Currency_Symbol)) 507 $(TR $(TD Lt) $(TD Titlecase_Letter) 508 $(TD Nd) $(TD Decimal_Number) $(TD Sk) $(TD Modifier_Symbol)) 509 $(TR $(TD Lu) $(TD Uppercase_Letter) 510 $(TD Nl) $(TD Letter_Number) $(TD Sm) $(TD Math_Symbol)) 511 $(TR $(TD M) $(TD Mark) 512 $(TD No) $(TD Other_Number) $(TD So) $(TD Other_Symbol)) 513 $(TR $(TD Mc) $(TD Spacing_Mark) 514 $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator)) 515 $(TR $(TD Me) $(TD Enclosing_Mark) 516 $(TD Pc) $(TD Connector_Punctuation) $(TD Zl) $(TD Line_Separator)) 517 $(TR $(TD Mn) $(TD Nonspacing_Mark) 518 $(TD Pd) $(TD Dash_Punctuation) $(TD Zp) $(TD Paragraph_Separator)) 519 $(TR $(TD C) $(TD Other) 520 $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator)) 521 $(TR $(TD Cc) $(TD Control) $(TD Pf) 522 $(TD Final_Punctuation) $(TD -) $(TD Any)) 523 $(TR $(TD Cf) $(TD Format) 524 $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII)) 525 ) 526 $(P Sets for other commonly useful properties that are 527 accessible with $(LREF unicode):) 528 $(BOOKTABLE $(B Common binary properties), 529 $(TR $(TH Name) $(TH Name) $(TH Name)) 530 $(TR $(TD Alphabetic) $(TD Ideographic) $(TD Other_Uppercase)) 531 $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax)) 532 $(TR $(TD Bidi_Control) $(TD ID_Start) $(TD Pattern_White_Space)) 533 $(TR $(TD Cased) $(TD IDS_Trinary_Operator) $(TD Quotation_Mark)) 534 $(TR $(TD Case_Ignorable) $(TD Join_Control) $(TD Radical)) 535 $(TR $(TD Dash) $(TD Logical_Order_Exception) $(TD Soft_Dotted)) 536 $(TR $(TD Default_Ignorable_Code_Point) $(TD Lowercase) $(TD STerm)) 537 $(TR $(TD 
Deprecated) $(TD Math) $(TD Terminal_Punctuation)) 538 $(TR $(TD Diacritic) $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph)) 539 $(TR $(TD Extender) $(TD Other_Alphabetic) $(TD Uppercase)) 540 $(TR $(TD Grapheme_Base) $(TD Other_Default_Ignorable_Code_Point) $(TD Variation_Selector)) 541 $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend) $(TD White_Space)) 542 $(TR $(TD Grapheme_Link) $(TD Other_ID_Continue) $(TD XID_Continue)) 543 $(TR $(TD Hex_Digit) $(TD Other_ID_Start) $(TD XID_Start)) 544 $(TR $(TD Hyphen) $(TD Other_Lowercase) ) 545 $(TR $(TD ID_Continue) $(TD Other_Math) ) 546 ) 547 $(P Below is the table with block names accepted by $(LREF unicode.block). 548 Note that the shorthand version $(LREF unicode) requires "In" 549 to be prepended to the names of blocks so as to disambiguate 550 scripts and blocks. 551 ) 552 $(BOOKTABLE $(B Blocks), 553 $(TR $(TD Aegean Numbers) $(TD Ethiopic Extended) $(TD Mongolian)) 554 $(TR $(TD Alchemical Symbols) $(TD Ethiopic Extended-A) $(TD Musical Symbols)) 555 $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement) $(TD Myanmar)) 556 $(TR $(TD Ancient Greek Musical Notation) $(TD General Punctuation) $(TD Myanmar Extended-A)) 557 $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes) $(TD New Tai Lue)) 558 $(TR $(TD Ancient Symbols) $(TD Georgian) $(TD NKo)) 559 $(TR $(TD Arabic) $(TD Georgian Supplement) $(TD Number Forms)) 560 $(TR $(TD Arabic Extended-A) $(TD Glagolitic) $(TD Ogham)) 561 $(TR $(TD Arabic Mathematical Alphabetic Symbols) $(TD Gothic) $(TD Ol Chiki)) 562 $(TR $(TD Arabic Presentation Forms-A) $(TD Greek and Coptic) $(TD Old Italic)) 563 $(TR $(TD Arabic Presentation Forms-B) $(TD Greek Extended) $(TD Old Persian)) 564 $(TR $(TD Arabic Supplement) $(TD Gujarati) $(TD Old South Arabian)) 565 $(TR $(TD Armenian) $(TD Gurmukhi) $(TD Old Turkic)) 566 $(TR $(TD Arrows) $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition)) 567 $(TR $(TD Avestan) $(TD Hangul 
Compatibility Jamo) $(TD Oriya)) 568 $(TR $(TD Balinese) $(TD Hangul Jamo) $(TD Osmanya)) 569 $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A) $(TD Phags-pa)) 570 $(TR $(TD Bamum Supplement) $(TD Hangul Jamo Extended-B) $(TD Phaistos Disc)) 571 $(TR $(TD Basic Latin) $(TD Hangul Syllables) $(TD Phoenician)) 572 $(TR $(TD Batak) $(TD Hanunoo) $(TD Phonetic Extensions)) 573 $(TR $(TD Bengali) $(TD Hebrew) $(TD Phonetic Extensions Supplement)) 574 $(TR $(TD Block Elements) $(TD High Private Use Surrogates) $(TD Playing Cards)) 575 $(TR $(TD Bopomofo) $(TD High Surrogates) $(TD Private Use Area)) 576 $(TR $(TD Bopomofo Extended) $(TD Hiragana) $(TD Rejang)) 577 $(TR $(TD Box Drawing) $(TD Ideographic Description Characters) $(TD Rumi Numeral Symbols)) 578 $(TR $(TD Brahmi) $(TD Imperial Aramaic) $(TD Runic)) 579 $(TR $(TD Braille Patterns) $(TD Inscriptional Pahlavi) $(TD Samaritan)) 580 $(TR $(TD Buginese) $(TD Inscriptional Parthian) $(TD Saurashtra)) 581 $(TR $(TD Buhid) $(TD IPA Extensions) $(TD Sharada)) 582 $(TR $(TD Byzantine Musical Symbols) $(TD Javanese) $(TD Shavian)) 583 $(TR $(TD Carian) $(TD Kaithi) $(TD Sinhala)) 584 $(TR $(TD Chakma) $(TD Kana Supplement) $(TD Small Form Variants)) 585 $(TR $(TD Cham) $(TD Kanbun) $(TD Sora Sompeng)) 586 $(TR $(TD Cherokee) $(TD Kangxi Radicals) $(TD Spacing Modifier Letters)) 587 $(TR $(TD CJK Compatibility) $(TD Kannada) $(TD Specials)) 588 $(TR $(TD CJK Compatibility Forms) $(TD Katakana) $(TD Sundanese)) 589 $(TR $(TD CJK Compatibility Ideographs) $(TD Katakana Phonetic Extensions) $(TD Sundanese Supplement)) 590 $(TR $(TD CJK Compatibility Ideographs Supplement) $(TD Kayah Li) $(TD Superscripts and Subscripts)) 591 $(TR $(TD CJK Radicals Supplement) $(TD Kharoshthi) $(TD Supplemental Arrows-A)) 592 $(TR $(TD CJK Strokes) $(TD Khmer) $(TD Supplemental Arrows-B)) 593 $(TR $(TD CJK Symbols and Punctuation) $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators)) 594 $(TR $(TD CJK Unified Ideographs) $(TD Lao) $(TD 
Supplemental Punctuation)) 595 $(TR $(TD CJK Unified Ideographs Extension A) $(TD Latin-1 Supplement) $(TD Supplementary Private Use Area-A)) 596 $(TR $(TD CJK Unified Ideographs Extension B) $(TD Latin Extended-A) $(TD Supplementary Private Use Area-B)) 597 $(TR $(TD CJK Unified Ideographs Extension C) $(TD Latin Extended Additional) $(TD Syloti Nagri)) 598 $(TR $(TD CJK Unified Ideographs Extension D) $(TD Latin Extended-B) $(TD Syriac)) 599 $(TR $(TD Combining Diacritical Marks) $(TD Latin Extended-C) $(TD Tagalog)) 600 $(TR $(TD Combining Diacritical Marks for Symbols) $(TD Latin Extended-D) $(TD Tagbanwa)) 601 $(TR $(TD Combining Diacritical Marks Supplement) $(TD Lepcha) $(TD Tags)) 602 $(TR $(TD Combining Half Marks) $(TD Letterlike Symbols) $(TD Tai Le)) 603 $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham)) 604 $(TR $(TD Control Pictures) $(TD Linear B Ideograms) $(TD Tai Viet)) 605 $(TR $(TD Coptic) $(TD Linear B Syllabary) $(TD Tai Xuan Jing Symbols)) 606 $(TR $(TD Counting Rod Numerals) $(TD Lisu) $(TD Takri)) 607 $(TR $(TD Cuneiform) $(TD Low Surrogates) $(TD Tamil)) 608 $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian) $(TD Telugu)) 609 $(TR $(TD Currency Symbols) $(TD Lydian) $(TD Thaana)) 610 $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai)) 611 $(TR $(TD Cyrillic) $(TD Malayalam) $(TD Tibetan)) 612 $(TR $(TD Cyrillic Extended-A) $(TD Mandaic) $(TD Tifinagh)) 613 $(TR $(TD Cyrillic Extended-B) $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols)) 614 $(TR $(TD Cyrillic Supplement) $(TD Mathematical Operators) $(TD Ugaritic)) 615 $(TR $(TD Deseret) $(TD Meetei Mayek) $(TD Unified Canadian Aboriginal Syllabics)) 616 $(TR $(TD Devanagari) $(TD Meetei Mayek Extensions) $(TD Unified Canadian Aboriginal Syllabics Extended)) 617 $(TR $(TD Devanagari Extended) $(TD Meroitic Cursive) $(TD Vai)) 618 $(TR $(TD Dingbats) $(TD Meroitic Hieroglyphs) $(TD Variation Selectors)) 619 $(TR $(TD Domino Tiles) $(TD 
Miao) $(TD Variation Selectors Supplement)) 620 $(TR $(TD Egyptian Hieroglyphs) $(TD Miscellaneous Mathematical Symbols-A) $(TD Vedic Extensions)) 621 $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B) $(TD Vertical Forms)) 622 $(TR $(TD Enclosed Alphanumerics) $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols)) 623 $(TR $(TD Enclosed Alphanumeric Supplement) $(TD Miscellaneous Symbols and Arrows) $(TD Yi Radicals)) 624 $(TR $(TD Enclosed CJK Letters and Months) $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables)) 625 $(TR $(TD Enclosed Ideographic Supplement) $(TD Miscellaneous Technical) ) 626 $(TR $(TD Ethiopic) $(TD Modifier Tone Letters) ) 627 ) 628 $(P Below is the table with script names accepted by $(LREF unicode.script) 629 and by the shorthand version $(LREF unicode):) 630 $(BOOKTABLE $(B Scripts), 631 $(TR $(TD Arabic) $(TD Hanunoo) $(TD Old_Italic)) 632 $(TR $(TD Armenian) $(TD Hebrew) $(TD Old_Persian)) 633 $(TR $(TD Avestan) $(TD Hiragana) $(TD Old_South_Arabian)) 634 $(TR $(TD Balinese) $(TD Imperial_Aramaic) $(TD Old_Turkic)) 635 $(TR $(TD Bamum) $(TD Inherited) $(TD Oriya)) 636 $(TR $(TD Batak) $(TD Inscriptional_Pahlavi) $(TD Osmanya)) 637 $(TR $(TD Bengali) $(TD Inscriptional_Parthian) $(TD Phags_Pa)) 638 $(TR $(TD Bopomofo) $(TD Javanese) $(TD Phoenician)) 639 $(TR $(TD Brahmi) $(TD Kaithi) $(TD Rejang)) 640 $(TR $(TD Braille) $(TD Kannada) $(TD Runic)) 641 $(TR $(TD Buginese) $(TD Katakana) $(TD Samaritan)) 642 $(TR $(TD Buhid) $(TD Kayah_Li) $(TD Saurashtra)) 643 $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi) $(TD Sharada)) 644 $(TR $(TD Carian) $(TD Khmer) $(TD Shavian)) 645 $(TR $(TD Chakma) $(TD Lao) $(TD Sinhala)) 646 $(TR $(TD Cham) $(TD Latin) $(TD Sora_Sompeng)) 647 $(TR $(TD Cherokee) $(TD Lepcha) $(TD Sundanese)) 648 $(TR $(TD Common) $(TD Limbu) $(TD Syloti_Nagri)) 649 $(TR $(TD Coptic) $(TD Linear_B) $(TD Syriac)) 650 $(TR $(TD Cuneiform) $(TD Lisu) $(TD Tagalog)) 651 $(TR $(TD Cypriot) $(TD Lycian) 
$(TD Tagbanwa)) 652 $(TR $(TD Cyrillic) $(TD Lydian) $(TD Tai_Le)) 653 $(TR $(TD Deseret) $(TD Malayalam) $(TD Tai_Tham)) 654 $(TR $(TD Devanagari) $(TD Mandaic) $(TD Tai_Viet)) 655 $(TR $(TD Egyptian_Hieroglyphs) $(TD Meetei_Mayek) $(TD Takri)) 656 $(TR $(TD Ethiopic) $(TD Meroitic_Cursive) $(TD Tamil)) 657 $(TR $(TD Georgian) $(TD Meroitic_Hieroglyphs) $(TD Telugu)) 658 $(TR $(TD Glagolitic) $(TD Miao) $(TD Thaana)) 659 $(TR $(TD Gothic) $(TD Mongolian) $(TD Thai)) 660 $(TR $(TD Greek) $(TD Myanmar) $(TD Tibetan)) 661 $(TR $(TD Gujarati) $(TD New_Tai_Lue) $(TD Tifinagh)) 662 $(TR $(TD Gurmukhi) $(TD Nko) $(TD Ugaritic)) 663 $(TR $(TD Han) $(TD Ogham) $(TD Vai)) 664 $(TR $(TD Hangul) $(TD Ol_Chiki) $(TD Yi)) 665 ) 666 $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).) 667 $(BOOKTABLE $(B Hangul syllable type), 668 $(TR $(TH Abb.) $(TH Long form)) 669 $(TR $(TD L) $(TD Leading_Jamo)) 670 $(TR $(TD LV) $(TD LV_Syllable)) 671 $(TR $(TD LVT) $(TD LVT_Syllable) ) 672 $(TR $(TD T) $(TD Trailing_Jamo)) 673 $(TR $(TD V) $(TD Vowel_Jamo)) 674 ) 675 References: 676 $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table), 677 $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia), 678 $(HTTP www.unicode.org, The Unicode Consortium), 679 $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms), 680 $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation) 681 $(HTTP www.unicode.org/uni2book/ch05.pdf, 682 Unicode Implementation Guidelines) 683 $(HTTP www.unicode.org/uni2book/ch03.pdf, 684 Unicode Conformance) 685 Trademarks: 686 Unicode(tm) is a trademark of Unicode, Inc. 687 688 Copyright: Copyright 2013 - 689 License: $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0). 
// Element-wise copy of `src` into `dest`, visiting indices from the last one
// down to index 0. Both slices must have the same length.
// Because the highest index is written first, this is the variant to use
// when `dest` overlaps `src` and begins after it.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
748 public enum dchar nelSep = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line. 749 750 // test the intro example 751 @safe unittest 752 { 753 import std.algorithm.searching : find; 754 // initialize code point sets using script/block or property name 755 // set contains code points from both scripts. 756 auto set = unicode("Cyrillic") | unicode("Armenian"); 757 // or simpler and statically-checked look 758 auto ascii = unicode.ASCII; 759 auto currency = unicode.Currency_Symbol; 760 761 // easy set ops 762 auto a = set & ascii; 763 assert(a.empty); // as it has no intersection with ascii 764 a = set | ascii; 765 auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian 766 767 // some properties of code point sets 768 assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2 769 // testing presence of a code point in a set 770 // is just fine, it is O(logN) 771 assert(!b['$']); 772 assert(!b['\u058F']); // Armenian dram sign 773 assert(b['¥']); 774 775 // building fast lookup tables, these guarantee O(1) complexity 776 // 1-level Trie lookup table essentially a huge bit-set ~262Kb 777 auto oneTrie = toTrie!1(b); 778 // 2-level far more compact but typically slightly slower 779 auto twoTrie = toTrie!2(b); 780 // 3-level even smaller, and a bit slower yet 781 auto threeTrie = toTrie!3(b); 782 assert(oneTrie['£']); 783 assert(twoTrie['£']); 784 assert(threeTrie['£']); 785 786 // build the trie with the most sensible trie level 787 // and bind it as a functor 788 auto cyrillicOrArmenian = toDelegate(set); 789 auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!"); 790 assert(balance == "ընկեր!"); 791 // compatible with bool delegate(dchar) 792 bool delegate(dchar) bindIt = cyrillicOrArmenian; 793 794 // Normalization 795 string s = "Plain ascii (and not only), is always normalized!"; 796 assert(s is normalize(s));// is the same string 797 798 string nonS = "A\u0308ffin"; // A ligature 799 auto nS = normalize(nonS); // to NFC, the W3C 
// Repeats `times` times the bit pattern stored in the low `bits` bits of
// `val`, laying the copies out contiguously starting at bit 0.
//
// Params:
//     times = how many copies of the pattern to produce (template argument)
//     bits  = width of one pattern in bits (template argument)
//     val   = the pattern itself; for bits == 1 any non-zero value counts as 1
// Returns: the replicated pattern as a size_t.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0; // all bits set, a shift by word size is invalid
        else
            // The shift must be done in size_t: `1 << times` is an int shift
            // and is rejected for times >= 32 on 64-bit targets.
            return val ? (size_t(1) << times)-1 : 0;
    }
    else static if (times % 2)
        // odd count: replicate times-1 copies, then append one more
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width and halve the count
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
// Length (in elements) of the n-th packed array, plus a setter that resizes
// that array inside the shared `storage` block, shifting every array that
// follows it and keeping `offsets` (measured in size_t words) in sync.
template length(size_t n)
{
    // Current number of elements in array n.
    @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

    // Resizes array n to `new_size` elements.
    // Growing appends zeroed words after array n; shrinking releases the
    // words array n no longer needs. Arrays n+1 .. dim-1 keep their contents.
    @property void length(size_t new_size)
    {
        if (new_size > sz[n])
        {// extend
            size_t delta = (new_size - sz[n]);
            sz[n] += delta;
            delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
            storage.length += delta;// extend space at end
            // raw_slice!x must follow resize as it could be moved!
            // next stmts move all data past this array, last-one-goes-first
            static if (n != dim-1)
            {
                auto start = raw_ptr!(n+1);
                // len includes delta
                size_t len = (storage.ptr+storage.length-start);

                copyBackwards(start[0 .. len-delta], start[delta .. len]);

                start[0 .. delta] = 0;
                // offsets are used for raw_slice, ptr etc.
                foreach (i; n+1 .. dim)
                    offsets[i] += delta;
            }
        }
        else if (new_size < sz[n])
        {// shrink
            sz[n] = new_size;
            // Word range [first, next) currently owned by array n.
            // raw_ptr!0 ignores offsets[0], so array 0 always starts at word 0.
            static if (n == 0)
                enum size_t first = 0;
            else
                immutable size_t first = offsets[n];
            static if (n != dim-1)
                immutable size_t next = offsets[n+1];
            else
                immutable size_t next = storage.length;
            immutable size_t owned = next - first;
            // Words still required for the remaining elements. Releasing
            // spaceFor(removed elements) instead could cut into a word that
            // still holds live elements.
            immutable size_t needed = spaceFor!(bitSizeOf!(Types[n]))(new_size);
            if (owned > needed)
            {
                immutable size_t delta = owned - needed;
                static if (n != dim-1)
                {
                    // Slide all following arrays down by delta words.
                    // Destination precedes source, so a forward copy is safe.
                    copyForward(storage[next .. $],
                        storage[next - delta .. $ - delta]);

                    // adjust offsets last, they affect raw_slice
                    foreach (i; n+1 .. dim)
                        offsets[i] -= delta;
                }
                storage.length -= delta;
            }
        }
        // else - NOP
    }
}
n) 1001 assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n])); 1002 } 1003 1004 static void checkB(size_t k, T)(ref T m, int n) 1005 { 1006 foreach (i; 0 .. n) 1007 assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n])); 1008 } 1009 1010 static void fill(size_t k, T)(ref T m, int n) 1011 { 1012 foreach (i; 0 .. n) 1013 m.slice!(k)[i] = force!ubyte(i+1); 1014 } 1015 1016 static void fillB(size_t k, T)(ref T m, int n) 1017 { 1018 foreach (i; 0 .. n) 1019 m.slice!(k)[i] = force!ubyte(n-i); 1020 } 1021 1022 m.length!1 = 100; 1023 fill!1(m, 100); 1024 check!1(m, 100); 1025 1026 m.length!0 = 220; 1027 fill!0(m, 220); 1028 check!1(m, 100); 1029 check!0(m, 220); 1030 1031 m.length!2 = 17; 1032 fillB!2(m, 17); 1033 checkB!2(m, 17); 1034 check!0(m, 220); 1035 check!1(m, 100); 1036 1037 m.length!2 = 33; 1038 checkB!2(m, 17); 1039 fillB!2(m, 33); 1040 checkB!2(m, 33); 1041 check!0(m, 220); 1042 check!1(m, 100); 1043 1044 m.length!1 = 195; 1045 fillB!1(m, 195); 1046 checkB!1(m, 195); 1047 checkB!2(m, 33); 1048 check!0(m, 220); 1049 1050 auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10); 1051 marr.length!0 = 15; 1052 marr.length!1 = 30; 1053 fill!1(marr, 30); 1054 fill!0(marr, 15); 1055 check!1(marr, 30); 1056 check!0(marr, 15); 1057 return 0; 1058 }; 1059 enum ct = dg(); 1060 auto rt = dg(); 1061 } 1062 1063 @system unittest 1064 {// more bitpacking tests 1065 import std.conv : text; 1066 1067 alias Bitty = 1068 MultiArray!(BitPacked!(size_t, 3) 1069 , BitPacked!(size_t, 4) 1070 , BitPacked!(size_t, 3) 1071 , BitPacked!(size_t, 6) 1072 , bool); 1073 alias fn1 = sliceBits!(13, 16); 1074 alias fn2 = sliceBits!( 9, 13); 1075 alias fn3 = sliceBits!( 6, 9); 1076 alias fn4 = sliceBits!( 0, 6); 1077 static void check(size_t lvl, MA)(ref MA arr){ 1078 for (size_t i = 0; i< arr.length!lvl; i++) 1079 assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i])); 1080 } 1081 1082 
// Computes how many size_t words are needed to store `new_len` items of
// `_bits` bits each. Items occupy power-of-two sized slots (see
// PackedArrayView), so `_bits` is first rounded up to a power of two.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;

    enum size_t wordBits = 8 * size_t.sizeof;
    static if (_bits == 1)
        enum size_t slot = 1;
    else
        enum size_t slot = nextPow2(_bits - 1);

    static if (slot > wordBits)
    {
        // an item spans a whole number of words
        static assert(slot % wordBits == 0);
        return new_len * slot / wordBits;
    }
    else
    {
        // several items share one word; round the word count up
        enum size_t perWord = wordBits / slot;
        return (new_len + perWord - 1) / perWord;
    }
}
nextPow2(bits - 1) : 1); 1168 } 1169 1170 struct PackedPtrImpl(T, size_t bits) 1171 { 1172 pure nothrow: 1173 static assert(isPow2OrZero(bits)); 1174 1175 this(inout(size_t)* ptr)inout @safe @nogc 1176 { 1177 origin = ptr; 1178 } 1179 1180 private T simpleIndex(size_t n) inout 1181 { 1182 immutable q = n / factor; 1183 immutable r = n % factor; 1184 return cast(T)((origin[q] >> bits*r) & mask); 1185 } 1186 1187 private void simpleWrite(TypeOfBitPacked!T val, size_t n) 1188 in 1189 { 1190 static if (isIntegral!T) 1191 assert(val <= mask); 1192 } 1193 do 1194 { 1195 immutable q = n / factor; 1196 immutable r = n % factor; 1197 immutable tgt_shift = bits*r; 1198 immutable word = origin[q]; 1199 origin[q] = (word & ~(mask << tgt_shift)) 1200 | (cast(size_t) val << tgt_shift); 1201 } 1202 1203 static if (factor == bytesPerWord// can safely pack by byte 1204 || factor == 1 // a whole word at a time 1205 || ((factor == bytesPerWord/2 || factor == bytesPerWord/4) 1206 && hasUnalignedReads)) // this needs unaligned reads 1207 { 1208 static if (factor == bytesPerWord) 1209 alias U = ubyte; 1210 else static if (factor == bytesPerWord/2) 1211 alias U = ushort; 1212 else static if (factor == bytesPerWord/4) 1213 alias U = uint; 1214 else static if (size_t.sizeof == 8 && factor == bytesPerWord/8) 1215 alias U = ulong; 1216 1217 T opIndex(size_t idx) inout 1218 { 1219 T ret; 1220 version (LittleEndian) 1221 ret = __ctfe ? 
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view is described by `ofs` (index of its first element relative to the
// start of the underlying storage) and `limit` (its number of elements).
// All public indices are relative to the view.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // Creates a view of `items` elements that begins `offset` elements past
    // the start of the packed storage at `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns true when every element with view index in [s, e) is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // The range ends at or before the first word boundary: check it
        // element by element. Testing `s >= e` here would let the head loop
        // below run up to the word boundary, i.e. past `e`.
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // unaligned head
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // unaligned tail
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Reads element `idx` of the view.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Writes `val` to element `idx` of the view.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Fills elements [start, end) of the view with `val`, writing whole
    // words for the word-aligned middle part.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for ofsetted view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Returns a sub-view covering elements [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `to` is relative to this view, so it is bounded by `limit` alone;
        // adding `ofs` would reject valid slices of views with ofs != 0.
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    // Returns a view over all elements of this view.
    auto opSlice(){ return opSlice(0, length); }

    // Element-wise equality; compares whole words when both views are
    // word-aligned and their length is a multiple of `factor`.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
            return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    // Number of elements in the view.
    @property size_t length()const{ return limit; }

private:
    // round an element index up / down to a multiple of `factor`
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
// Builds a SliceOverIndexed over the read-only container pointed to by `x`,
// with `a` and `b` stored as its `from` and `to` bounds.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    alias View = typeof(return);
    return View(a, b, x);
}
// Mutable flavour of sliceOverIndexed: `a` and `b` become the `from` and `to`
// bounds of a SliceOverIndexed over the container pointed to by `x`.
// BUG? a single inout overload is out of reach: the pointer ends up in
// SliceOverIndexed.arr, and only parameters or stack based variables can be inout.
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    alias View = typeof(return);
    return View(a, b, x);
}
// Generates the source of an unrolled binary-search tail, meant to be mixed
// into a function that has `range`, `needle`, `pred`, `idx` and `m` in scope.
// The emitted code switches on the bit length of the current step `m` and
// falls through one `case` per halving step, ending with the check of
// `range[idx]` itself in `case 0`.
//
// Params:
//     size = upper bound for the step handled by the switch; must be a
//            power of two (or zero, which emits only `case 0`)
// Returns: D source code as a string.
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // bsr is undefined for a zero argument. The emitted code must guard it
    // too: a one-element range arrives here with a step of zero and has to
    // land in `case 0`.
    string code = `
    import core.bitop : bsr;
    auto power = (m != 0) ? bsr(m)+1 : 0;
    switch (power){`;
    immutable top = size ? bsr(size) : 0;
    size_t i = top;
    foreach_reverse (val; 0 .. top)
    {
        auto v = 2^^val;
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
// Lifts a lower-bound search that only works on power-of-two sized ranges
// (`uniLowerBound`) to ranges of arbitrary length, using Shar's method:
// probe the last element of the largest power-of-two prefix, then search
// either that prefix or a power-of-two sized window covering the tail.
//
// Params:
//     _pred  = ordering predicate, "a<b" by default
//     range  = random-access range sorted with respect to `_pred`
//     needle = value to locate
// Returns: the number of leading elements `e` of `range` for which
//          `_pred(e, needle)` holds.
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
    if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        import std.math.traits : isPowerOf2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        // length is non-zero here, so this is the exact power-of-two case
        if (isPowerOf2(range.length))
            return uniLowerBound!pred(range, needle);
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            size_t k = nextPow2(range.length - n + 1);
            // nextPow2 returns a power strictly greater than its argument,
            // so for lengths of the form 2^^j-1 it yields 2*n > length.
            // A window of n elements already covers the tail (its length is
            // below n) and always fits.
            if (k > range.length)
                k = n;
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}
// Replaces dest[from .. to] with the contents of `stuff`, resizing `dest`
// when the replacement differs in length from the replaced span.
// With Policy == void the resize goes through `dest.length`; otherwise
// through Policy.realloc.
// Returns: the index just past the inserted data, i.e. from + stuff.length.
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;

    immutable size_t removed = to - from;
    immutable size_t inserted = stuff.length;
    immutable size_t stuff_end = from + inserted;

    if (inserted == removed)
    {
        // same size: overwrite in place
        copy(stuff, dest[from .. to]);
        return stuff_end;
    }

    if (inserted > removed)
    {// replace increases length
        immutable size_t growth = inserted - removed;
        immutable size_t grown = dest.length + growth;
        static if (is(Policy == void))
            dest.length = grown;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, grown);
        // open the gap by moving the tail up, last element first
        copyBackwards(dest[to .. dest.length-growth],
            dest[to+growth .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
        return stuff_end;
    }

    // replace decreases length
    immutable size_t shrinkage = removed - inserted;
    copy(stuff, dest[from .. stuff_end]);
    // close the gap by moving the tail down, first element first
    copyForward(dest[to .. dest.length],
        dest[stuff_end .. dest.length-shrinkage]);
    immutable size_t shrunk = dest.length - shrinkage;
    static if (is(Policy == void))
        dest.length = shrunk;//@@@BUG lame @property
    else
        dest = Policy.realloc(dest, shrunk);
    return stuff_end;
}
cast(T*) enforceMalloc(nbytes); 1821 return ptr[0 .. size]; 1822 } 1823 1824 static T[] realloc(T)(return scope T[] arr, size_t size) @trusted 1825 { 1826 import std.internal.memory : enforceRealloc; 1827 if (!size) 1828 { 1829 destroy(arr); 1830 return null; 1831 } 1832 1833 import core.checkedint : mulu; 1834 bool overflow; 1835 size_t nbytes = mulu(size, T.sizeof, overflow); 1836 if (overflow) assert(0); 1837 1838 auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes); 1839 return ptr[0 .. size]; 1840 } 1841 1842 static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff) 1843 { 1844 genericReplace!(ReallocPolicy)(dest, from, to, stuff); 1845 } 1846 1847 static void append(T, V)(ref T[] arr, V value) 1848 if (!isInputRange!V) 1849 { 1850 if (arr.length == size_t.max) assert(0); 1851 arr = realloc(arr, arr.length+1); 1852 arr[$-1] = force!T(value); 1853 } 1854 1855 pure @safe unittest 1856 { 1857 int[] arr; 1858 ReallocPolicy.append(arr, 3); 1859 1860 import std.algorithm.comparison : equal; 1861 assert(equal(arr, [3])); 1862 } 1863 1864 static void append(T, V)(ref T[] arr, V value) 1865 if (isInputRange!V && hasLength!V) 1866 { 1867 import core.checkedint : addu; 1868 bool overflow; 1869 size_t nelems = addu(arr.length, value.length, overflow); 1870 if (overflow) assert(0); 1871 1872 arr = realloc(arr, nelems); 1873 1874 import std.algorithm.mutation : copy; 1875 copy(value, arr[$-value.length..$]); 1876 } 1877 1878 pure @safe unittest 1879 { 1880 int[] arr; 1881 ReallocPolicy.append(arr, [1,2,3]); 1882 1883 import std.algorithm.comparison : equal; 1884 assert(equal(arr, [1,2,3])); 1885 } 1886 1887 static void destroy(T)(scope ref T[] arr) @trusted 1888 { 1889 import core.memory : pureFree; 1890 if (arr.ptr) 1891 pureFree(arr.ptr); 1892 arr = null; 1893 } 1894 } 1895 1896 //build hack 1897 alias _RealArray = CowArray!ReallocPolicy; 1898 1899 pure @safe unittest 1900 { 1901 import std.algorithm.comparison : equal; 1902 1903 
with(ReallocPolicy) 1904 { 1905 bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result, 1906 string file = __FILE__, size_t line = __LINE__) 1907 { 1908 { 1909 replaceImpl(orig, from, to, toReplace); 1910 scope(exit) destroy(orig); 1911 if (!equal(orig, result)) 1912 return false; 1913 } 1914 return true; 1915 } 1916 static T[] arr(T)(T[] args... ) 1917 { 1918 return dup(args); 1919 } 1920 1921 assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4])); 1922 assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4])); 1923 assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7])); 1924 assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4])); 1925 assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4])); 1926 } 1927 } 1928 1929 /** 1930 Tests if T is some kind a set of code points. Intended for template constraints. 1931 */ 1932 public template isCodepointSet(T) 1933 { 1934 static if (is(T dummy == InversionList!(Args), Args...)) 1935 enum isCodepointSet = true; 1936 else 1937 enum isCodepointSet = false; 1938 } 1939 1940 /** 1941 Tests if `T` is a pair of integers that implicitly convert to `V`. 1942 The following code must compile for any pair `T`: 1943 --- 1944 (T x){ V a = x[0]; V b = x[1];} 1945 --- 1946 The following must not compile: 1947 --- 1948 (T x){ V c = x[2];} 1949 --- 1950 */ 1951 public template isIntegralPair(T, V=uint) 1952 { 1953 enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];})) 1954 && !is(typeof((T x){ V c = x[2]; })); 1955 } 1956 1957 1958 /** 1959 The recommended default type for set of $(CODEPOINTS). 1960 For details, see the current implementation: $(LREF InversionList). 
1961 */ 1962 public alias CodepointSet = InversionList!GcPolicy; 1963 1964 1965 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin 1966 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error 1967 // hence below doesn't seem to work 1968 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b"); 1969 1970 /** 1971 The recommended type of $(REF Tuple, std,_typecons) 1972 to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList). 1973 Any interval type should pass $(LREF isIntegralPair) trait. 1974 */ 1975 public struct CodepointInterval 1976 { 1977 pure: 1978 uint[2] _tuple; 1979 alias _tuple this; 1980 1981 @safe pure nothrow @nogc: 1982 1983 this(uint low, uint high) 1984 { 1985 _tuple[0] = low; 1986 _tuple[1] = high; 1987 } 1988 bool opEquals(T)(T val) const 1989 { 1990 return this[0] == val[0] && this[1] == val[1]; 1991 } 1992 @property ref inout(uint) a() return inout { return _tuple[0]; } 1993 @property ref inout(uint) b() return inout { return _tuple[1]; } 1994 } 1995 1996 /** 1997 $(P 1998 `InversionList` is a set of $(CODEPOINTS) 1999 represented as an array of open-right [a, b$(RPAREN) 2000 intervals (see $(LREF CodepointInterval) above). 2001 The name comes from the way the representation reads left to right. 2002 For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN), 2003 plus a singular value 60 looks like this: 2004 ) 2005 --- 2006 10, 50, 60, 61, 80, 90 2007 --- 2008 $(P 2009 The way to read this is: start with negative meaning that all numbers 2010 smaller then the next one are not present in this set (and positive - 2011 the contrary). Then switch positive/negative after each 2012 number passed from left to right. 2013 ) 2014 $(P This way negative spans until 10, then positive until 50, 2015 then negative until 60, then positive until 61, and so on. 
2016 As seen this provides a space-efficient storage of highly redundant data 2017 that comes in long runs. A description which Unicode $(CHARACTER) 2018 properties fit nicely. The technique itself could be seen as a variation 2019 on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding). 2020 ) 2021 2022 $(P Sets are value types (just like `int` is) thus they 2023 are never aliased. 2024 ) 2025 Example: 2026 --- 2027 auto a = CodepointSet('a', 'z'+1); 2028 auto b = CodepointSet('A', 'Z'+1); 2029 auto c = a; 2030 a = a | b; 2031 assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1)); 2032 assert(a != c); 2033 --- 2034 $(P See also $(LREF unicode) for simpler construction of sets 2035 from predefined ones. 2036 ) 2037 2038 $(P Memory usage is 8 bytes per each contiguous interval in a set. 2039 The value semantics are achieved by using the 2040 $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique 2041 and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared). 2042 ) 2043 2044 Note: 2045 $(P It's not recommended to rely on the template parameters 2046 or the exact type of a current $(CODEPOINT) set in `std.uni`. 2047 The type and parameters may change when the standard 2048 allocators design is finalized. 2049 Use $(LREF isCodepointSet) with templates or just stick with the default 2050 alias $(LREF CodepointSet) throughout the whole code base. 2051 ) 2052 */ 2053 public struct InversionList(SP=GcPolicy) 2054 { 2055 import std.range : assumeSorted; 2056 2057 /** 2058 Construct from another code point set of any type. 2059 */ 2060 this(Set)(Set set) pure 2061 if (isCodepointSet!Set) 2062 { 2063 uint[] arr; 2064 foreach (v; set.byInterval) 2065 { 2066 arr ~= v.a; 2067 arr ~= v.b; 2068 } 2069 data = CowArray!(SP).reuse(arr); 2070 } 2071 2072 /** 2073 Construct a set from a forward range of code point intervals. 
*/
this(Range)(Range intervals) pure
if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
{
    uint[] arr;
    foreach (v; intervals)
    {
        // Read the bounds by index rather than through `.a`/`.b`: the template
        // constraint only guarantees that `v[0]` and `v[1]` convert to `uint`,
        // so this also accepts pair types that have no `a`/`b` members.
        uint low = v[0];
        uint high = v[1];
        SP.append(arr, low);
        SP.append(arr, high);
    }
    data = CowArray!(SP).reuse(arr);
    sanitize(); //enforce invariant: sort intervals etc.
}

//helper function that avoids sanity check to be CTFE-friendly
// The bounds are interleaved as a[0], a[1] of each interval in turn and stored
// as-is; `sanitize` is not called, so the caller must supply sorted,
// non-overlapping intervals.
private static fromIntervals(Range)(Range intervals) pure
{
    import std.algorithm.iteration : map;
    import std.range : roundRobin;
    auto flattened = roundRobin(intervals.save.map!"a[0]"(),
        intervals.save.map!"a[1]"());
    InversionList set;
    set.data = CowArray!(SP)(flattened);
    return set;
}
//ditto until sort is CTFE-able
private static fromIntervals()(uint[] intervals...) pure
in
{
    import std.conv : text;
    assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
    for (uint i = 0; i < intervals.length; i += 2)
    {
        auto a = intervals[i], b = intervals[i+1];
        // empty intervals (a == b) are rejected as well, hence ">="
        assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
    }
}
do
{
    InversionList set;
    set.data = CowArray!(SP)(intervals);
    return set;
}

/**
    Construct a set from plain values of code point intervals.

    The arguments are consumed pairwise as `a0, b0, a1, b1, ...`, each pair
    denoting the open-right interval [a, b$(RPAREN). The number of arguments
    must be even and every pair must satisfy `a < b` (checked by the `in`
    contract).
*/
this()(uint[] intervals...)
in
{
    import std.conv : text;
    assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
    for (uint i = 0; i < intervals.length; i += 2)
    {
        auto a = intervals[i], b = intervals[i+1];
        // empty intervals (a == b) are rejected as well, hence ">="
        assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
    }
}
do
{
    data = CowArray!(SP)(intervals);
    sanitize(); //enforce invariant: sort intervals etc.
}

///
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
    foreach (v; 'a'..'z'+1)
        assert(set[v]);
    // Cyrillic lowercase interval
    foreach (v; 'а'..'я'+1)
        assert(set[v]);
    //specific order is not required, intervals may intersect
    auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
    //the same end result
    assert(set2.byInterval.equal(set.byInterval));
    // test constructor this(Range)(Range intervals)
    auto chessPiecesWhite = CodepointInterval(9812, 9818);
    auto chessPiecesBlack = CodepointInterval(9818, 9824);
    auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
    foreach (v; '♔'..'♟'+1)
        assert(set3[v]);
}

/**
    Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
*/
@property auto byInterval() scope
{
    // TODO: change this to data[] once the -dip1000 errors have been fixed
    // see e.g. https://github.com/dlang/phobos/pull/6638
    import std.array : array;
    return Intervals!(typeof(data.array))(data.array);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);

    assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
}

// All intervals of this set, materialized into an array.
package(std) @property const(CodepointInterval)[] intervals() const
{
    import std.array : array;
    return Intervals!(typeof(data[]))(data[]).array;
}

/**
    Tests the presence of code point `val` in this set.
2190 */ 2191 bool opIndex(uint val) const 2192 { 2193 // the <= ensures that searching in interval of [a, b) for 'a' you get .length == 1 2194 // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1; 2195 return sharSwitchLowerBound!"a <= b"(data[], val) & 1; 2196 } 2197 2198 /// 2199 pure @safe unittest 2200 { 2201 auto gothic = unicode.Gothic; 2202 // Gothic letter ahsa 2203 assert(gothic['\U00010330']); 2204 // no ascii in Gothic obviously 2205 assert(!gothic['$']); 2206 } 2207 2208 2209 // Linear scan for `ch`. Useful only for small sets. 2210 // TODO: 2211 // used internally in std.regex 2212 // should be properly exposed in a public API ? 2213 package(std) auto scanFor()(dchar ch) const 2214 { 2215 immutable len = data.length; 2216 for (size_t i = 0; i < len; i++) 2217 if (ch < data[i]) 2218 return i & 1; 2219 return 0; 2220 } 2221 2222 /// Number of $(CODEPOINTS) in this set 2223 @property size_t length() 2224 { 2225 size_t sum = 0; 2226 foreach (iv; byInterval) 2227 { 2228 sum += iv.b - iv.a; 2229 } 2230 return sum; 2231 } 2232 2233 // bootstrap full set operations from 4 primitives (suitable as a template mixin): 2234 // addInterval, skipUpTo, dropUpTo & byInterval iteration 2235 //============================================================================ 2236 public: 2237 /** 2238 $(P Sets support natural syntax for set algebra, namely: ) 2239 $(BOOKTABLE , 2240 $(TR $(TH Operator) $(TH Math notation) $(TH Description) ) 2241 $(TR $(TD &) $(TD a ∩ b) $(TD intersection) ) 2242 $(TR $(TD |) $(TD a ∪ b) $(TD union) ) 2243 $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) ) 2244 $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. 
(a ∪ b) \ (a ∩ b)) ) 2245 ) 2246 */ 2247 This opBinary(string op, U)(U rhs) 2248 if (isCodepointSet!U || is(U:dchar)) 2249 { 2250 static if (op == "&" || op == "|" || op == "~") 2251 {// symmetric ops thus can swap arguments to reuse r-value 2252 static if (is(U:dchar)) 2253 { 2254 auto tmp = this; 2255 mixin("tmp "~op~"= rhs; "); 2256 return tmp; 2257 } 2258 else 2259 { 2260 static if (is(Unqual!U == U)) 2261 { 2262 // try hard to reuse r-value 2263 mixin("rhs "~op~"= this;"); 2264 return rhs; 2265 } 2266 else 2267 { 2268 auto tmp = this; 2269 mixin("tmp "~op~"= rhs;"); 2270 return tmp; 2271 } 2272 } 2273 } 2274 else static if (op == "-") // anti-symmetric 2275 { 2276 auto tmp = this; 2277 tmp -= rhs; 2278 return tmp; 2279 } 2280 else 2281 static assert(0, "no operator "~op~" defined for Set"); 2282 } 2283 2284 /// 2285 pure @safe unittest 2286 { 2287 import std.algorithm.comparison : equal; 2288 import std.range : iota; 2289 2290 auto lower = unicode.LowerCase; 2291 auto upper = unicode.UpperCase; 2292 auto ascii = unicode.ASCII; 2293 2294 assert((lower & upper).empty); // no intersection 2295 auto lowerASCII = lower & ascii; 2296 assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1))); 2297 // throw away all of the lowercase ASCII 2298 assert((ascii - lower).length == 128 - 26); 2299 2300 auto onlyOneOf = lower ~ ascii; 2301 assert(!onlyOneOf['Δ']); // not ASCII and not lowercase 2302 assert(onlyOneOf['$']); // ASCII and not lowercase 2303 assert(!onlyOneOf['a']); // ASCII and lowercase 2304 assert(onlyOneOf['я']); // not ASCII but lowercase 2305 2306 // throw away all cased letters from ASCII 2307 auto noLetters = ascii - (lower | upper); 2308 assert(noLetters.length == 128 - 26*2); 2309 } 2310 2311 /// The 'op=' versions of the above overloaded operators. 
ref This opOpAssign(string op, U)(U rhs)
if (isCodepointSet!U || is(U:dchar))
{
    static if (op == "|")    // union
    {
        static if (is(U:dchar))
        {
            // a single code point is the one-element interval [rhs, rhs+1)
            this.addInterval(rhs, rhs+1);
            return this;
        }
        else
            return this.add(rhs);
    }
    else static if (op == "&")   // intersection
        return this.intersect(rhs);// overloaded
    else static if (op == "-")   // set difference
        return this.sub(rhs);// overloaded
    else static if (op == "~")   // symmetric set difference
    {
        // (this | rhs) - (this & rhs); the intersection has to be taken
        // before `this` is modified by the union
        auto copy = this & rhs;
        this |= rhs;
        this -= copy;
        return this;
    }
    else
        static assert(0, "no operator "~op~" defined for Set");
}

/**
    Tests the presence of codepoint `ch` in this set,
    the same as $(LREF opIndex).
*/
bool opBinaryRight(string op: "in", U)(U ch) const
if (is(U : dchar))
{
    return this[ch];
}

///
pure @safe unittest
{
    assert('я' in unicode.Cyrillic);
    assert(!('z' in unicode.Cyrillic));
}



/**
 * Obtains a set that is the inversion of this set.
 *
 * See_Also: $(LREF inverted)
 */
auto opUnary(string op: "!")()
{
    return this.inverted;
}

/**
    A range that spans each $(CODEPOINT) in this set.
*/
@property auto byCodepoint()
{
    static struct CodepointRange
    {
        this(This set)
        {
            r = set.byInterval;
            if (!r.empty)
                cur = r.front.a;
        }

        @property dchar front() const
        {
            return cast(dchar) cur;
        }

        @property bool empty() const
        {
            return r.empty;
        }

        void popFront()
        {
            cur++;
            // once `cur` reaches the open end of the current interval,
            // drop that interval and restart at the beginning of the next one
            while (cur >= r.front.b)
            {
                r.popFront();
                if (r.empty)
                    break;
                cur = r.front.a;
            }
        }
    private:
        uint cur;                       // code point returned by `front`
        typeof(This.init.byInterval) r; // intervals not yet exhausted
    }

    return CodepointRange(this);
}

///
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range : iota;

    auto set = unicode.ASCII;
    assert(set.byCodepoint.equal(iota(0, 0x80)));
}

/**
    $(P Obtain textual representation of this set in form of
    open-right intervals and feed it to `sink`.
    )
    $(P Used by various standard formatting facilities such as
     $(REF formattedWrite, std,format), $(REF write, std,stdio),
     $(REF writef, std,stdio), $(REF to, std,conv) and others.
    )
    Example:
    ---
    import std.conv;
    assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
    ---
*/

private import std.format.spec : FormatSpec;

/***************************************
 * Obtain a textual representation of this InversionList
 * in form of open-right intervals.
 *
 * The formatting flag is applied individually to each value, for example:
 * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
 * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
 * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
 *
 * Intervals are written as `[a..b)` and separated by a single space;
 * nothing is written for an empty set.
 */
void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
{
    import std.format.write : formatValue;
    auto range = byInterval;
    if (range.empty)
        return;

    while (1)
    {
        auto i = range.front;
        range.popFront();

        put(sink, "[");
        formatValue(sink, i.a, fmt);
        put(sink, "..");
        formatValue(sink, i.b, fmt);
        put(sink, ")");
        // the separator goes between intervals only, never after the last one
        if (range.empty) return;
        put(sink, " ");
    }
}

///
pure @safe unittest
{
    import std.conv : to;
    import std.format : format;
    import std.uni : unicode;

    // This was originally using Cyrillic script.
    // Unfortunately this is a pretty active range for changes,
    // and hence broke in an update.
    // Therefore the range Basic latin was used instead as it
    // is unlikely to ever change.

    assert(unicode.InBasic_latin.to!string == "[0..128)");

    // The specs '%s' and '%d' are equivalent to the to!string call above.
    assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string);

    assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)");
    assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)");
}

pure @safe unittest
{
    import std.exception : assertThrown;
    import std.format : format, FormatException;
    assertThrown!FormatException(format("%z", unicode.ASCII));
}


/**
    Add an interval [a, b$(RPAREN) to this set.
2502 */ 2503 ref add()(uint a, uint b) 2504 { 2505 addInterval(a, b); 2506 return this; 2507 } 2508 2509 /// 2510 pure @safe unittest 2511 { 2512 CodepointSet someSet; 2513 someSet.add('0', '5').add('A','Z'+1); 2514 someSet.add('5', '9'+1); 2515 assert(someSet['0']); 2516 assert(someSet['5']); 2517 assert(someSet['9']); 2518 assert(someSet['Z']); 2519 } 2520 2521 private: 2522 2523 package(std) // used from: std.regex.internal.parser 2524 ref intersect(U)(U rhs) 2525 if (isCodepointSet!U) 2526 { 2527 Marker mark; 2528 foreach ( i; rhs.byInterval) 2529 { 2530 mark = this.dropUpTo(i.a, mark); 2531 mark = this.skipUpTo(i.b, mark); 2532 } 2533 this.dropUpTo(uint.max, mark); 2534 return this; 2535 } 2536 2537 ref intersect()(dchar ch) 2538 { 2539 foreach (i; byInterval) 2540 if (i.a <= ch && ch < i.b) 2541 return this = This.init.add(ch, ch+1); 2542 this = This.init; 2543 return this; 2544 } 2545 2546 pure @safe unittest 2547 { 2548 assert(unicode.Cyrillic.intersect('-').byInterval.empty); 2549 } 2550 2551 ref sub()(dchar ch) 2552 { 2553 return subChar(ch); 2554 } 2555 2556 // same as the above except that skip & drop parts are swapped 2557 package(std) // used from: std.regex.internal.parser 2558 ref sub(U)(U rhs) 2559 if (isCodepointSet!U) 2560 { 2561 Marker mark; 2562 foreach (i; rhs.byInterval) 2563 { 2564 mark = this.skipUpTo(i.a, mark); 2565 mark = this.dropUpTo(i.b, mark); 2566 } 2567 return this; 2568 } 2569 2570 package(std) // used from: std.regex.internal.parse 2571 ref add(U)(U rhs) 2572 if (isCodepointSet!U) 2573 { 2574 Marker start; 2575 foreach (i; rhs.byInterval) 2576 { 2577 start = addInterval(i.a, i.b, start); 2578 } 2579 return this; 2580 } 2581 2582 // end of mixin-able part 2583 //============================================================================ 2584 public: 2585 /** 2586 Obtains a set that is the inversion of this set. 2587 2588 See the '!' $(LREF opUnary) for the same but using operators. 
2589 */ 2590 @property auto inverted() 2591 { 2592 InversionList inversion = this; 2593 if (inversion.data.length == 0) 2594 { 2595 inversion.addInterval(0, lastDchar+1); 2596 return inversion; 2597 } 2598 if (inversion.data[0] != 0) 2599 genericReplace(inversion.data, 0, 0, [0]); 2600 else 2601 genericReplace(inversion.data, 0, 1, cast(uint[]) null); 2602 if (data[data.length-1] != lastDchar+1) 2603 genericReplace(inversion.data, 2604 inversion.data.length, inversion.data.length, [lastDchar+1]); 2605 else 2606 genericReplace(inversion.data, 2607 inversion.data.length-1, inversion.data.length, cast(uint[]) null); 2608 2609 return inversion; 2610 } 2611 2612 /// 2613 pure @safe unittest 2614 { 2615 auto set = unicode.ASCII; 2616 // union with the inverse gets all of the code points in the Unicode 2617 assert((set | set.inverted).length == 0x110000); 2618 // no intersection with the inverse 2619 assert((set & set.inverted).empty); 2620 } 2621 2622 package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName) 2623 { 2624 import std.algorithm.searching : countUntil; 2625 import std.format : format; 2626 enum maxBinary = 3; 2627 static string linearScope(R)(R ivals, string indent) 2628 { 2629 string result = indent~"{\n"; 2630 string deeper = indent~" "; 2631 foreach (ival; ivals) 2632 { 2633 immutable span = ival[1] - ival[0]; 2634 assert(span != 0); 2635 if (span == 1) 2636 { 2637 result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]); 2638 } 2639 else if (span == 2) 2640 { 2641 result ~= format("%sif (ch == %s || ch == %s) return true;\n", 2642 deeper, ival[0], ival[0]+1); 2643 } 2644 else 2645 { 2646 if (ival[0] != 0) // dchar is unsigned and < 0 is useless 2647 result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]); 2648 result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]); 2649 } 2650 } 2651 result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals 2652 
return result; 2653 } 2654 2655 static string binaryScope(R)(R ivals, string indent) @safe 2656 { 2657 // time to do unrolled comparisons? 2658 if (ivals.length < maxBinary) 2659 return linearScope(ivals, indent); 2660 else 2661 return bisect(ivals, ivals.length/2, indent); 2662 } 2663 2664 // not used yet if/elsebinary search is far better with DMD as of 2.061 2665 // and GDC is doing fine job either way 2666 static string switchScope(R)(R ivals, string indent) 2667 { 2668 string result = indent~"switch (ch){\n"; 2669 string deeper = indent~" "; 2670 foreach (ival; ivals) 2671 { 2672 if (ival[0]+1 == ival[1]) 2673 { 2674 result ~= format("%scase %s: return true;\n", 2675 deeper, ival[0]); 2676 } 2677 else 2678 { 2679 result ~= format("%scase %s: .. case %s: return true;\n", 2680 deeper, ival[0], ival[1]-1); 2681 } 2682 } 2683 result ~= deeper~"default: return false;\n"~indent~"}\n"; 2684 return result; 2685 } 2686 2687 static string bisect(R)(R range, size_t idx, string indent) 2688 { 2689 string deeper = indent ~ " "; 2690 // bisect on one [a, b) interval at idx 2691 string result = indent~"{\n"; 2692 // less branch, < a 2693 result ~= format("%sif (ch < %s)\n%s", 2694 deeper, range[idx][0], binaryScope(range[0 .. idx], deeper)); 2695 // middle point, >= a && < b 2696 result ~= format("%selse if (ch < %s) return true;\n", 2697 deeper, range[idx][1]); 2698 // greater or equal branch, >= b 2699 result ~= format("%selse\n%s", 2700 deeper, binaryScope(range[idx+1..$], deeper)); 2701 return result~indent~"}\n"; 2702 } 2703 2704 string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n", 2705 funcName.empty ? 
"function" : funcName); 2706 // special case first bisection to be on ASCII vs beyond 2707 auto tillAscii = countUntil!"a[0] > 0x80"(range); 2708 if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0) 2709 code ~= binaryScope(range, ""); 2710 else 2711 code ~= bisect(range, tillAscii, ""); 2712 return code; 2713 } 2714 2715 /** 2716 Generates string with D source code of unary function with name of 2717 `funcName` taking a single `dchar` argument. If `funcName` is empty 2718 the code is adjusted to be a lambda function. 2719 2720 The function generated tests if the $(CODEPOINT) passed 2721 belongs to this set or not. The result is to be used with string mixin. 2722 The intended usage area is aggressive optimization via meta programming 2723 in parser generators and the like. 2724 2725 Note: Use with care for relatively small or regular sets. It 2726 could end up being slower then just using multi-staged tables. 2727 2728 Example: 2729 --- 2730 import std.stdio; 2731 2732 // construct set directly from [a, b$RPAREN intervals 2733 auto set = CodepointSet(10, 12, 45, 65, 100, 200); 2734 writeln(set); 2735 writeln(set.toSourceCode("func")); 2736 --- 2737 2738 The above outputs something along the lines of: 2739 --- 2740 bool func(dchar ch) @safe pure nothrow @nogc 2741 { 2742 if (ch < 45) 2743 { 2744 if (ch == 10 || ch == 11) return true; 2745 return false; 2746 } 2747 else if (ch < 65) return true; 2748 else 2749 { 2750 if (ch < 100) return false; 2751 if (ch < 200) return true; 2752 return false; 2753 } 2754 } 2755 --- 2756 */ 2757 string toSourceCode(string funcName="") 2758 { 2759 import std.array : array; 2760 auto range = byInterval.array(); 2761 return toSourceCode(range, funcName); 2762 } 2763 2764 /** 2765 True if this set doesn't contain any $(CODEPOINTS). 
*/
@property bool empty() const
{
    return data.length == 0;
}

///
pure @safe unittest
{
    CodepointSet emptySet;
    assert(emptySet.length == 0);
    assert(emptySet.empty);
}

private:
alias This = typeof(this);
alias Marker = size_t;

// a random-access range of integral pairs
// `slice` is the flat storage (a0, b0, a1, b1, ...); `start` and `end` are
// indices into `slice` that delimit the part still covered by this range and
// always move in steps of two.
static struct Intervals(Range)
{
    import std.range.primitives : hasAssignableElements;

    this(Range sp) scope
    {
        slice = sp;
        start = 0;
        end = sp.length;
    }

    this(Range sp, size_t s, size_t e) scope
    {
        slice = sp;
        start = s;
        end = e;
    }

    @property auto front()const
    {
        immutable a = slice[start];
        immutable b = slice[start+1];
        return CodepointInterval(a, b);
    }

    //may break sorted property - but we need std.sort to access it
    //hence package(std) protection attribute
    static if (hasAssignableElements!Range)
    package(std) @property void front(CodepointInterval val)
    {
        slice[start] = val.a;
        slice[start+1] = val.b;
    }

    @property auto back()const
    {
        immutable a = slice[end-2];
        immutable b = slice[end-1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    static if (hasAssignableElements!Range)
    package(std) @property void back(CodepointInterval val)
    {
        slice[end-2] = val.a;
        slice[end-1] = val.b;
    }

    void popFront()
    {
        start += 2;
    }

    void popBack()
    {
        end -= 2;
    }

    // `idx` counts intervals relative to the current front
    auto opIndex(size_t idx) const
    {
        immutable a = slice[start+idx*2];
        immutable b = slice[start+idx*2+1];
        return CodepointInterval(a, b);
    }

    //ditto about package
    static if (hasAssignableElements!Range)
    package(std) void opIndexAssign(CodepointInterval val, size_t idx)
    {
        slice[start+idx*2] = val.a;
        slice[start+idx*2+1] = val.b;
    }

    // sub-range of intervals [s, e$(RPAREN), sharing the same storage
    auto opSlice(size_t s, size_t e)
    {
        return Intervals(slice, s*2+start, e*2+start);
    }

    // Number of intervals still in the range. This has to be derived from
    // `start`/`end` rather than from `slice.length`: popFront, popBack and
    // opSlice narrow the window but never shrink `slice` itself.
    @property size_t length()const {  return (end - start)/2; }

    @property bool empty()const { return start == end; }

    @property auto save(){ return this; }
private:
    size_t start, end;
    Range slice;
}

// called after construction from intervals
// to make sure invariants hold
void sanitize()
{
    import std.algorithm.comparison : max;
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    if (data.length == 0)
        return;
    alias Ival = CodepointInterval;
    //intervals wrapper for a _range_ over packed array
    auto ivals = Intervals!(typeof(data[]))(data[]);
    //@@@BUG@@@ can't use "a.a < b.a" see
    // https://issues.dlang.org/show_bug.cgi?id=12265
    sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
    // what follows is a variation on stable remove
    // differences:
    // - predicate is binary, and is tested against
    //   the last kept element (at 'i').
2893 // - predicate mutates lhs (merges rhs into lhs) 2894 size_t len = ivals.length; 2895 size_t i = 0; 2896 size_t j = 1; 2897 while (j < len) 2898 { 2899 if (ivals[i].b >= ivals[j].a) 2900 { 2901 ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b)); 2902 j++; 2903 } 2904 else //unmergable 2905 { 2906 // check if there is a hole after merges 2907 // (in the best case we do 0 writes to ivals) 2908 if (j != i+1) 2909 ivals[i+1] = ivals[j]; //copy over 2910 i++; 2911 j++; 2912 } 2913 } 2914 len = i + 1; 2915 for (size_t k=0; k + 1 < len; k++) 2916 { 2917 assert(ivals[k].a < ivals[k].b); 2918 assert(ivals[k].b < ivals[k+1].a); 2919 } 2920 data.length = len * 2; 2921 } 2922 2923 // special case for normal InversionList 2924 ref subChar(dchar ch) 2925 { 2926 auto mark = skipUpTo(ch); 2927 if (mark != data.length 2928 && data[mark] == ch && data[mark-1] == ch) 2929 { 2930 // it has split, meaning that ch happens to be in one of intervals 2931 data[mark] = data[mark]+1; 2932 } 2933 return this; 2934 } 2935 2936 // 2937 Marker addInterval(int a, int b, Marker hint=Marker.init) scope 2938 in 2939 { 2940 assert(a <= b); 2941 } 2942 do 2943 { 2944 import std.range : assumeSorted, SearchPolicy; 2945 auto range = assumeSorted(data[]); 2946 size_t pos; 2947 size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length; 2948 if (a_idx == range.length) 2949 { 2950 // [---+++----++++----++++++] 2951 // [ a b] 2952 data.append(a, b); 2953 return data.length-1; 2954 } 2955 size_t b_idx = range[a_idx .. 
range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx; 2956 uint[3] buf = void; 2957 uint to_insert; 2958 debug(std_uni) 2959 { 2960 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2961 } 2962 if (b_idx == range.length) 2963 { 2964 // [-------++++++++----++++++-] 2965 // [ s a b] 2966 if (a_idx & 1)// a in positive 2967 { 2968 buf[0] = b; 2969 to_insert = 1; 2970 } 2971 else// a in negative 2972 { 2973 buf[0] = a; 2974 buf[1] = b; 2975 to_insert = 2; 2976 } 2977 pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]); 2978 return pos - 1; 2979 } 2980 2981 uint top = data[b_idx]; 2982 2983 debug(std_uni) 2984 { 2985 writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx); 2986 writefln("a=%s; b=%s; top=%s;", a, b, top); 2987 } 2988 if (a_idx & 1) 2989 {// a in positive 2990 if (b_idx & 1)// b in positive 2991 { 2992 // [-------++++++++----++++++-] 2993 // [ s a b ] 2994 buf[0] = top; 2995 to_insert = 1; 2996 } 2997 else // b in negative 2998 { 2999 // [-------++++++++----++++++-] 3000 // [ s a b ] 3001 if (top == b) 3002 { 3003 assert(b_idx+1 < data.length); 3004 buf[0] = data[b_idx+1]; 3005 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]); 3006 return pos - 1; 3007 } 3008 buf[0] = b; 3009 buf[1] = top; 3010 to_insert = 2; 3011 } 3012 } 3013 else 3014 { // a in negative 3015 if (b_idx & 1) // b in positive 3016 { 3017 // [----------+++++----++++++-] 3018 // [ a b ] 3019 buf[0] = a; 3020 buf[1] = top; 3021 to_insert = 2; 3022 } 3023 else// b in negative 3024 { 3025 // [----------+++++----++++++-] 3026 // [ a s b ] 3027 if (top == b) 3028 { 3029 assert(b_idx+1 < data.length); 3030 buf[0] = a; 3031 buf[1] = data[b_idx+1]; 3032 pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]); 3033 return pos - 1; 3034 } 3035 buf[0] = a; 3036 buf[1] = b; 3037 buf[2] = top; 3038 to_insert = 3; 3039 } 3040 } 3041 pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. 
to_insert]);
        debug(std_uni)
        {
            writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
            writeln("inserting ", buf[0 .. to_insert]);
        }
        return pos - 1;
    }

    /*
        Drops the stored boundaries in `data[pos .. idx)`, where `idx` is `pos`
        plus the number of boundaries in `data[pos .. $]` that the sorted-range
        search places before `a` (predicate `a <= b`).

        If `idx` is odd (`a` falls inside an interval) the dropped boundaries
        are replaced by the single boundary `a`, so the interval now starts
        at `a`; otherwise they are simply removed.

        Params:
            a = code point up to which content is dropped
            pos = even index (start of an interval) to begin dropping from

        Returns: `pos`, except when everything from `pos` to the end is
        dropped, in which case the result of `genericReplace` is returned.
    */
    Marker dropUpTo(uint a, Marker pos=Marker.init)
    in
    {
        assert(pos % 2 == 0); // at start of interval
    }
    do
    {
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        if (range.empty)
            return pos;
        size_t idx = pos;
        idx += range.lowerBound(a).length;

        debug(std_uni)
        {
            writeln("dropUpTo full length=", data.length);
            writeln(pos,"~~~", idx);
        }
        // nothing survives past `a`: remove the whole tail
        if (idx == data.length)
            return genericReplace(data, pos, idx, cast(uint[])[]);
        if (idx & 1)
        {   // a in positive
            //[--+++----++++++----+++++++------...]
            // |<---si s a t
            genericReplace(data, pos, idx, [a]);
        }
        else
        {   // a in negative
            //[--+++----++++++----+++++++-------+++...]
            // |<---si s a t
            genericReplace(data, pos, idx, cast(uint[])[]);
        }
        return pos;
    }

    /*
        Locates the position in `data` that corresponds to `a`, searching
        from `pos`, without removing anything.

        If `a` falls strictly inside an interval, that interval is split in
        two at `a` (the end boundary `top` is replaced by `[a, a, top]`).

        Returns: an even index (start of an interval, possibly a zero-width
        one created by the split), or `data.length` when the search runs past
        the end of the data.
    */
    Marker skipUpTo(uint a, Marker pos=Marker.init)
    out(result)
    {
        assert(result % 2 == 0);// always start of interval
        //(may be 0-width after-split)
    }
    do
    {
        assert(data.length % 2 == 0);
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        size_t idx = pos+range.lowerBound(a).length;

        if (idx >= data.length) // could have Marker point to recently removed stuff
            return data.length;

        if (idx & 1)// inside of interval, check for split
        {

            immutable top = data[idx];
            if (top == a)// no need to split, it's end
                return idx+1;
            immutable start = data[idx-1];
            if (a == start)
                return idx-1;
            // split it up
            genericReplace(data, idx, idx+1, [a, a, top]);
            return idx+1;        // avoid odd index
        }
        return idx;
    }

    CowArray!SP data;
}

// The ASCII set must print as a single half-open interval.
pure @safe unittest
{
    import std.conv : to;
    assert(unicode.ASCII.to!string() == "[0..128)");
}

// pedantic version for ctfe, and aligned-access only architectures
// Reads the 24-bit value stored in the 3 bytes starting at byte `3*idx`,
// one byte at a time, honouring the target byte order.
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    idx *= 3;
    version (LittleEndian)
        return ptr[idx] + (cast(uint) ptr[idx+1]<<8)
             + (cast(uint) ptr[idx+2]<<16);
    else
        return (cast(uint) ptr[idx]<<16) + (cast(uint) ptr[idx+1]<<8)
             + ptr[idx+2];
}

// ditto
// Stores the low 24 bits of `val` into the 3 bytes starting at byte `3*idx`;
// bits above the 24th are discarded by the per-byte masks.
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    idx *= 3;
    version (LittleEndian)
    {
        ptr[idx] = val & 0xFF;
        ptr[idx+1] = (val >> 8) & 0xFF;
        ptr[idx+2] = (val >> 16) & 0xFF;
    }
    else
    {
        ptr[idx] = (val >> 16) & 0xFF;
        ptr[idx+1] = (val >> 8) & 0xFF;
        ptr[idx+2] = val & 0xFF;
    }
}

// unaligned x86-like read/write functions
// Loads a whole 32-bit word at byte offset `3*idx` and keeps 24 bits of it.
// Note that the load covers 4 bytes, i.e. one byte more than the element.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    uint* src = cast(uint*)(ptr+3*idx);
    version (LittleEndian)
        return *src & 0xFF_FFFF;
    else
        return *src >> 8;
}

// ditto
// Read-modify-write of the 32-bit word at byte offset `3*idx`: the 24 bits of
// the element are replaced, the remaining byte of the word is preserved.
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
        // mask `val` so that stray high bits cannot overwrite the byte that
        // belongs to the next element (matches safeWrite24's truncation)
        *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
    else
        *dest = (val << 8) | (*dest & 0xFF);
}

// Reads 24-bit element `idx`: byte-wise at compile time or when unaligned
// access is unavailable, word-wise otherwise.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeRead24(ptr, idx) : unalignedRead24(ptr, idx);
    else
        return safeRead24(ptr, idx);
}

// Writes 24-bit element `idx`; same dispatch as read24.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
        return __ctfe ? safeWrite24(ptr, val, idx) : unalignedWrite24(ptr, val, idx);
    else
        return safeWrite24(ptr, val, idx);
}

/*
    Array of `uint` with a reference count kept in one extra slot right after
    the payload (`data[$-1]`). Copies bump the counter; mutating operations
    detach to a private copy first when the counter is not 1.
    Memory is obtained and released through the policy `SP`.
    An array with no storage at all (`data.length == 0`) has no counter.
*/
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` as the payload, appending one slot for the ref-count
    // through the storage policy.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds the array from a range with a known length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        // an empty range leaves `data` without storage; `data[0..$-1]`
        // would then wrap around and fail the bounds check
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // Builds the array from a forward range; walks a saved copy to learn
    // the length first.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // see the other constructor: nothing to copy into for an empty range
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // Copying shares the storage and increments the counter.
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // The last owner releases the storage, others just decrement the counter.
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Detaches from shared storage before writing.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // Mutable slice: hands out writable memory, so detach first.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // Read-only slice: shared storage can be exposed as is.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally like every other user of `copy` in this struct
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        // nothing to add; also avoids slicing `data` with a wrapped-around
        // `$-1` when the array has no storage yet
        if (val.length == 0)
            return;
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // Compares payloads only; the ref-count slot is excluded.
    bool opEquals()(auto ref const CowArray rhs) const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Gives up this reference: decrement when shared, destroy when last.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Replaces shared storage with a private copy of the same size.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    uint[] data;
}

pure @safe unittest// Uint24 tests
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}

// Exercises `add` on single sets (merging, extending, prepending intervals)
// and the `skipUpTo`/`dropUpTo` helpers, for both storage policies.
pure @safe unittest// core set primitives test
{
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        a = x;
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}


//test constructor to work with any order of intervals
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
    foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}


// Checks union (|), intersection (&), difference (-) and symmetric
// difference (~) of whole sets, in both operand orders where it applies.
pure @safe unittest
{   // full set operations
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 50).add(60, 160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(45, 100).add(130, 190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
            text(c));
        assert(c == d, text(c, " vs ", d));
    }
}

}

// Set difference and intersection against a single code point.
pure @safe unittest// vs single dchar
{
    import std.conv : text;
    CodepointSet a = CodepointSet(10, 100, 120, 200);
    assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A'));
    assert((a & 'B') == CodepointSet(66, 67));
}

// Walks sets by interval and by code point and cross-checks with opIndex.
pure @safe unittest// iteration & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}

//============================================================================
// Generic Trie template and various ways
// to build it
//============================================================================

// debug helper to get a shortened array dump
// Arrays longer than 32 elements are rendered as their first and last
// 16 elements joined by "~...~"; shorter ones are rendered whole.
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    if (x.length > 32)
    {
        return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]);
    }
    else
        return text(x);
}

/**
    Maps `Key` to a suitable integer index within the range of `size_t`.
    The mapping is constructed by applying predicates from `Prefix` left to right
    and concatenating the resulting bits.

    The first (leftmost) predicate defines the most significant bits of
    the resulting index.
 */
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        alias p = Prefix;
        size_t idx;
        // for every predicate but the last: merge its bits in, then make
        // room for the bits of the next predicate
        foreach (i, v; p[0..$-1])
        {
            idx |= p[i](key);
            idx <<= p[i+1].bitSize;
        }
        idx |= p[$-1](key);
        return idx;
    }
}

/*
    `TrieBuilder` is a type used for incremental construction
    of $(LREF Trie)s.

    See $(LREF buildTrie) for generic helpers built on top of it.
*/
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
    static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // Product of 2^^bitSize over all predicates, i.e. the size of the
    // index space they span together.
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // upper bound rounded up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // this function assumes no holes in the input so
    // indices are going one by one
    // Writes `numVals` copies of `val` at the current position of `level`,
    // spilling to new pages (and thereby updating the upper levels) each
    // time a page boundary is reached.
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n = nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals] = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n] = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                // an all-zeros page is already known: reference it from the
                // upper level once per whole page instead of filling pages
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize] = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals] = val;
                j += numVals;
            }
        }
    }

    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        immutable last = idx!level-pageSize;
        // the page that has just been completed
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // linear scan over all earlier pages looking for an identical one
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // loop ran to completion: no duplicate found, keep the page
        if (j == last)
        {
    L_allocate_page:
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first all-zeros page of this level
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
    L_know_index:
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        All slots prior to `a` are filled with
        the default filler.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            // pad the remainder of the domain with the filler
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}

/**
    $(P A generic Trie data-structure for a fixed number of stages.
    The design goal is optimal speed with smallest footprint size.
    )
    $(P It's intentionally read-only and doesn't provide constructors.
     To construct one use a special builder,
     see $(LREF TrieBuilder) and $(LREF buildTrie).
    )

*/
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    // A leading integral argument is the exclusive upper bound on the mapped
    // index; it turns on the bounds assertion in opIndex.
    static if (is(typeof(Args[0]) : size_t))
    {
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // Wraps an already built table (used by the builder).
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is only enabled in debug builds
        and results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        size_t idx;
        alias p = Prefix;
        idx = cast(size_t) p[0](key);
        // each level stores a page number for the next level: shift it by the
        // next level's page bits and add the key's offset within that page
        foreach (i, v; p[0..$-1])
            idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
        return _table.ptr!(p.length-1)[idx];
    }

    ///
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    ///
    // NOTE(review): divides a byte count by the page size in elements with
    // round-to-nearest; presumably an estimate of pages at level `n` - confirm.
    @property size_t pages(size_t n)() const
    {
        return (bytes!n+2^^(Prefix[n].bitSize-1))
                /2^^Prefix[n].bitSize;
    }

    ///
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}

// create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes'
// left-to-right, the most significant bits first
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length > 0)
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(top - sizes[0], top),
                      GetBitSlicing!(top - sizes[0], sizes[1..$]));
    else
        alias GetBitSlicing = AliasSeq!();
}

// Curried predicate: `callableWith!T` yields a template that tells whether
// `Pred` can be called with a `T` and returns a bit-packable result.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (!is(typeof(Pred(T.init))))
            enum callableWith = false;
        else
        {
            alias Result = typeof(Pred(T.init));
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        }
    }
}

/*
    Check if `Prefix` is a valid set of predicates
    for `Trie` template having `Key` as the type of keys.
    This requires all predicates to be callable, take
    single argument of type `Key` and return unsigned value.
*/
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws
}

/*
    Check if `Args` is a set of maximum key value followed by valid predicates
    for `Trie` template having `Key` as the type of keys.

    Two shapes are accepted:
    $(UL
        $(LI every element of `Args` is a predicate callable with `Key`;)
        $(LI `Args[0]` is a value convertible to `Key` (the upper bound)
             and `Args[1..$]` are predicates.)
    )
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else
    {
        // A lone argument can only be a predicate. `Key` must be passed
        // explicitly: without it the predicate itself would be bound as the
        // key type and an empty predicate list would be validated instead.
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
}

// Sum of a compile-time tuple of integers (0 for an empty tuple).
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t count=0;
    foreach (v; ints)
        count += v;
    return count;
}

/**
    A shorthand for creating a custom multi-level fixed Trie
    from a `CodepointSet`. `sizes` are numbers of bits per level,
    with the most significant bits used first.

    Note: The sum of `sizes` must be equal to 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach (line; stdin.byLine)
        {
            int count=0;
            foreach (dchar ch; line)
                if (trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
        if (isCodepointSet!Set)
    {
        // filler is false; every interval of the set is marked true
        auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false);
        foreach (ival; set.byInterval)
            builder.putRange(ival[0], ival[1], true);
        return builder.build();
    }
}

/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build());
}

/**
    A slightly more general tool for building fixed `Trie`
    for the Unicode data.

    Specifically, unlike `codepointSetTrie`, it allows creating mappings
    of `dchar` to an arbitrary type `T`.

    Note: Overload taking `CodepointSet`s will naturally convert
    only to bool mapping `Trie`s.

    CodepointTrie is the type of Trie as generated by codepointTrie function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        auto codepointTrie(Set)(const scope Set set)
            if (isCodepointSet!Set)
        {
            // `codepointSetTrie` is constrained on its `sizes` summing to 21,
            // so the level sizes have to be forwarded explicitly.
            return codepointSetTrie!sizes(set);
        }
    }

    ///
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    ///
    auto codepointTrie(R)(R range, T defValue=T.init)
        if (isInputRange!R
            && is(typeof(ElementType!R.init[0]) : T)
            && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}

@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // pick characters from the Greek script
    auto set = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // here we consider a character lucky
        // if its code point has a lot of identical hex-digits
        // among its 6 nibbles (leading zeros included), e.g.
        // arabic letter DDAL (\u0688 = 0x000688) has a "luck factor" of 3
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint value = ch;
        foreach (i; 0 .. 6)
        {
            nibbles[i] = value & 0xF;
            value >>= 4;
        }
        uint luck;
        foreach (n; nibbles)
            luck = cast(uint) max(luck, count(nibbles[], n));
        return luck;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // create a temporary associative array (AA)
    LuckFactor[dchar] map;
    foreach (ch; set.byCodepoint)
        map[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

    // from now on the AA is not needed
    foreach (ch; set.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}

/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);
    alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build());
}

// Comparator over (Value, Key) tuples that orders them by `Pred` applied
// to the key, i.e. to element [1] of the tuple.
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        return Pred(a[1]) < Pred(b[1]);
    }
}

/**
    The most general utility for construction of `Trie`s
    short of using `TrieBuilder` directly.

    Provides a number of convenience overloads.
    `Args` is tuple of maximum key value followed by
    predicates to construct index from key.
Alternatively, when the first argument is not a value convertible to `Key`,
    the whole tuple of `Args` is taken as predicates
    and the maximum Key is deduced from the predicates.
*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
    // a leading value convertible to Key is the upper bound, not a predicate
    static if (is(typeof(Args[0]) : Key))
        alias Prefix = Args[1..$];
    else
        alias Prefix = Args;

    alias getIndex = mapTrieIndex!(Prefix);

    // comparators for multi-sort: one per predicate, in predicate order
    template GetComparators(size_t n)
    {
        static if (n == 0)
            alias GetComparators = AliasSeq!();
        else
            alias GetComparators =
                AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
    }

    /*
        Build `Trie` from a range of (Value, Key) pairs that is
        already sorted by Key in the order given by this lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        An exception is thrown if a violation of that order is detected.

        Put differently, $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps `Key` to an integer.

        See_Also: $(REF sort, std,_algorithm),
            $(REF SortedRange, std,range),
            $(REF setUnion, std,_algorithm).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (isInputRange!Range
            && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (pair; range)
            trieBuilder.putValue(pair[1], pair[0]);
        return trieBuilder.build();
    }

    /*
        When `Value` is bool (or BitPacked!(bool, x)) a `Trie` can be
        built from a range of open-right intervals of `Key`s.
        Ordering requirements (and what happens when they are violated)
        are the same as for the Key-Value range overload.

        Intervals mark keys that map to !`filler`, the opposite of filler.
        With the default filler (false) keys inside the intervals map to true.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value == bool)
            && isInputRange!Range
            && is(typeof(Range.init.front[0]) : Key)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (interval; range)
            trieBuilder.putRange(interval[0], interval[1], !filler);
        return trieBuilder.build();
    }

    /*
        Same as the (Value, Key) pair overload, but sorts `range` in place
        first when `unsorted` is true.
    */
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
        if (isInputRange!Range
            && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        import std.algorithm.sorting : multiSort;
        if (unsorted)
            multiSort!(GetComparators!(Prefix.length))(range);
        return buildTrie(range, filler);
    }

    /*
        When `Value` is bool (or BitPacked!(bool, x)) a `Trie` can be
        built from a plain input range of `Key`s.
        Ordering requirements (and what happens when they are violated)
        are the same as for the Key-Value range overload.

        Keys present in the range map to !`filler`, the opposite of filler.
        With the default filler (false) those keys map to true.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value == bool)
            && isInputRange!Range
            && is(typeof(Range.init.front) : Key))
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (key; range)
            trieBuilder.putValue(key, !filler);
        return trieBuilder.build();
    }

    /*
        When `Key` is an unsigned integer a `Trie` can be built from an array
        of values, the array index acting as the key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
        if (isUnsigned!Key)
    {
        auto trieBuilder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (position, element; array)
            trieBuilder.putValue(position, element);
        return trieBuilder.build();
    }

    /*
        Builds `Trie` from an associative array: its (value, key) pairs are
        copied into an array and handed to the sorting overload.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        import std.array : array;
        import std.range : zip;
        auto pairs = zip(map.values, map.keys).array;
        return buildTrie(pairs, filler, true); // sort it
    }
}

// helper in place of assumeSize to
// reduce mangled name & help DMD inline Trie functors
struct clamp(size_t bits)
{
    enum bitSize = bits;

    // returns the argument itself, converted to size_t
    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}

// like clamp, but picks element `idx` of an indexable argument
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}

/**
    Conceptual type that outlines the common properties of all UTF Matchers.

    Note: For illustration purposes only, every method
    call results in assertion failure.
    Use $(LREF utfMatcher) to obtain a concrete matcher
    for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
    /**
        $(P Perform the semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at front of `inp` and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on `inp` depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range `inp`
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
*/
    public bool match(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
        if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(false);
    }
    ///
    pure @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of matcher based on a
        set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any `test`/`match`.

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(0);
        return this;
    }

    pure @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}

/**
    Test if `M` is a UTF Matcher for ranges of code units of type `C`.

    `M` qualifies when `match` and `test` return `bool` both for a
    $(LREF decoder) range over `C[]` and for `C[]` itself. `skip` is optional,
    but if it is callable on the decoder range it has to return `bool`
    for both kinds of argument as well.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    // `static assert` makes a failed check a compile error inside the lambda,
    // which is what `__traits(compiles)` observes. A run-time `assert` here
    // would compile no matter what `is(...)` evaluates to.
    static assert(is(typeof(m.match(d)) == bool));
    static assert(is(typeof(m.test(d)) == bool));
    static if (is(typeof(m.skip(d))))
    {
        static assert(is(typeof(m.skip(d)) == bool));
        static assert(is(typeof(m.skip(s)) == bool));
    }
    static assert(is(typeof(m.match(s)) == bool));
    static assert(is(typeof(m.test(s)) == bool));
});

pure @safe unittest
{
    alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
    alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
    static assert(isUtfMatcher!(CharMatcher, char));
    static assert(isUtfMatcher!(CharMatcher, immutable(char)));
    static assert(isUtfMatcher!(WcharMatcher, wchar));
    static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
}

// How a matcher operation advances its input range.
enum Mode {
    alwaysSkip,
    neverSkip,
    skipOnMatch
}

// Mixed into matchers: forwards a `C[]` argument to the range-based
// overload named by `fn`.
mixin template ForwardStrings()
{
    private bool fwdStr(string fn, C)(ref C[] str) const @trusted
    {
        import std.utf : byCodeUnit;
alias type = typeof(byCodeUnit(str));
        // reinterpret the array in place as its byCodeUnit wrapper so the
        // range overload named by `fn` can advance `str` through the reference
        return mixin(fn~"(*cast(type*)&str)");
    }
}

// Building blocks of the UTF-8 matcher: per-length trie specifications,
// the key encoder used while building, and the shared match/skip/test
// front end (DefMatcher).
template Utf8Matcher()
{
    // an encoded length is valid when it is between 1 and 4 code units
    enum validSize(int sz) = sz >= 1 && sz <= 4;

    // Throws UTFException; called on any malformed input.
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-8 sequence");
    }

    //for 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
    //for 2-stage lookup of 2 byte UTF-8 sequences
    alias Utf8Spec2 = AliasSeq!(bool, char[2],
        clampIdx!(0, 5), clampIdx!(1, 6));
    //ditto for 3 byte
    alias Utf8Spec3 = AliasSeq!(bool, char[3],
        clampIdx!(0, 4),
        clampIdx!(1, 6),
        clampIdx!(2, 6)
    );
    //ditto for 4 byte
    alias Utf8Spec4 = AliasSeq!(bool, char[4],
        clampIdx!(0, 3), clampIdx!(1, 6),
        clampIdx!(2, 6), clampIdx!(3, 6)
    );
    // trie types, ordered by encoded length: Tables[0] is for 1 code unit
    alias Tables = AliasSeq!(
        typeof(TrieBuilder!(AsciiSpec)(false).build()),
        typeof(TrieBuilder!(Utf8Spec2)(false).build()),
        typeof(TrieBuilder!(Utf8Spec3)(false).build()),
        typeof(TrieBuilder!(Utf8Spec4)(false).build())
    );
    alias Table(int size) = Tables[size-1];

    // payload bits of the lead byte: 0x1F, 0x0F, 0x07 for sizes 2, 3, 4
    enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
    // marker bits of the lead byte: 0xC0, 0xE0, 0xF0 for sizes 2, 3, 4
    enum encMask(size_t size) = ((1 << size)-1)<<(8-size);

    // Strips the 0x80 marker from a continuation byte and returns its 6
    // payload bits. Anything outside 0x80 .. 0xBF ends up >= 0x40 after the
    // (wrapping) subtraction and is reported through badEncoding.
    char truncate()(char ch) pure @safe
    {
        ch -= 0x80;
        if (ch < 0x40)
        {
            return ch;
        }
        else
        {
            badEncoding();
            return cast(char) 0;
        }
    }

    // Encodes `ch` as UTF-8 and returns the first `sz` code units with the
    // marker bits removed (lead byte masked by leadMask!sz, the rest by 0x3f).
    static auto encode(size_t sz)(dchar ch)
        if (sz > 1)
    {
        import std.utf : encodeUTF = encode;
        char[4] buf;
        encodeUTF(buf, ch);
        char[sz] ret;
        buf[0] &= leadMask!sz;
        foreach (n; 1 .. sz)
            buf[n] = buf[n] & 0x3f; //keep 6 lower bits
        ret[] = buf[0 .. sz];
        return ret;
    }

    // Splits `set` by encoded length, builds one trie per length and packs
    // the four tries into a full matcher, Impl!(1,2,3,4).
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto utf8_2 = set & CodepointSet(0x80, 0x800);
        auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
        auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
        auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
        auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
        alias Ret = Impl!(1,2,3,4);
        return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
    }

    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    mixin template DefMatcher()
    {
        import std.format : format;
        import std.meta : Erase, staticIndexOf;
        // true when length 1 is among the Sizes of the host type
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        // Sizes without the ASCII entry
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts
        // The result is an if/else chain on the lead byte, one arm per size
        // in UniSizes. It is mixed into match/skip/test below and refers to
        // their locals `ch`, `inp` and `mode` by name.
        static auto genDispatch()
        {
            string code;
            foreach (size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        enum dispatch = genDispatch();

        // Tests the code point at the front of `inp`;
        // the range is advanced only when it matches.
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    immutable r = tab!1[ch];
                    if (r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if (Sizes.length == 4) // can skip iff can detect all encodings
        {
            // Tests the code point at the front of `inp`;
            // the range is advanced whether it matches or not.
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if (hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }
        }

        // Tests the code point at the front of `inp` without advancing it.
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];

            static if (hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        // Array overloads: forwarded to the range versions via fwdStr.
        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }

    // Matcher holding one trie per encoded length listed in Sizes.
    struct Impl(Sizes...)
4911 { 4912 import std.meta : allSatisfy, staticMap; 4913 static assert(allSatisfy!(validSize, Sizes), 4914 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4915 private: 4916 //pick tables for chosen sizes 4917 alias OurTabs = staticMap!(Table, Sizes); 4918 OurTabs tables; 4919 mixin DefMatcher; 4920 //static disptach helper UTF size ==> table 4921 alias tab(int i) = tables[i - 1]; 4922 4923 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 4924 { 4925 return CherryPick!(Impl, SizesToPick)(&this); 4926 } 4927 4928 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4929 { 4930 import std.range : popFrontN; 4931 if (inp.length < size) 4932 { 4933 badEncoding(); 4934 return false; 4935 } 4936 char[size] needle = void; 4937 needle[0] = leadMask!size & inp[0]; 4938 static foreach (i; 1 .. size) 4939 { 4940 needle[i] = truncate(inp[i]); 4941 } 4942 //overlong encoding checks 4943 static if (size == 2) 4944 { 4945 //0x80-0x7FF 4946 //got 6 bits in needle[1], must use at least 8 bits 4947 //must use at least 2 bits in needle[1] 4948 if (needle[0] < 2) badEncoding(); 4949 } 4950 else static if (size == 3) 4951 { 4952 //0x800-0xFFFF 4953 //got 6 bits in needle[2], must use at least 12bits 4954 //must use 6 bits in needle[1] or anything in needle[0] 4955 if (needle[0] == 0 && needle[1] < 0x20) badEncoding(); 4956 } 4957 else static if (size == 4) 4958 { 4959 //0x800-0xFFFF 4960 //got 2x6=12 bits in needle[2 .. 
3] must use at least 17bits 4961 //must use 5 bits (or above) in needle[1] or anything in needle[0] 4962 if (needle[0] == 0 && needle[1] < 0x10) badEncoding(); 4963 } 4964 static if (mode == Mode.alwaysSkip) 4965 { 4966 inp.popFrontN(size); 4967 return tab!size[needle]; 4968 } 4969 else static if (mode == Mode.neverSkip) 4970 { 4971 return tab!size[needle]; 4972 } 4973 else 4974 { 4975 static assert(mode == Mode.skipOnMatch); 4976 4977 if (tab!size[needle]) 4978 { 4979 inp.popFrontN(size); 4980 return true; 4981 } 4982 else 4983 return false; 4984 } 4985 } 4986 } 4987 4988 struct CherryPick(I, Sizes...) 4989 { 4990 import std.meta : allSatisfy; 4991 static assert(allSatisfy!(validSize, Sizes), 4992 "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8"); 4993 private: 4994 I* m; 4995 @property auto tab(int i)() const { return m.tables[i - 1]; } 4996 bool lookup(int size, Mode mode, Range)(ref Range inp) const 4997 { 4998 return m.lookup!(size, mode)(inp); 4999 } 5000 mixin DefMatcher; 5001 } 5002 } 5003 5004 template Utf16Matcher() 5005 { 5006 enum validSize(int sz) = sz >= 1 && sz <= 2; 5007 5008 void badEncoding() pure @safe 5009 { 5010 import std.utf : UTFException; 5011 throw new UTFException("Invalid UTF-16 sequence"); 5012 } 5013 5014 // 1-stage ASCII 5015 alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7); 5016 //2-stage BMP 5017 alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7)); 5018 //4-stage - full Unicode 5019 //assume that 0xD800 & 0xDC00 bits are cleared 5020 //thus leaving 10 bit per wchar to worry about 5021 alias UniSpec = AliasSeq!(bool, wchar[2], 5022 assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4), 5023 assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6), 5024 ); 5025 alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build()); 5026 alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build()); 5027 alias Uni = typeof(TrieBuilder!(UniSpec)(false).build()); 5028 5029 auto encode2(dchar ch) 5030 { 5031 ch 
-= 0x1_0000; 5032 assert(ch <= 0xF_FFFF); 5033 wchar[2] ret; 5034 //do not put surrogate bits, they are sliced off 5035 ret[0] = cast(wchar)(ch >> 10); 5036 ret[1] = (ch & 0xFFF); 5037 return ret; 5038 } 5039 5040 auto build(Set)(Set set) 5041 { 5042 import std.algorithm.iteration : map; 5043 auto ascii = set & unicode.ASCII; 5044 auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1)) 5045 - CodepointSet.fromIntervals(0xD800, 0xDFFF+1); 5046 auto other = set - (bmp | ascii); 5047 auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec); 5048 auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec); 5049 auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec); 5050 alias Ret = Impl!(1,2); 5051 return Ret(asciiT, bmpT, otherT); 5052 } 5053 5054 //bootstrap full UTF-16 matcher interace from 5055 //sizeFlags, lookupUni and ascii 5056 mixin template DefMatcher() 5057 { 5058 public bool match(Range)(ref Range inp) const 5059 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5060 !isDynamicArray!Range) 5061 { 5062 enum mode = Mode.skipOnMatch; 5063 assert(!inp.empty); 5064 immutable ch = inp[0]; 5065 static if (sizeFlags & 1) 5066 { 5067 if (ch < 0x80) 5068 { 5069 if (ascii[ch]) 5070 { 5071 inp.popFront(); 5072 return true; 5073 } 5074 else 5075 return false; 5076 } 5077 return lookupUni!mode(inp); 5078 } 5079 else 5080 return lookupUni!mode(inp); 5081 } 5082 5083 static if (Sizes.length == 2) 5084 { 5085 public bool skip(Range)(ref Range inp) const 5086 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5087 !isDynamicArray!Range) 5088 { 5089 enum mode = Mode.alwaysSkip; 5090 assert(!inp.empty); 5091 immutable ch = inp[0]; 5092 static if (sizeFlags & 1) 5093 { 5094 if (ch < 0x80) 5095 { 5096 inp.popFront(); 5097 return ascii[ch]; 5098 } 5099 else 5100 return lookupUni!mode(inp); 5101 } 5102 else 5103 return lookupUni!mode(inp); 5104 } 5105 } 5106 5107 public bool test(Range)(ref Range 
inp) const 5108 if (isRandomAccessRange!Range && is(ElementType!Range : wchar) && 5109 !isDynamicArray!Range) 5110 { 5111 enum mode = Mode.neverSkip; 5112 assert(!inp.empty); 5113 auto ch = inp[0]; 5114 static if (sizeFlags & 1) 5115 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp); 5116 else 5117 return lookupUni!mode(inp); 5118 } 5119 5120 bool match(C)(ref C[] str) const 5121 if (isSomeChar!C) 5122 { 5123 return fwdStr!"match"(str); 5124 } 5125 5126 bool skip(C)(ref C[] str) const 5127 if (isSomeChar!C) 5128 { 5129 return fwdStr!"skip"(str); 5130 } 5131 5132 bool test(C)(ref C[] str) const 5133 if (isSomeChar!C) 5134 { 5135 return fwdStr!"test"(str); 5136 } 5137 5138 mixin ForwardStrings; //dispatch strings to range versions 5139 } 5140 5141 struct Impl(Sizes...) 5142 if (Sizes.length >= 1 && Sizes.length <= 2) 5143 { 5144 private: 5145 import std.meta : allSatisfy; 5146 static assert(allSatisfy!(validSize, Sizes), 5147 "Only lengths of 1 and 2 code units are possible in UTF-16"); 5148 static if (Sizes.length > 1) 5149 enum sizeFlags = Sizes[0] | Sizes[1]; 5150 else 5151 enum sizeFlags = Sizes[0]; 5152 5153 static if (sizeFlags & 1) 5154 { 5155 Ascii ascii; 5156 Bmp bmp; 5157 } 5158 static if (sizeFlags & 2) 5159 { 5160 Uni uni; 5161 } 5162 mixin DefMatcher; 5163 5164 package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)() 5165 { 5166 return CherryPick!(Impl, SizesToPick)(&this); 5167 } 5168 5169 bool lookupUni(Mode mode, Range)(ref Range inp) const 5170 { 5171 wchar x = cast(wchar)(inp[0] - 0xD800); 5172 //not a high surrogate 5173 if (x > 0x3FF) 5174 { 5175 //low surrogate 5176 if (x <= 0x7FF) badEncoding(); 5177 static if (sizeFlags & 1) 5178 { 5179 auto ch = inp[0]; 5180 static if (mode == Mode.alwaysSkip) 5181 inp.popFront(); 5182 static if (mode == Mode.skipOnMatch) 5183 { 5184 if (bmp[ch]) 5185 { 5186 inp.popFront(); 5187 return true; 5188 } 5189 else 5190 return false; 5191 } 5192 else 5193 return bmp[ch]; 5194 } 5195 else 
//skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                import std.range : popFrontN;
                static if (sizeFlags & 2)
                {
                    // a high surrogate must be followed by one more unit
                    if (inp.length < 2)
                        badEncoding();
                    wchar loOffset = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if (loOffset > 0x3FF)
                        badEncoding();
                    // trie key: the low 10 bits of each unit of the pair
                    wchar[2] key = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (!uni[key])
                            return false;
                        inp.popFrontN(2);
                        return true;
                    }
                    else
                        return uni[key];
                }
                else //ditto
                    return false;
            }
        }
    }

    // View over an existing matcher `I`: keeps a pointer to it and
    // forwards `ascii` and `lookupUni` to the pointee.
    struct CherryPick(I, Sizes...)
    if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");

        I* m;
        enum sizeFlags = I.sizeFlags;

        static if (sizeFlags & 1)
        {
            @property auto ascii()() const { return m.ascii; }
        }

        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            return m.lookupUni!mode(inp);
        }
        mixin DefMatcher;
    }
}

// Builds the UTF-8 matcher for `set`.
private auto utf8Matcher(Set)(Set set)
{
    alias Builder = Utf8Matcher!();
    return Builder.build(set);
}

// Builds the UTF-16 matcher for `set`.
private auto utf16Matcher(Set)(Set set)
{
    alias Builder = Utf16Matcher!();
    return Builder.build(set);
}

/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // keep the message on one line: a literal broken across source
        // lines puts a raw newline and indentation into the diagnostic
        static assert(false,
            "UTF-32 needs no decoding, and thus not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}


//a range of code units, packed with index to speed up forward iteration
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // the code units; only popBack and opSlice shrink it
        size_t idx; // position of the current front inside `str`

        @property C front(){ return str[idx]; }
        @property C back(){ return str[$-1]; }
        // advancing only moves the index, `str` itself is left alone
        void popFront(){ idx++; }
        void popBack(){ str = str[0..$-1]; }
        void popFrontN(size_t n){ idx += n; }
        @property bool empty(){ return idx == str.length; }
        @property auto save(){ return this; }
        // indices are relative to the current front
        auto opIndex(size_t i){ return str[idx+i]; }
        @property size_t length(){ return str.length - idx; }
        alias opDollar = length;
        // a slice keeps the prefix of `str` and starts its index at idx+a
        auto opSlice(size_t a, size_t b){ return Decoder(str[0 .. idx+b], idx+a); }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}

pure @safe unittest
{
    string rs = "hi! ネемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 = utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);

    // h
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // i
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);

    // !
5333 assert(!asc.match(codec)); 5334 assert(!utf8.test(codec)); 5335 assert(!utf8.skip(codec)); 5336 assert(codec.idx == 3); 5337 5338 // space 5339 assert(!asc.test(codec)); 5340 assert(!utf8.test(codec)); 5341 assert(!utf8.skip(codec)); 5342 assert(codec.idx == 4); 5343 5344 assert(utf8.test(codec)); 5345 foreach (i; 0 .. 7) 5346 { 5347 assert(!asc.test(codec)); 5348 assert(uni.test(codec)); 5349 assert(utf8.skip(codec)); 5350 } 5351 assert(!utf8.test(codec)); 5352 assert(!utf8.skip(codec)); 5353 5354 //the same with match where applicable 5355 codec = rs.decoder; 5356 assert(utf8.match(codec)); 5357 assert(codec.idx == 1); 5358 assert(utf8.match(codec)); 5359 assert(codec.idx == 2); 5360 assert(!utf8.match(codec)); 5361 assert(codec.idx == 2); 5362 assert(!utf8.skip(codec)); 5363 assert(!utf8.skip(codec)); 5364 5365 foreach (i; 0 .. 7) 5366 { 5367 assert(!asc.test(codec)); 5368 assert(utf8.test(codec)); 5369 assert(utf8.match(codec)); 5370 } 5371 auto i = codec.idx; 5372 assert(!utf8.match(codec)); 5373 assert(codec.idx == i); 5374 } 5375 5376 pure @system unittest 5377 { 5378 import std.range : stride; 5379 static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe 5380 { 5381 bool t = m.test(r); 5382 auto save = r.idx; 5383 assert(t == m.match(r)); 5384 assert(r.idx == save || t); //ether no change or was match 5385 r.idx = save; 5386 static if (is(typeof(m.skip(r)))) 5387 { 5388 assert(t == m.skip(r)); 5389 assert(r.idx != save); //always changed 5390 r.idx = save; 5391 } 5392 return t; 5393 } 5394 auto utf16 = utfMatcher!wchar(unicode.L); 5395 auto bmp = utf16.subMatcher!1; 5396 auto nonBmp = utf16.subMatcher!1; 5397 auto utf8 = utfMatcher!char(unicode.L); 5398 auto ascii = utf8.subMatcher!1; 5399 auto uni2 = utf8.subMatcher!2; 5400 auto uni3 = utf8.subMatcher!3; 5401 auto uni24 = utf8.subMatcher!(2,4); 5402 foreach (ch; unicode.L.byCodepoint.stride(3)) 5403 { 5404 import std.utf : encode; 5405 char[4] buf; 5406 wchar[2] buf16; 5407 auto len = 
encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}

// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);

    //decode failure cases UTF-8
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (sample; fails8)
    {
        // every sample must make test() throw; the bytes are dumped on failure
        assert(collectException((){
            auto s = sample;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) sample));
    }

    //decode failure cases UTF-16
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (sample; fails16)
    {
        assert(collectException((){
            auto s = sample.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}

/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint.
)

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.


+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // reject unsupported levels first, then pick the bit split per level
    static if (level < 1 || level > 4)
        static assert(false,
            "Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly");
    else static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else
        return codepointSetTrie!(6, 4, 4, 7)(set);
}

/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
    auto trie = toTrie!3(set);
    // the returned lambda captures the trie
    return (dchar ch) => trie[ch];
}

/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter `sz` indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
    but not vice versa. Users have to ensure the value fits in
    the range required and use the `cast`
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    enum bitSize = sz;
    T _value;
    alias _value this;
}

/*
    Depending on the form of the passed argument `bitSizeOf` returns
    the amount of bits required to represent a given type
    or a return type of a given functor.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias T = Args[0];
    // 1) anything that exposes a `bitSize` member
    static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
        enum bitSizeOf = T.bitSize;
    // 2) a callable whose return type is a BitPacked
    else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
        enum bitSizeOf = bitSizeOf!(ReturnType!T);
    // 3) otherwise the full width of the type
    else
        enum bitSizeOf = T.sizeof*8;
}

/**
    Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}

/**
    Gives the type `U` from $(LREF BitPacked)!(U, x)
    or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
    static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
    {
        alias TypeOfBitPacked = U;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}

/*
    Wrapper, used in definition of custom data structures from `Trie` template.
    Applying it to a unary lambda function indicates that the returned value always
    fits within `bits` of bits.
*/
struct assumeSize(alias Fn, size_t bits)
{
    enum bitSize = bits;

    // forwards the argument to Fn unchanged
    static auto ref opCall(T)(auto ref T arg)
    {
        return Fn(arg);
    }
}

/*
    A helper for defining lambda function that yields a slice
    of certain bits from an unsigned integral value.
5596 The resulting lambda is wrapped in assumeSize and can be used directly 5597 with `Trie` template. 5598 */ 5599 struct sliceBits(size_t from, size_t to) 5600 { 5601 //for now bypass assumeSize, DMD has trouble inlining it 5602 enum bitSize = to-from; 5603 static auto opCall(T)(T x) 5604 out(result) 5605 { 5606 assert(result < (1 << to-from)); 5607 } 5608 do 5609 { 5610 static assert(from < to); 5611 static if (from == 0) 5612 return x & ((1 << to)-1); 5613 else 5614 return (x >> from) & ((1<<(to-from))-1); 5615 } 5616 } 5617 5618 @safe pure nothrow @nogc uint low_8(uint x) { return x&0xFF; } 5619 @safe pure nothrow @nogc uint midlow_8(uint x){ return (x&0xFF00)>>8; } 5620 alias lo8 = assumeSize!(low_8, 8); 5621 alias mlo8 = assumeSize!(midlow_8, 8); 5622 5623 @safe pure nothrow @nogc unittest 5624 { 5625 static assert(bitSizeOf!lo8 == 8); 5626 static assert(bitSizeOf!(sliceBits!(4, 7)) == 3); 5627 static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2); 5628 } 5629 5630 template Sequence(size_t start, size_t end) 5631 { 5632 static if (start < end) 5633 alias Sequence = AliasSeq!(start, Sequence!(start+1, end)); 5634 else 5635 alias Sequence = AliasSeq!(); 5636 } 5637 5638 //---- TRIE TESTS ---- 5639 @system unittest 5640 { 5641 import std.algorithm.iteration : map; 5642 import std.algorithm.sorting : sort; 5643 import std.array : array; 5644 import std.conv : text, to; 5645 import std.range : iota; 5646 static trieStats(TRIE)(TRIE t) 5647 { 5648 version (std_uni_stats) 5649 { 5650 import std.stdio : writefln, writeln; 5651 writeln("---TRIE FOOTPRINT STATS---"); 5652 static foreach (i; 0 .. t.table.dim) 5653 { 5654 writefln("lvl%s = %s bytes; %s pages" 5655 , i, t.bytes!i, t.pages!i); 5656 } 5657 writefln("TOTAL: %s bytes", t.bytes); 5658 version (none) 5659 { 5660 writeln("INDEX (excluding value level):"); 5661 static foreach (i; 0 .. t.table.dim-1) 5662 writeln(t.table.slice!(i)[0 .. 
t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8 = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;

    // one level: a simple bool array indexed by the low 8 bits
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);
    foreach (a; cast(int) 'a' .. cast(int) 'z')
        assert(trie[a]);
    foreach (a; cast(int) 'A' .. cast(int) 'Z')
        assert(trie[a]);
    foreach (a; 0 .. cast(int) 'A')
        assert(!trie[a]);
    foreach (a; cast(int) 'Z' .. cast(int) 'a')
        assert(!trie[a]);
    trieStats(trie);

    // two levels over a set whose 256-wide pages repeat
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    foreach (i; 0 .. 1024)
        assert(trie2[i] == (i in redundant2));


    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    // four levels, size_t keys
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    foreach (i; 0 ..
max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

    alias mapToS = mapTrieIndex!(useItemAt!(0, char));
    string[] redundantS = ["tea", "start", "orange"];
    redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
    auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
    // using first char only
    assert(redundantS == ["orange", "start", "tea"]);
    assert(strie["test"], text(strie["test"]));
    assert(!strie["aea"]);
    assert(strie["s"]);

    // a bit size test
    auto allBytes = iota(0, 256).map!(x => to!ubyte(x)).array;
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(allBytes);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}

// Key extractor: element `idx` of an array, declared to fit in 8*T.sizeof bits.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t impl(const scope T[] arr)
    {
        return arr[idx];
    }
    alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}

// Key extractor: the last element of an array, declared to fit in 8*T.sizeof bits.
template useLastItem(T)
{
    size_t impl(const scope T[] arr)
    {
        return arr[$-1];
    }
    alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}

// Sum of bitSizeOf over all of `Prefix` (0 for an empty list).
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
        enum fullBitSize = 0;
    else
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
}

template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough of bits to address the next one
        // The bottom level is known to hold full bit width
        // thus its size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        enum upperBits = fullBits - bitSizeOf!(Prefix[$-1]);
        alias idxTypes =
            AliasSeq!(
                idxTypes!(Key, upperBits, Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), upperBits)
            );
    }
}

//============================================================================

// Three-way comparison of two property names. Both sides are lowercased
// with std.ascii.toLower, and white space, '-' and '_' are skipped.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;
    // true for the characters that take part in the comparison
    static bool isSignificant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }
    auto lhs = a.map!toLower.filter!isSignificant;
    auto rhs = b.map!toLower.filter!isSignificant;
    return cmp(lhs, rhs);
}

@safe pure unittest
{
    assert(!comparePropertyName("foo-bar", "fooBar"));
}

// Strict weak ordering built on comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}

//============================================================================
// Utilities for compression of Unicode code point sets
//============================================================================

@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    if (val < 128)
        arr ~= cast(ubyte) val;
    else if (val < (1 << 13))
    {
        arr ~= (0b1_00 << 5)
| cast(ubyte)(val >> 8);
        arr ~= val & 0xFF;
    }
    else
    {
        assert(val < (1 << 21));
        arr ~= (0b1_01 << 5) | cast(ubyte)(val >> 16);
        arr ~= (val >> 8) & 0xFF;
        arr ~= val & 0xFF;
    }
}

// Reads one value written by compressTo from `arr` at `idx` and moves
// `idx` past it. Throws if the bytes that should follow the lead byte
// are missing.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if (!(lead & 0x80)) // no top bit -> [0 .. 127]
        return lead;
    // bit 5 of the lead byte tells how many bytes follow
    immutable tailLen = ((lead >> 5) & 1) + 1; // [1, 2]
    // the low 5 bits of the lead byte are the top bits of the value
    uint acc = (lead & 0x1F);
    enforce(idx + tailLen <= arr.length, "bad code point interval encoding");
    foreach (b; arr[idx .. idx + tailLen])
        acc = (acc << 8) | b;
    idx += tailLen;
    return acc;
}


// Encodes a range of intervals as a byte stream of deltas: each bound is
// stored relative to the previous stored bound. An upper bound equal to
// lastDchar+1 is not stored at all.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] packed;
    uint prev = 0;
    // RLE encode
    foreach (ival; intervals)
    {
        compressTo(ival[0]-prev, packed);
        prev = ival[0];
        if (ival[1] == lastDchar+1) // till the end of the domain so don't store it
            continue;
        compressTo(ival[1]-prev, packed);
        prev = ival[1];
    }
    return packed;
}

@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(run2) == enc2);
    size_t  idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
assert(decompressFrom(enc2, idx) == 0); 5886 assert(decompressFrom(enc2, idx) == (1 << 20)+512+1); 5887 assert(equal(decompressIntervals(compressIntervals(run)), run)); 5888 assert(equal(decompressIntervals(compressIntervals(run2)), run2)); 5889 } 5890 5891 // Creates a range of `CodepointInterval` that lazily decodes compressed data. 5892 @safe package(std) auto decompressIntervals(const(ubyte)[] data) pure 5893 { 5894 return DecompressedIntervals(data); 5895 } 5896 5897 @safe struct DecompressedIntervals 5898 { 5899 pure: 5900 const(ubyte)[] _stream; 5901 size_t _idx; 5902 CodepointInterval _front; 5903 5904 this(const(ubyte)[] stream) 5905 { 5906 _stream = stream; 5907 popFront(); 5908 } 5909 5910 @property CodepointInterval front() 5911 { 5912 assert(!empty); 5913 return _front; 5914 } 5915 5916 void popFront() 5917 { 5918 if (_idx == _stream.length) 5919 { 5920 _idx = size_t.max; 5921 return; 5922 } 5923 uint base = _front[1]; 5924 _front[0] = base + decompressFrom(_stream, _idx); 5925 if (_idx == _stream.length)// odd length ---> till the end 5926 _front[1] = lastDchar+1; 5927 else 5928 { 5929 base = _front[0]; 5930 _front[1] = base + decompressFrom(_stream, _idx); 5931 } 5932 } 5933 5934 @property bool empty() const 5935 { 5936 return _idx == size_t.max; 5937 } 5938 5939 @property DecompressedIntervals save() return scope { return this; } 5940 } 5941 5942 @safe pure nothrow @nogc unittest 5943 { 5944 static assert(isInputRange!DecompressedIntervals); 5945 static assert(isForwardRange!DecompressedIntervals); 5946 } 5947 5948 //============================================================================ 5949 5950 version (std_uni_bootstrap){} 5951 else 5952 { 5953 5954 // helper for looking up code point sets 5955 ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name) 5956 { 5957 import std.algorithm.iteration : map; 5958 import std.range : assumeSorted; 5959 auto range = assumeSorted!((a,b) => propertyNameLess(a,b)) 5960 (table.map!"a.name"()); 5961 
size_t idx = range.lowerBound(name).length; 5962 if (idx < range.length && comparePropertyName(range[idx], name) == 0) 5963 return idx; 5964 return -1; 5965 } 5966 5967 // another one that loads it 5968 bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest) 5969 { 5970 auto idx = findUnicodeSet!table(name); 5971 if (idx >= 0) 5972 { 5973 dest = Set(asSet(table[idx].compressed)); 5974 return true; 5975 } 5976 return false; 5977 } 5978 5979 bool loadProperty(Set=CodepointSet, C) 5980 (const scope C[] name, ref Set target) pure 5981 { 5982 import std.internal.unicode_tables : uniProps; // generated file 5983 alias ucmp = comparePropertyName; 5984 // conjure cumulative properties by hand 5985 if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0) 5986 { 5987 target = asSet(uniProps.Lu); 5988 target |= asSet(uniProps.Ll); 5989 target |= asSet(uniProps.Lt); 5990 target |= asSet(uniProps.Lo); 5991 target |= asSet(uniProps.Lm); 5992 } 5993 else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0) 5994 { 5995 target = asSet(uniProps.Ll); 5996 target |= asSet(uniProps.Lu); 5997 target |= asSet(uniProps.Lt);// Title case 5998 } 5999 else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0) 6000 { 6001 target = asSet(uniProps.Mn); 6002 target |= asSet(uniProps.Mc); 6003 target |= asSet(uniProps.Me); 6004 } 6005 else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0) 6006 { 6007 target = asSet(uniProps.Nd); 6008 target |= asSet(uniProps.Nl); 6009 target |= asSet(uniProps.No); 6010 } 6011 else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0) 6012 { 6013 target = asSet(uniProps.Pc); 6014 target |= asSet(uniProps.Pd); 6015 target |= asSet(uniProps.Ps); 6016 target |= asSet(uniProps.Pe); 6017 target |= asSet(uniProps.Pi); 6018 target |= asSet(uniProps.Pf); 6019 target |= asSet(uniProps.Po); 6020 } 6021 else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0) 6022 { 6023 target = asSet(uniProps.Sm); 6024 target |= asSet(uniProps.Sc); 6025 
target |= asSet(uniProps.Sk); 6026 target |= asSet(uniProps.So); 6027 } 6028 else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0) 6029 { 6030 target = asSet(uniProps.Zs); 6031 target |= asSet(uniProps.Zl); 6032 target |= asSet(uniProps.Zp); 6033 } 6034 else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0) 6035 { 6036 target = asSet(uniProps.Cc); 6037 target |= asSet(uniProps.Cf); 6038 target |= asSet(uniProps.Cs); 6039 target |= asSet(uniProps.Co); 6040 target |= asSet(uniProps.Cn); 6041 } 6042 else if (ucmp(name, "graphical") == 0) 6043 { 6044 target = asSet(uniProps.Alphabetic); 6045 6046 target |= asSet(uniProps.Mn); 6047 target |= asSet(uniProps.Mc); 6048 target |= asSet(uniProps.Me); 6049 6050 target |= asSet(uniProps.Nd); 6051 target |= asSet(uniProps.Nl); 6052 target |= asSet(uniProps.No); 6053 6054 target |= asSet(uniProps.Pc); 6055 target |= asSet(uniProps.Pd); 6056 target |= asSet(uniProps.Ps); 6057 target |= asSet(uniProps.Pe); 6058 target |= asSet(uniProps.Pi); 6059 target |= asSet(uniProps.Pf); 6060 target |= asSet(uniProps.Po); 6061 6062 target |= asSet(uniProps.Zs); 6063 6064 target |= asSet(uniProps.Sm); 6065 target |= asSet(uniProps.Sc); 6066 target |= asSet(uniProps.Sk); 6067 target |= asSet(uniProps.So); 6068 } 6069 else if (ucmp(name, "any") == 0) 6070 target = Set.fromIntervals(0, 0x110000); 6071 else if (ucmp(name, "ascii") == 0) 6072 target = Set.fromIntervals(0, 0x80); 6073 else 6074 return loadUnicodeSet!(uniProps.tab)(name, target); 6075 return true; 6076 } 6077 6078 // CTFE-only helper for checking property names at compile-time 6079 @safe bool isPrettyPropertyName(C)(const scope C[] name) 6080 { 6081 import std.algorithm.searching : find; 6082 auto names = [ 6083 "L", "Letter", 6084 "LC", "Cased Letter", 6085 "M", "Mark", 6086 "N", "Number", 6087 "P", "Punctuation", 6088 "S", "Symbol", 6089 "Z", "Separator", 6090 "Graphical", 6091 "any", 6092 "ascii" 6093 ]; 6094 auto x = find!(x => comparePropertyName(x, name) == 
0)(names);
    return !x.empty;
}

// ditto, CTFE-only, not optimized
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    immutable pos = findUnicodeSet!table(name);
    return pos >= 0;
}

template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(const scope C[] name)
    if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet set;
        if (!loadUnicodeSet!table(name, set))
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        return set;
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        static if (!findSetName!table(name))
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        else
        {
            CodepointSet set;
            loadUnicodeSet!table(name, set);
            return set;
        }
    }
}

// Characters that need escaping in string posed as regular expressions
package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
    ';', ':', '#', '&', '%', '/', '<', '>', '`', '*', '+', '(', ')', '{', '}',  '~');

// Evaluates `expr` directly under CTFE; at run time evaluates it on the
// first call and returns the stored result afterwards.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (__ctfe)
        return mixin(expr);
    alias T = typeof(mixin(expr));
    static T cached;
    static bool ready;
    if (!ready)
    {
        cached = mixin(expr);
        ready = true;
    }
    return cached;
}

//property for \w character class
package(std) @property CodepointSet wordCharacter() @safe
{
    return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc")();
}

//basic stack, just in case it gets used anywhere else than Parser
package(std) struct Stack(T)
{
@safe:
    T[] data;
    @property bool empty(){ return data.empty; }

@property size_t length(){ return data.length; } 6166 6167 void push(T val){ data ~= val; } 6168 6169 @trusted T pop() 6170 { 6171 assert(!empty); 6172 auto val = data[$ - 1]; 6173 data = data[0 .. $ - 1]; 6174 if (!__ctfe) 6175 cast(void) data.assumeSafeAppend(); 6176 return val; 6177 } 6178 6179 @property ref T top() 6180 { 6181 assert(!empty); 6182 return data[$ - 1]; 6183 } 6184 } 6185 6186 //test if a given string starts with hex number of maxDigit that's a valid codepoint 6187 //returns it's value and skips these maxDigit chars on success, throws on failure 6188 package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit) 6189 { 6190 import std.exception : enforce; 6191 //std.conv.parse is both @system and bogus 6192 uint val; 6193 for (int k = 0; k < maxDigit; k++) 6194 { 6195 enforce(!str.empty, "incomplete escape sequence"); 6196 //accepts ascii only, so it's OK to index directly 6197 immutable current = str.front; 6198 if ('0' <= current && current <= '9') 6199 val = val * 16 + current - '0'; 6200 else if ('a' <= current && current <= 'f') 6201 val = val * 16 + current -'a' + 10; 6202 else if ('A' <= current && current <= 'F') 6203 val = val * 16 + current - 'A' + 10; 6204 else 6205 throw new Exception("invalid escape sequence"); 6206 str.popFront(); 6207 } 6208 enforce(val <= 0x10FFFF, "invalid codepoint"); 6209 return val; 6210 } 6211 6212 @safe unittest 6213 { 6214 import std.algorithm.searching : canFind; 6215 import std.exception : collectException; 6216 string[] non_hex = [ "000j", "000z", "FffG", "0Z"]; 6217 string[] hex = [ "01", "ff", "00af", "10FFFF" ]; 6218 int[] value = [ 1, 0xFF, 0xAF, 0x10FFFF ]; 6219 foreach (v; non_hex) 6220 assert(collectException(parseUniHex(v, v.length)).msg 6221 .canFind("invalid escape sequence")); 6222 foreach (i, v; hex) 6223 assert(parseUniHex(v, v.length) == value[i]); 6224 string over = "0011FFFF"; 6225 assert(collectException(parseUniHex(over, over.length)).msg 6226 .canFind("invalid codepoint")); 6227 
}

// Returns `set` extended with the simple case foldings of every code point
// of `set` that is also in unicode.LC.
auto caseEnclose(CodepointSet set)
{
    auto cased = set & unicode.LC;
    foreach (dchar ch; cased.byCodepoint)
    {
        foreach (c; simpleCaseFoldings(ch))
            set |= c;
    }
    return set;
}

/+
    fetch codepoint set corresponding to a name (InBlock or binary property)
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated, bool casefold) @safe
{
    CodepointSet s = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        s = caseEnclose(s);
    if (negated)
        s = s.inverted;
    return s;
}

// Parser for regex-style '[...]' code point set expressions.
// It is itself an input range of dchar over `range`, so the helper
// parsers (parseUniHex, unicode.parseControlCode, unicode.parsePropertySpec)
// can consume characters through it.
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate, Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // Adds `ch` to `set`; with casefold_ enabled all of its simple
        // case foldings are added instead.
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto foldings = simpleCaseFoldings(ch);
                foreach (v; foldings)
                    set |= v;
            }
            else
                set |= ch;
        }

        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // go through addWithFlags like every other branch so
                    // that a char followed by an escape is case-folded too
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    enforce(last <= front, "inverted range");
                    if (casefold_)
                    {
                        for (uint ch = last; ch <= front; ch++)
                            addWithFlags(set, ch);
                    }
                    else
                        set.add(last, front + 1);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
                //xxx last - \ front xxx
                // NOTE(review): ranges whose end is an escape are added
                // without case folding, unlike the plain CharDash case
                // above - confirm whether that is intended.
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    enforce(last <= end,"inverted range");
                    set.add(last, end + 1);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    enforce(last <= end,"inverted range");
                    set.add(last, end + 1);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    enforce(last <= end,"inverted range");
                    set.add(last, end + 1);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                enforce(last <= end,"inverted range");
                set.add(last, end + 1);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses a whole '[...]' expression (possibly nested, with set
    // operators) starting at the current position and returns the
    // resulting set. Throws on malformed input.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        //
        // Applies `op` to the operand(s) on top of `stack`;
        // returns false for operators that cannot be applied (Open, None).
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // Pops and applies operators while `cond` holds for the top one;
        // returns false on a syntax error or if the operator stack runs dry.
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                auto pair = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}

/**
    A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
    a block, script or general category.

    It uses well defined standard rules of property name lookup.
6706 This includes fuzzy matching of names, so that 6707 'White_Space', 'white-SpAce' and 'whitespace' are all considered equal 6708 and yield the same set of white space $(CHARACTERS). 6709 */ 6710 @safe public struct unicode 6711 { 6712 import std.exception : enforce; 6713 /** 6714 Performs the lookup of set of $(CODEPOINTS) 6715 with compile-time correctness checking. 6716 This short-cut version combines 3 searches: 6717 across blocks, scripts, and common binary properties. 6718 6719 Note that since scripts and blocks overlap the 6720 usual trick to disambiguate is used - to get a block use 6721 `unicode.InBlockName`, to search a script 6722 use `unicode.ScriptName`. 6723 6724 See_Also: $(LREF block), $(LREF script) 6725 and (not included in this search) $(LREF hangulSyllableType). 6726 */ 6727 6728 static @property auto opDispatch(string name)() pure 6729 { 6730 static if (findAny(name)) 6731 return loadAny(name); 6732 else 6733 static assert(false, "No unicode set by name "~name~" was found."); 6734 } 6735 6736 /// 6737 @safe unittest 6738 { 6739 import std.exception : collectException; 6740 auto ascii = unicode.ASCII; 6741 assert(ascii['A']); 6742 assert(ascii['~']); 6743 assert(!ascii['\u00e0']); 6744 // matching is case-insensitive 6745 assert(ascii == unicode.ascII); 6746 assert(!ascii['à']); 6747 // underscores, '-' and whitespace in names are ignored too 6748 auto latin = unicode.in_latin1_Supplement; 6749 assert(latin['à']); 6750 assert(!latin['$']); 6751 // BTW Latin 1 Supplement is a block, hence "In" prefix 6752 assert(latin == unicode("In Latin 1 Supplement")); 6753 // run-time look up throws if no such set is found 6754 assert(collectException(unicode("InCyrilliac"))); 6755 } 6756 6757 /** 6758 The same lookup across blocks, scripts, or binary properties, 6759 but performed at run-time. 
6760 This version is provided for cases where `name` 6761 is not known beforehand; otherwise compile-time 6762 checked $(LREF opDispatch) is typically a better choice. 6763 6764 See the $(S_LINK Unicode properties, table of properties) for available 6765 sets. 6766 */ 6767 static auto opCall(C)(const scope C[] name) 6768 if (is(C : dchar)) 6769 { 6770 return loadAny(name); 6771 } 6772 6773 /** 6774 Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks. 6775 6776 Note: 6777 Here block names are unambiguous as no scripts are searched 6778 and thus to search use simply `unicode.block.BlockName` notation. 6779 6780 See $(S_LINK Unicode properties, table of properties) for available sets. 6781 See_Also: $(S_LINK Unicode properties, table of properties). 6782 */ 6783 struct block 6784 { 6785 import std.internal.unicode_tables : blocks; // generated file 6786 mixin SetSearcher!(blocks.tab, "block"); 6787 } 6788 6789 /// 6790 @safe unittest 6791 { 6792 // use .block for explicitness 6793 assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic); 6794 } 6795 6796 /** 6797 Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts. 6798 6799 See the $(S_LINK Unicode properties, table of properties) for available 6800 sets. 6801 */ 6802 struct script 6803 { 6804 import std.internal.unicode_tables : scripts; // generated file 6805 mixin SetSearcher!(scripts.tab, "script"); 6806 } 6807 6808 /// 6809 @safe unittest 6810 { 6811 auto arabicScript = unicode.script.arabic; 6812 auto arabicBlock = unicode.block.arabic; 6813 // there is an intersection between script and block 6814 assert(arabicBlock['']); 6815 assert(arabicScript['']); 6816 // but they are different 6817 assert(arabicBlock != arabicScript); 6818 assert(arabicBlock == unicode.inArabic); 6819 assert(arabicScript == unicode.arabic); 6820 } 6821 6822 /** 6823 Fetch a set of $(CODEPOINTS) that have the given hangul syllable type. 
6824 6825 Other non-binary properties (once supported) follow the same 6826 notation - `unicode.propertyName.propertyValue` for compile-time 6827 checked access and `unicode.propertyName(propertyValue)` 6828 for run-time checked one. 6829 6830 See the $(S_LINK Unicode properties, table of properties) for available 6831 sets. 6832 */ 6833 struct hangulSyllableType 6834 { 6835 import std.internal.unicode_tables : hangul; // generated file 6836 mixin SetSearcher!(hangul.tab, "hangul syllable type"); 6837 } 6838 6839 /// 6840 @safe unittest 6841 { 6842 // L here is syllable type not Letter as in unicode.L short-cut 6843 auto leadingVowel = unicode.hangulSyllableType("L"); 6844 // check that some leading vowels are present 6845 foreach (vowel; '\u1110'..'\u115F') 6846 assert(leadingVowel[vowel]); 6847 assert(leadingVowel == unicode.hangulSyllableType.L); 6848 } 6849 6850 //parse control code of form \cXXX, c assumed to be the current symbol 6851 static package(std) dchar parseControlCode(Parser)(ref Parser p) 6852 { 6853 with(p) 6854 { 6855 popFront(); 6856 enforce(!empty, "Unfinished escape sequence"); 6857 enforce(('a' <= front && front <= 'z') 6858 || ('A' <= front && front <= 'Z'), 6859 "Only letters are allowed after \\c"); 6860 return front & 0x1f; 6861 } 6862 } 6863 6864 //parse and return a CodepointSet for \p{...Property...} and \P{...Property..}, 6865 //\ - assumed to be processed, p - is current 6866 static package(std) CodepointSet parsePropertySpec(Range)(ref Range p, 6867 bool negated, bool casefold) 6868 { 6869 static import std.ascii; 6870 with(p) 6871 { 6872 enum MAX_PROPERTY = 128; 6873 char[MAX_PROPERTY] result; 6874 uint k = 0; 6875 popFront(); 6876 enforce(!empty, "eof parsing unicode property spec"); 6877 if (front == '{') 6878 { 6879 popFront(); 6880 while (k < MAX_PROPERTY && !empty && front !='}' 6881 && front !=':') 6882 { 6883 if (front != '-' && front != ' ' && front != '_') 6884 result[k++] = cast(char) std.ascii.toLower(front); 6885 
popFront(); 6886 } 6887 enforce(k != MAX_PROPERTY, "invalid property name"); 6888 enforce(front == '}', "} expected "); 6889 } 6890 else 6891 {//single char properties e.g.: \pL, \pN ... 6892 enforce(front < 0x80, "invalid property name"); 6893 result[k++] = cast(char) front; 6894 } 6895 auto s = getUnicodeSet(result[0 .. k], negated, casefold); 6896 enforce(!s.empty, "unrecognized unicode property spec"); 6897 popFront(); 6898 return s; 6899 } 6900 } 6901 6902 /** 6903 Parse unicode codepoint set from given `range` using standard regex 6904 syntax '[...]'. The range is advanced skiping over regex set definition. 6905 `casefold` parameter determines if the set should be casefolded - that is 6906 include both lower and upper case versions for any letters in the set. 6907 */ 6908 static CodepointSet parseSet(Range)(ref Range range, bool casefold=false) 6909 if (isInputRange!Range && is(ElementType!Range : dchar)) 6910 { 6911 auto usParser = UnicodeSetParser!Range(range, casefold); 6912 auto set = usParser.parseSet(); 6913 range = usParser.range; 6914 return set; 6915 } 6916 6917 /// 6918 @safe unittest 6919 { 6920 import std.uni : unicode; 6921 string pat = "[a-zA-Z0-9]hello"; 6922 auto set = unicode.parseSet(pat); 6923 // check some of the codepoints 6924 assert(set['a'] && set['A'] && set['9']); 6925 assert(pat == "hello"); 6926 } 6927 6928 private: 6929 alias ucmp = comparePropertyName; 6930 6931 static bool findAny(string name) 6932 { 6933 import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file 6934 return isPrettyPropertyName(name) 6935 || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name) 6936 || (ucmp(name[0 .. 
2],"In") == 0 && findSetName!(blocks.tab)(name[2..$])); 6937 } 6938 6939 static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure 6940 { 6941 import std.conv : to; 6942 import std.internal.unicode_tables : blocks, scripts; // generated file 6943 Set set; 6944 immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set) 6945 || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0 6946 && loadUnicodeSet!(blocks.tab)(name[2..$], set)); 6947 if (loaded) 6948 return set; 6949 throw new Exception("No unicode set by name "~name.to!string()~" was found."); 6950 } 6951 6952 // FIXME: re-disable once the compiler is fixed 6953 // Disabled to prevent the mistake of creating instances of this pseudo-struct. 6954 //@disable ~this(); 6955 } 6956 6957 @safe unittest 6958 { 6959 import std.internal.unicode_tables : blocks, uniProps; // generated file 6960 assert(unicode("InHebrew") == asSet(blocks.Hebrew)); 6961 assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp))); 6962 assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi)); 6963 } 6964 6965 enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally 6966 6967 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too 6968 // Use combined trie instead of checking for '\r' | '\n' | ccTrie, 6969 // or extend | '\u200D' separately 6970 6971 private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow 6972 { 6973 return ch >= '\U0001F1E6' && ch <= '\U0001F1FF'; 6974 } 6975 6976 // Our grapheme decoder is a state machine, this is list of all possible 6977 // states before each code point. 
private enum GraphemeState
{
    Start,
    CR,
    RI,
    L,
    V,
    LVT,
    Emoji,
    EmojiZWJ,
    Prepend,
    End
}

// Message values whether end of grapheme is reached
private enum TransformRes
{
    // No, unless the source range ends here
    // (GB2 - break at end of text, unless text is empty)
    goOn,
    redo, // Run last character again with new state
    retInclude, // Yes, after the just iterated character
    retExclude // Yes, before the just iterated character
}

// The logic of the grapheme decoding is all here
// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29
// Note, getting GB1 (break at start of text, unless text is empty) right
// relies on the user starting grapheme walking from beginning of the text, and
// not attempting to walk an empty text.
private immutable TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // GB4. Break after controls.
        if (graphemeControlTrie[ch] || ch == '\n')
            return TransformRes.retInclude;

        with (GraphemeState) state =
            ch == '\r' ? CR :
            isRegionalIndicator(ch) ? RI :
            isHangL(ch) ? L :
            hangLV[ch] || isHangV(ch) ? V :
            hangLVT[ch] || isHangT(ch) ? LVT :
            prependTrie[ch] ? Prepend :
            xpictoTrie[ch] ? Emoji :
            End;

        // No matter what we encountered, we always include the
        // first code point in the grapheme.
        return TransformRes.goOn;
    },

    // GB3, GB4. Do not break between a CR and LF.
    // Otherwise, break after controls.
    GraphemeState.CR: (ref state, ch) => ch == '\n' ?
        TransformRes.retInclude :
        TransformRes.retExclude,

    // GB12 - GB13. Do not break within emoji flag sequences.
    // That is, do not break between regional indicator (RI) symbols if
    // there is an odd number of RI characters before the break point.
    // This state applies if one and only one RI code point has been
    // encountered.
    GraphemeState.RI: (ref state, ch)
    {
        state = GraphemeState.End;

        return isRegionalIndicator(ch) ?
            TransformRes.goOn :
            TransformRes.redo;
    },

    // GB6. Do not break Hangul syllable sequences.
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;
        else if (isHangV(ch) || hangLV[ch])
        {
            state = GraphemeState.V;
            return TransformRes.goOn;
        }
        else if (hangLVT[ch])
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB7. Do not break Hangul syllable sequences.
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;
        else if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8. Do not break Hangul syllable sequences.
    GraphemeState.LVT: (ref state, ch)
    {
        if (isHangT(ch))
            return TransformRes.goOn;

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // NOT a ZWJ.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        // There might still be spacing marks at the end,
        // which are not allowed in the middle of
        // emoji sequences
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // a ZWJ.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        state = GraphemeState.Emoji;
        if (xpictoTrie[ch])
            return TransformRes.goOn;
        return TransformRes.redo;
    },

    // GB9b. Do not break after Prepend characters.
    GraphemeState.Prepend: (ref state, ch)
    {
        // GB5. Break before controls.
        if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
            return TransformRes.retExclude;

        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9, GB9a. Do not break before extending characters, ZWJ
    // or SpacingMarks.
    // GB999. Otherwise, break everywhere.
    GraphemeState.End: (ref state, ch)
        => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
            TransformRes.retExclude :
            TransformRes.goOn
];

// Drives the graphemeTransforms state machine over `range`, consuming exactly
// one grapheme cluster. With getValue == true the cluster is returned as a
// Grapheme; otherwise nothing is returned and only `range` is advanced.
template genericDecodeGrapheme(bool getValue)
{
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        dchar ch;

        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
    outer:
        while (!range.empty)
        {
            ch = range.front;

        rerun:
            final switch (graphemeTransforms[state](state, ch))
                with(TransformRes)
            {
            case goOn:
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                continue;

            case redo:
                // same code point, but with the state the handler just set
                goto rerun;

            case retInclude:
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                break outer;

            case retExclude:
                break outer;
            }
        }

        static if (getValue)
            return grapheme;
    }
}

public: // Public API continues

/++
    Computes the length of grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = type that is implicitly convertible to `dchar`
        input = array of code units to examine
        index = starting index into `input[]`; must be less than
            `input.length`

    Returns:
        length of grapheme cluster
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto src = input[index..$];
    auto n = src.length;
    genericDecodeGrapheme!(false)(src);
    return n - src.length;
}

///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}

@safe unittest
{
    // Ensure that graphemeStride is usable from CTFE.
    enum c1 = graphemeStride("A", 0);
    static assert(c1 == 1);

    enum c2 = graphemeStride("A\u0301", 0);
    static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
}

@safe pure nothrow @nogc unittest
{
    // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
    assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2);
    // skier ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u2640€"d, 0) == 1);
    // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2);
    // skier ~ zero-width joiner ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3);
    // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner
    // ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4);
    // skier ~ zero-width joiner ~ '€'
    assert(graphemeStride("\u26F7\u200D€"d, 0) == 2);
    //'€' ~ zero-width joiner ~ skier
    assert(graphemeStride("€\u200D\u26F7"d, 0) == 2);
    // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two
    assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2);
    // Kaithi number sign ~ null
    assert(graphemeStride("\U000110BD\0"d, 0) == 1);
}

/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.

    For examples see the $(LREF Grapheme) below.

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
7278 +/ 7279 Grapheme decodeGrapheme(Input)(ref Input inp) 7280 if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar)) 7281 { 7282 return genericDecodeGrapheme!true(inp); 7283 } 7284 7285 @safe unittest 7286 { 7287 import std.algorithm.comparison : equal; 7288 7289 Grapheme gr; 7290 string s = " \u0020\u0308 "; 7291 gr = decodeGrapheme(s); 7292 assert(gr.length == 1 && gr[0] == ' '); 7293 gr = decodeGrapheme(s); 7294 assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308")); 7295 s = "\u0300\u0308\u1100"; 7296 assert(equal(decodeGrapheme(s)[], "\u0300\u0308")); 7297 assert(equal(decodeGrapheme(s)[], "\u1100")); 7298 s = "\u11A8\u0308\uAC01"; 7299 assert(equal(decodeGrapheme(s)[], "\u11A8\u0308")); 7300 assert(equal(decodeGrapheme(s)[], "\uAC01")); 7301 7302 // Two Union Jacks of the Great Britain 7303 s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7"; 7304 assert(equal(decodeGrapheme(s)[], "\U0001F1EC\U0001F1E7")); 7305 } 7306 7307 /++ 7308 $(P Iterate a string by $(LREF Grapheme).) 7309 7310 $(P Useful for doing string manipulation that needs to be aware 7311 of graphemes.) 7312 7313 See_Also: 7314 $(LREF byCodePoint) 7315 +/ 7316 auto byGrapheme(Range)(Range range) 7317 if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)) 7318 { 7319 // TODO: Bidirectional access 7320 static struct Result(R) 7321 { 7322 private R _range; 7323 private Grapheme _front; 7324 7325 bool empty() @property 7326 { 7327 return _front.length == 0; 7328 } 7329 7330 Grapheme front() @property 7331 { 7332 return _front; 7333 } 7334 7335 void popFront() 7336 { 7337 _front = _range.empty ? 
Grapheme.init : _range.decodeGrapheme();
        }

        // `save` is only offered when the wrapped range is a forward range.
        static if (isForwardRange!R)
        {
            Result save() @property
            {
                return Result(_range.save, _front);
            }
        }
    }

    auto result = Result!(Range)(range);
    // Prime the cache so `front`/`empty` are meaningful right away.
    result.popFront();
    return result;
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}

// For testing non-forward-range input ranges:
// wraps a string but deliberately provides no `save`.
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    bool empty() @property { return s.empty; }
    dchar front() @property { return s.front; }
    void popFront() { s.popFront(); }
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    auto reverse = "le\u0308on";
    assert(reverse.walkLength == 5);

    auto gReverse = reverse.byGrapheme;
    assert(gReverse.walkLength == 4);

    // same text in UTF-8, UTF-16 and UTF-32
    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(text.walkLength == 5);
        static assert(isForwardRange!(typeof(text)));

        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        assert(gText.array.retro.equal(gReverse));
    }}

    auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
    static
assert(!isForwardRange!(typeof(nonForwardRange)));
    assert(nonForwardRange.walkLength == 4);
}

// Issue 23474
@safe pure unittest
{
    import std.range.primitives : walkLength;
    assert(byGrapheme("\r\u0308").walkLength == 2);
}

/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;   // underlying range of Graphemes
        private size_t i = 0;   // index of the current code point in _range.front

        bool empty() @property
        {
            return _range.empty;
        }

        dchar front() @property
        {
            return _range.front[i];
        }

        // Steps to the next code point; once the current Grapheme is used up,
        // moves on to the next Grapheme and restarts at its first code point.
        void popFront()
        {
            ++i;

            if (i >= _range.front.length)
            {
                _range.popFront();
                i = 0;
            }
        }

        // `save` is only offered when the wrapped range is a forward range.
        static if (isForwardRange!Range)
        {
            Result save() @property
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}

/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (isNarrowString!Range)
    {
        // Narrow strings get a thin wrapper that forwards only the
        // bidirectional-range primitives listed here; other ranges are
        // returned unchanged (see the `else` branch).
        static struct Result
        {
            private Range _range;
            @property bool empty() { return _range.empty; }
            @property dchar front(){ return _range.front; }
            void popFront(){ _range.popFront; }
            @property auto save() { return Result(_range.save); }
            @property dchar back(){ return _range.back; }
            void popBack(){ _range.popBack; }
        }
        static
assert(isBidirectionalRange!(Result));
        return Result(range);
    }
    else
        return range;
}

///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it and convert the result to a string
    string reverse = s.byGrapheme
        .array
        .retro
        .byCodePoint
        .text;

    assert(reverse == "le\u0308on"); // lëon
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string text = "noe\u0308l";
    // a narrow string viewed by code point must not expose `length`
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    auto gText = InputRangeString(text).byGrapheme;
    static assert(!isForwardRange!(typeof(gText)));

    auto cpText = gText.byCodePoint;
    static assert(!isForwardRange!(typeof(cpText)));

    assert(cpText.walkLength == text.walkLength);

    auto plainCp = text.byCodePoint;
    static assert(isForwardRange!(typeof(plainCp)));
    assert(equal(plainCp, text));
    assert(equal(retro(plainCp.save), retro(text.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}

/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
    always refer to distinct objects. In most actual scenarios a `Grapheme`
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        // read from the heap buffer when big, from the inline buffer otherwise
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        // write to the heap buffer when big, to the inline buffer otherwise
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        // the slice keeps a pointer to this Grapheme (hence the warning above)
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        // small form: the low 7 bits of slen_ hold the length
        // (0x7F equals small_mask; the top bit is the "big" flag)
        return isBig ? len_ : slen_ & 0x7F;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                // inline buffer full: move to heap storage, then append there
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    // still room inline: store and bump the small length
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                // Grow capacity by `grow` code points; the buffer is sized
                // 3 bytes per code point for cap_ + 1 entries. All size
                // arithmetic is overflow-checked.
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            // appends one code point at a time via the dchar overload
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    // This is not a good `opEquals`, but formerly the automatically generated
    // opEquals was used, which was inferred `@safe` because of bugzilla 20655:
    // https://issues.dlang.org/show_bug.cgi?id=20655
    // This `@trusted opEquals` is only here to prevent breakage.
bool opEquals(R)(const auto ref R other) const @trusted
    {
        // field-by-field comparison of the raw storage
        return this.tupleof == other.tupleof;
    }

    // Define a default toHash to allow AA usage
    size_t toHash() const @trusted
    {
        // hashes the raw inline bytes together with the length/flag byte
        return hashOf(slen_, hashOf(small_));
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // Decode one cluster from a slice over our own code points; the
        // contents are valid when that single decode consumed everything.
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a copy of a "big" Grapheme gets its own heap buffer, so the
    // two copies never share `ptr_`.
    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            // same sizing as the append path: 3 bytes * (cap_ + 1), checked
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 ..
raw_cap];
            ptr_ = p;
        }
    }

    // Releases the heap buffer; the inline (small) form owns no allocation.
    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    // Inline buffer size: the whole 4-word union minus the final byte,
    // which is taken by slen_.
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // number of 3-byte code points that fit in the inline buffer
    enum small_cap = small_bytes/3;
    // slen_ layout: top bit = "big" flag, low 7 bits = inline length
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // heap ("big") representation
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // inline ("small") representation; slen_ is the last byte of the union
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the inline contents to a freshly allocated heap buffer with
    // capacity `grow` and marks this Grapheme as big.
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        // copy out before ptr_/len_/cap_ overwrite the overlapping small_ bytes
        for (int i=0; i<k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    // Non-zero (small_flag) when the heap representation is active.
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}

static assert(Grapheme.sizeof == size_t.sizeof*4);


@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;
    Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    assert(byGrapheme("ЮУЗ").equal(data[]));
}

///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
import std.range : isRandomAccessRange;

    string bold = "ku\u0308hn";

    // note that decodeGrapheme takes parameter by ref
    auto first = decodeGrapheme(bold);

    assert(first.length == 1);
    assert(first[0] == 'k');

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(bold);
    // slicing a grapheme yields a random-access range of dchar
    assert(wideOne[].equal("u\u0308"));
    assert(wideOne.length == 2);
    static assert(isRandomAccessRange!(typeof(wideOne[])));

    // all of the usual range manipulation is possible
    assert(wideOne[].filter!isMark().equal("\u0308"));

    auto g = Grapheme("A");
    assert(g.valid);
    g ~= '\u0301';
    assert(g[].equal("A\u0301"));
    assert(g.valid);
    g ~= "B";
    // not a valid grapheme cluster anymore
    assert(!g.valid);
    // still could be useful though
    assert(g[].equal("A\u0301B"));
}

@safe unittest
{
    auto g = Grapheme("A\u0302");
    assert(g[0] == 'A');
    assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
    assert(g[1] == '~');
    assert(!g.valid);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : text;
    import std.range : iota;

    // not valid clusters (but it's just a test)
    auto g = Grapheme('a', 'b', 'c', 'd', 'e');
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'd');
    assert(g[4] == 'e');
    // overwriting one slot must leave its neighbours untouched
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    g ~= 'ц';
    g ~= '~';
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'Й');
    assert(g[4] == 'e');
    assert(g[5] == 'ц');
    assert(g[6] == '~');
    assert(!g.valid);

    Grapheme
copy = g;
    // value semantics: writes to the copy must not show up in the original
    copy[0] = 'X';
    copy[1] = '-';
    assert(g[0] == 'a' && copy[0] == 'X');
    assert(g[1] == 'b' && copy[1] == '-');
    assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
    copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
    copy ~= "xyz";
    assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
    assert(!copy.valid);

    // append 'A' .. 'Z' one code point at a time
    Grapheme h;
    foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
        h ~= v;
    assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
}

// ensure Grapheme can be used as an AA key.
@safe unittest
{
    int[Grapheme] aa;
}

/++
    $(P Does basic case-insensitive comparison of `r1` and `r2`.
    This function uses simpler comparison rule thus achieving better performance
    than $(LREF icmp). However keep in mind the warning below.)

    Params:
        r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
        r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters

    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`

    Warning:
        This function only handles 1:1 $(CODEPOINT) mapping
        and thus is not sufficient for certain alphabets
        like German, Greek and few others.
7938 7939 See_Also: 7940 $(LREF icmp) 7941 $(REF cmp, std,algorithm,comparison) 7942 +/ 7943 int sicmp(S1, S2)(scope S1 r1, scope S2 r2) 7944 if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1) 7945 && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2)) 7946 { 7947 import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file 7948 import std.range.primitives : isInfinite; 7949 import std.utf : decodeFront; 7950 import std.traits : isDynamicArray; 7951 import std.typecons : Yes; 7952 static import std.ascii; 7953 7954 static if ((isDynamicArray!S1 || isRandomAccessRange!S1) 7955 && (isDynamicArray!S2 || isRandomAccessRange!S2) 7956 && !(isInfinite!S1 && isInfinite!S2) 7957 && __traits(compiles, 7958 { 7959 size_t s = size_t.sizeof / 2; 7960 r1 = r1[s .. $]; 7961 r2 = r2[s .. $]; 7962 })) 7963 {{ 7964 // ASCII optimization for dynamic arrays & similar. 7965 size_t i = 0; 7966 static if (isInfinite!S1) 7967 immutable end = r2.length; 7968 else static if (isInfinite!S2) 7969 immutable end = r1.length; 7970 else 7971 immutable end = r1.length > r2.length ? r2.length : r1.length; 7972 for (; i < end; ++i) 7973 { 7974 auto lhs = r1[i]; 7975 auto rhs = r2[i]; 7976 if ((lhs | rhs) >= 0x80) goto NonAsciiPath; 7977 if (lhs == rhs) continue; 7978 auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs); 7979 if (lowDiff) return lowDiff; 7980 } 7981 static if (isInfinite!S1) 7982 return 1; 7983 else static if (isInfinite!S2) 7984 return -1; 7985 else 7986 return (r1.length > r2.length) - (r2.length > r1.length); 7987 7988 NonAsciiPath: 7989 r1 = r1[i .. $]; 7990 r2 = r2[i .. $]; 7991 // Fall through to standard case. 
}}

    // General path: decode one code point from each side per iteration;
    // invalid sequences decode to the replacement character.
    while (!r1.empty)
    {
        immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1;
        immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
        int diff = lhs - rhs;
        if (!diff)
            continue;
        // both ASCII: plain ASCII lower-casing decides
        if ((lhs | rhs) < 0x80)
        {
            immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (!d) continue;
            return d;
        }
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if (idx != EMPTY_CASE_TRIE)
        {
            if (idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable(idx).n;
                idx2 = idx2 - sTable(idx2).n;
                if (idx == idx2)// one bucket, equivalent chars
                    continue;
                else// not the same bucket
                    diff = sTable(idx).ch - sTable(idx2).ch;
            }
            else
                // only lhs is cased: compare its bucket's first char with rhs
                diff = sTable(idx - sTable(idx).n).ch - rhs;
        }
        else if (idx2 != EMPTY_CASE_TRIE)
        {
            // only rhs is cased: compare lhs with its bucket's first char
            diff = lhs - sTable(idx2 - sTable(idx2).n).ch;
        }
        // one of chars is not cased at all
        return diff;
    }
    // r1 is exhausted: 0 if r2 is exhausted as well, otherwise -1
    return int(r2.empty) - 1;
}

///
@safe @nogc pure nothrow unittest
{
    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);
    // things like the following won't get matched as equal
    // Greek small letter iota with dialytika and tonos
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);

    // while icmp has no problem with that
    assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}

// overloads for the most common cases to reduce compile time
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    { return sicmp!(const(char)[], const(char)[])(str1, str2); }

    int sicmp(scope const(wchar)[] str1, scope
const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}

// Helper for `icmp`: tries to match `lhs` against `rhs` (plus, for
// multi-character entries, the following characters of `rtail`) using the
// full case-folding table.
// Returns 0 on a match; `rtail` is advanced only when a multi-character
// entry matched. Otherwise returns a stand-in code point for ordering: `lhs`
// itself when it has no table entry, else the first character of the first
// entry in its bucket.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;
    size_t idx = fullCaseTrie[lhs];
    // fullCaseTrie is packed index table
    if (idx == EMPTY_CASE_TRIE)
        return lhs;
    // [start, end) is the bucket of entries equivalent to lhs
    immutable start = idx - fTable(idx).n;
    immutable end = fTable(idx).size + start;
    assert(fTable(start).entry_len == 1);
    for (idx=start; idx<end; idx++)
    {
        const entryLen = fTable(idx).entry_len;
        if (entryLen == 1)
        {
            if (fTable(idx).seq[0] == rhs)
            {
                return 0;
            }
        }
        else
        {// OK it's a long chunk, like 'ss' for German
            dchar[3] arr = fTable(idx).seq;
            const dchar[] seq = arr[0 .. entryLen];
            if (rhs == seq[0]
                && rtail.skipOver(seq[1..$]))
            {
                // note that this path modifies rtail
                // iff we managed to get there
                return 0;
            }
        }
    }
    return fTable(start).seq[0]; // new remapped character for accurate diffs
}

/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal German ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.

    Params:
        r1 = a forward range of characters
        r2 = a forward range of characters

    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`

    See_Also:
        $(LREF sicmp)
        $(REF cmp, std,algorithm,comparison)
+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    // Fast path: both ranges are indexable and sliceable with size_t bounds,
    // and at least one of them is finite. The closure is only a compile-time
    // probe and is never executed.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        // compare only the common prefix; an infinite side never limits it
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // first non-ASCII code unit: hand the rest to the general loop
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // common prefix matched case-insensitively: the longer range is greater
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // drop the already-compared ASCII prefix
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    // General path: iterate both inputs by code point.
    auto str1 = r1.byDchar;
    auto str2 = r2.byDchar;

    for (;;)
    {
        if (str1.empty)
            return str2.empty ?
0 : -1;
        immutable lhs = str1.front;
        if (str2.empty)
            return 1;
        immutable rhs = str2.front;
        // both heads are consumed before matching, so fullCasedCmp only
        // ever sees (and may advance) the tails
        str1.popFront();
        str2.popFront();
        if (!(lhs - rhs))
            continue;
        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
        if (!cmpLR)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
        if (!cmpRL)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
}

///
@safe @nogc pure nothrow unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
}

/**
 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
 */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
}

// test different character types
@safe unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}

// overloads for the most common cases to reduce compile time
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    { return icmp!(const(char)[], const(char)[])(str1, str2); }
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    { return
icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}

// Runs the same checks for icmp and sicmp over every pairing of
// string/wstring/dstring, both at run time and in CTFE.
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    static foreach (cfunc; AliasSeq!(icmp, sicmp))
    {{
        static foreach (S1; AliasSeq!(string, wstring, dstring))
        static foreach (S2; AliasSeq!(string, wstring, dstring))
        {
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto strs = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(strs);
        assert(strs == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
    }}
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    // https://issues.dlang.org/show_bug.cgi?id=11057
    assert( icmp("K", "L") < 0 );
    });
}

// https://issues.dlang.org/show_bug.cgi?id=17372
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
}

// This is package(std) for
// the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Return a range of all $(CODEPOINTS) that casefold to
    and from this `ch`.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;
    // Result range with two representations:
    //  - "small": idx == uint.max and `c` is the only element (0 = exhausted)
    //  - table:   `idx` is the current position in simpleCaseTable and
    //             `len` the number of entries still to yield
    static struct Range
    {
    @safe pure nothrow:
        uint idx; //if == uint.max, then read c.
        union
        {
            dchar c; // == 0 - empty range
            uint len;
        }
        @property bool isSmall() const { return idx == uint.max; }

        // single-character range, used for characters without case mapping
        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        // range over `size` table entries starting at `start`
        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (isSmall)
            {
                return c;
            }
            auto ch = sTable(idx).ch;
            return ch;
        }

        @property bool empty() const
        {
            if (isSmall)
            {
                return c == 0;
            }
            return len == 0;
        }

        @property size_t length() const
        {
            if (isSmall)
            {
                return c == 0 ?
0 : 1;
            }
            return len;
        }

        void popFront()
        {
            if (isSmall)
                c = 0; // marks the single-character form as exhausted
            else
            {
                idx++;
                len--;
            }
        }
    }
    immutable idx = simpleCaseTrie[ch];
    // no case mapping: the range holds just `ch` itself
    if (idx == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable(idx);
    // step back to the first entry of the bucket containing `ch`
    immutable start = idx - entry.n;
    return Range(start, entry.size);
}

@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}

/++
    $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    // single lookup in the combining-class trie
    return combiningClassTrie[ch];
}

///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}

@safe pure nothrow @nogc unittest
{
    // every ASCII character has combining class 0
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}

/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition.
/// The result is canonically equivalent sequence.
    Canonical,
    /**
         Compatibility decomposition. The result is compatibility equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}

/**
    Shorthand aliases for character decomposition type, passed as a
    template parameter to $(LREF decompose).
*/
enum {
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}

/++
    Try to canonically compose 2 $(CHARACTERS).
    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.

    The assumption is that `first` comes before `second` in the original text,
    usually meaning that the first is a starter.

    Note: Hangul syllables are not covered by this function.
    See `composeJamo` below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted, stride;
    // the trie yields a packed (offset, count) for `first`, or ushort.max
    // when `first` starts no composition at all
    immutable packed = compositionJumpTrie[first];
    if (packed == ushort.max)
        return dchar.init;
    // unpack offset and length
    immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift;
    // compositionTable is read as pairs: even slots hold the candidate second
    // character (sorted within [idx, idx+cnt)), odd slots the composed result
    // TODO: optimize this micro binary search (no more than 4-5 steps)
    auto r = compositionTable.stride(2)[idx ..
idx+cnt].assumeSorted(); 8467 immutable target = r.lowerBound(second).length; 8468 if (target == cnt) 8469 return dchar.init; 8470 immutable entry = compositionTable[(idx+target)*2]; 8471 if (entry != second) 8472 return dchar.init; 8473 return compositionTable[(idx+target)*2 + 1]; 8474 } 8475 8476 /// 8477 @safe unittest 8478 { 8479 assert(compose('A','\u0308') == '\u00C4'); 8480 assert(compose('A', 'B') == dchar.init); 8481 assert(compose('C', '\u0301') == '\u0106'); 8482 // note that the starter is the first one 8483 // thus the following doesn't compose 8484 assert(compose('\u0308', 'A') == dchar.init); 8485 } 8486 8487 /++ 8488 Returns a full $(S_LINK Canonical decomposition, Canonical) 8489 (by default) or $(S_LINK Compatibility decomposition, Compatibility) 8490 decomposition of $(CHARACTER) `ch`. 8491 If no decomposition is available returns a $(LREF Grapheme) 8492 with the `ch` itself. 8493 8494 Note: 8495 This function also decomposes hangul syllables 8496 as prescribed by the standard. 8497 8498 See_Also: $(LREF decomposeHangul) for a restricted version 8499 that takes into account only hangul syllables but 8500 no other decompositions. 
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;

    // pick the mapping trie and the matching decomposition table at compile time
    static if (decompType == Canonical)
    {
        alias mapping = canonMappingTrie;
        alias table = decompCanonTable;
    }
    else static if (decompType == Compatibility)
    {
        alias mapping = compatMappingTrie;
        alias table = decompCompatTable;
    }

    immutable start = mapping[ch];
    if (start)
    {
        // the decomposition is read from `table` beginning at `start`
        // and ends right before the next 0 entry
        auto pieces = table[start .. $].until(0);
        return Grapheme(pieces);
    }
    // a zero index means there is no table entry: fall back to the
    // arithmetic Hangul decomposition
    return decomposeHangul(ch);
}
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // the upper bound is tested first: it alone rejects the ~1M code points
    // that lie above the leading jamo range
    enum pastEnd = jamoLBase + jamoLCount;
    if (ch >= pastEnd)
        return false;
    return ch >= jamoLBase;
}

// Tests if `ch` is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // the upper bound is tested first: it alone rejects the ~1M code points
    // that lie above the trailing jamo range
    enum pastEnd = jamoTBase + jamoTCount;
    if (ch >= pastEnd)
        return false;
    // Note: jamoTBase itself is excluded, TIndex must be > 0
    return ch > jamoTBase;
}

// Tests if `ch` is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // the upper bound is tested first: it alone rejects the ~1M code points
    // that lie above the vowel range
    enum pastEnd = jamoVBase + jamoVCount;
    if (ch >= pastEnd)
        return false;
    return ch >= jamoVBase;
}

// Returns the offset of `ch` from jamoSBase when it lies within the
// jamoSCount precomposed syllables, and -1 otherwise.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}

// internal helper: compose hangul syllables in place; the slots of the
// consumed vowel / trailing jamo are overwritten with dchar.init
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            // not an <L, V> pair, try the next position
            pos++;
            continue;
        }
        immutable int indexL = seq[pos] - jamoLBase;
        immutable int indexV = seq[pos+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        // an optional trailing consonant turns the LV syllable into LVT
        immutable bool hasT = pos + 2 < seq.length && isJamoT(seq[pos+2]);
        if (hasT)
        {
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+2] = dchar.init;
        }
        else
        {
            seq[pos] = jamoSBase + indexLV;
        }
        seq[pos+1] = dchar.init;
        pos += hasT ? 3 : 2;
    }
}
Grapheme decomposeHangul(dchar ch) nothrow pure @safe
{
    // -1 signals that `ch` is outside of the precomposed syllable block
    immutable idxS = hangulSyllableIndex(ch);
    if (idxS < 0)
        return Grapheme(ch);

    // split the syllable index into its T, V and L components
    immutable idxT = idxS % jamoTCount;
    immutable idxV = (idxS % jamoNCount) / jamoTCount;
    immutable idxL = idxS / jamoNCount;

    immutable partL = jamoLBase + idxL;
    immutable partV = jamoVBase + idxV;
    if (idxT > 0)
    {
        // a trailing consonant (T) is present: <L, V, T> decomposition
        return Grapheme(partL, partV, jamoTBase + idxT);
    }
    // <L, V> decomposition
    return Grapheme(partL, partV);
}
/**
    Enumeration type for normalization forms,
    passed as template parameter for functions like $(LREF normalize).
*/
enum NormalizationForm {
    /// Form C: canonical decomposition followed by canonical recomposition.
    NFC,
    /// Form D: canonical decomposition only.
    NFD,
    /// Form KC: compatibility decomposition followed by canonical recomposition.
    NFKC,
    /// Form KD: compatibility decomposition only.
    NFKD
}
/++
    Returns `input` string normalized to the chosen form.
    Form C is used by default.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Params:
        norm = the target normalization form
        input = the string to normalize

    Returns:
        `input` itself when it needs no changes, otherwise a newly
        built array holding the normalized text.

    Note:
    In cases where the string in question is already normalized,
    it is returned unmodified and no memory allocation happens.
+/
/*
    WARNING: @trusted lambda inside - handle with same care as @trusted
            functions

    Despite being a template, the attributes do no harm since this doesn't work
    with user-defined range or character types anyway.
*/
pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C)
    (return scope inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // anchors[0] .. anchors[1] delimits the first piece that needs work;
    // both equal to input.length means nothing needs work
    auto anchors = splitNormalized!norm(input);
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers reused for every piece:
    // ccc[i] is the combining class of decomposed[i]
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        // the text in front of the piece is copied as is
        app.put(input[0 .. anchors[0]]);
        // step 1: decompose every code point of the piece
        foreach (dchar ch; input[anchors[0]..anchors[1]])
            static if (norm == NFD || norm == NFC)
            {
                foreach (dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach (dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        ccc.length = decomposed.length;
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        // step 2: stable-sort every run of code points with a non-zero
        // combining class by that class; class 0 code points stay in place
        foreach (idx, dchar ch; decomposed)
        {
            immutable clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if (clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
                // no open run: makes the trailing sort below a no-op
                // unless another run starts later
                firstNonStable = decomposed.length;
            }
            else if (clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // sorts the run that is still open at the end of the piece (if any)
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        // step 3 (composed forms only): recompose starting at each starter
        static if (norm == NFC || norm == NFKC)
        {
            import std.algorithm.searching : countUntil;
            auto first = countUntil(ccc, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                for (;;)
                {
                    // recompose returns where it stopped: either the next
                    // starter or the end of the buffer
                    immutable second = recompose(first, decomposed, ccc);
                    if (second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if (norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            // recomposition leaves dchar.init in the slots of merged
            // code points; drop them before appending
            import std.algorithm.mutation : remove;
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        () @trusted {
            // assumeSafeAppend isn't considered pure as of writing, hence the
            // cast. It isn't pure in the sense that the elements after
            // the array in question are affected, but we don't use those
            // making the call pure for our purposes.
            (cast(void delegate() pure nothrow) {decomposed.assumeSafeAppend();})();
            ccc.length = 0;
            (cast(void delegate() pure nothrow) {ccc.assumeSafeAppend();})();
        } ();
        // continue with whatever follows the piece just handled
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    } while (anchors[0] != input.length);
    // the remaining tail needs no changes
    app.put(input[0 .. anchors[0]]);
    // the appender holds C[]; cast back to the qualifier of the input
    return () @trusted inout { return cast(inout(C)[]) app.data; } ();
}
255 range 8870 // writefln("recomposing %( %04x %)", input); 8871 // first one is always a starter thus we start at i == 1 8872 size_t i = start+1; 8873 for (; ; ) 8874 { 8875 if (i == input.length) 8876 break; 8877 immutable curCC = ccc[i]; 8878 // In any character sequence beginning with a starter S 8879 // a character C is blocked from S if and only if there 8880 // is some character B between S and C, and either B 8881 // is a starter or it has the same or higher combining class as C. 8882 //------------------------ 8883 // Applying to our case: 8884 // S is input[0] 8885 // accumCC is the maximum CCC of characters between C and S, 8886 // as ccc are sorted 8887 // C is input[i] 8888 8889 if (curCC > accumCC) 8890 { 8891 immutable comp = compose(input[start], input[i]); 8892 if (comp != dchar.init) 8893 { 8894 input[start] = comp; 8895 input[i] = dchar.init;// put a sentinel 8896 // current was merged so its CCC shouldn't affect 8897 // composing with the next one 8898 } 8899 else 8900 { 8901 // if it was a starter then accumCC is now 0, end of loop 8902 accumCC = curCC; 8903 if (accumCC == 0) 8904 break; 8905 } 8906 } 8907 else 8908 { 8909 // ditto here 8910 accumCC = curCC; 8911 if (accumCC == 0) 8912 break; 8913 } 8914 i++; 8915 } 8916 return i; 8917 } 8918 8919 // returns tuple of 2 indexes that delimit: 8920 // normalized text, piece that needs normalization and 8921 // the rest of input starting with stable code point 8922 private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input) 8923 { 8924 import std.typecons : tuple; 8925 ubyte lastCC = 0; 8926 8927 foreach (idx, dchar ch; input) 8928 { 8929 static if (norm == NFC) 8930 if (ch < 0x0300) 8931 { 8932 lastCC = 0; 8933 continue; 8934 } 8935 immutable ubyte CC = combiningClass(ch); 8936 if (lastCC > CC && CC != 0) 8937 { 8938 return seekStable!norm(idx, input); 8939 } 8940 8941 if (notAllowedIn!norm(ch)) 8942 { 8943 return seekStable!norm(idx, input); 8944 } 8945 lastCC = CC; 8946 } 
// Given the index `idx` of a code point that breaks normalization form
// `norm`, finds the surrounding region of `input` that has to be
// re-normalized.
//
// A code point is treated as stable when its combining class is 0 and it is
// allowed in `norm`. Returns tuple(region_start, region_end), where
// region_start is the index of the closest stable code point in front of
// `idx` (0 if there is none) and region_end is the index of the first stable
// code point at or after `idx` (input.length if there is none).
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    // walk backwards from idx looking for a stable code point
    for (;;)
    {
        if (br.empty)// start is 0
            break;
        dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // br still ends with ch, so ch starts codeLength!C(ch) code
            // units before the end of br
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // drop the code point just inspected; popFront() here left br.back
        // unchanged, so the scan could only ever succeed on its first step
        br.popBack();
    }
    ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    // walk forwards from idx looking for the next stable code point
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
// not user friendly name but more direct
// Returns the entry for `ch` in the quick-check trie that belongs to `norm`;
// $(LREF allowedIn) is the negation of this.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
    {
        // the condition has to be a literal false: a string used as the
        // condition does not express "always fail with this message"
        static assert(0, "Unknown normalization form " ~ norm.stringof);
    }
    return qcTrie[ch];
}
/++
    Tells whether `c` is a Unicode lowercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    static import std.ascii;

    // only non-ASCII code points need the trie lookup
    if (!std.ascii.isASCII(c))
        return lowerCaseTrie[c];
    return std.ascii.isLower(c);
}
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;
    // ASCII is answered by std.ascii, everything else by the trie
    if (isASCII(c))
        return isUpper(c);
    return upperCaseTrie[c];
}

@safe unittest
{
    // inside this test the unqualified name is the std.ascii version,
    // `.isUpper` is the Unicode-aware one from this module
    import std.ascii : isUpper;
    // both versions must agree on the whole ASCII range
    foreach (v; 0 .. 0x80)
        assert(isUpper(v) == .isUpper(v));
    assert(!.isUpper('й'));
    assert(.isUpper('Ж'));
    // Greek HETA
    assert(.isUpper('\u0370'));
    assert(!.isUpper('\u0371'));
    assert(.isUpper('\u039C')); // capital MU
    assert(!.isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!.isUpper('\u1F00'));
    assert(.isUpper('\u1F18'));
    foreach (v; unicode.upperCase.byCodepoint)
        assert(.isUpper(v) && !.isLower(v));
}
r.popFront()) 9183 { 9184 auto cOuter = r.front; 9185 ushort idx = indexFn(cOuter); 9186 if (idx == ushort.max) 9187 continue; 9188 auto result = appender!(C[])(); 9189 result.reserve(s.length); 9190 result.put(s[0 .. i]); 9191 foreach (dchar c; s[i .. $].byDchar) 9192 { 9193 if (c.isASCII) 9194 { 9195 result.put(asciiConvert(c)); 9196 } 9197 else 9198 { 9199 idx = indexFn(c); 9200 if (idx == ushort.max) 9201 result.put(c); 9202 else if (idx < maxIdx) 9203 { 9204 c = tableFn(idx); 9205 result.put(c); 9206 } 9207 else 9208 { 9209 auto val = tableFn(idx); 9210 // unpack length + codepoint 9211 immutable uint len = val >> 24; 9212 result.put(cast(dchar)(val & 0xFF_FFFF)); 9213 foreach (j; idx+1 .. idx+len) 9214 result.put(tableFn(j)); 9215 } 9216 } 9217 } 9218 return result.data; 9219 } 9220 9221 static if (isSomeString!S) 9222 return s; 9223 else 9224 return s.array; 9225 } 9226 9227 // https://issues.dlang.org/show_bug.cgi?id=12428 9228 @safe unittest 9229 { 9230 import std.array : replicate; 9231 auto s = "abcdefghij".replicate(300); 9232 s = s[0 .. 
// generic toUpper/toLower on whole range, returns range
//
// indexFn maps a code point to an index into the case table
// (ushort.max means "no mapping"), tableFn reads the table entry at an index,
// indexes below maxIdx are 1:1 mappings, and asciiConvert handles ASCII
// without touching the tables.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            // pending output in buf counts as non-empty
            return !nLeft && r.empty;
        }

        @property auto front()
        {
            import std.ascii : isASCII;

            // lazily convert the current source code point; the result is
            // cached in buf until it has been popped completely
            if (!nLeft)
            {
                dchar c = r.front;
                if (c.isASCII)
                {
                    buf[0] = asciiConvert(c);
                    nLeft = 1;
                }
                else
                {
                    const idx = indexFn(c);
                    if (idx == ushort.max)
                    {
                        // no case mapping: pass through unchanged
                        buf[0] = c;
                        nLeft = 1;
                    }
                    else if (idx < maxIdx)
                    {
                        // 1:1 mapping
                        buf[0] = tableFn(idx);
                        nLeft = 1;
                    }
                    else
                    {
                        // mapping to several code points
                        immutable val = tableFn(idx);
                        // unpack length + codepoint
                        nLeft = val >> 24;
                        if (nLeft == 0)
                            nLeft = 1;
                        assert(nLeft <= buf.length);
                        // buf is filled back to front so that
                        // buf[nLeft - 1] is always the next one to yield
                        buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                        foreach (j; 1 .. nLeft)
                            buf[nLeft - j - 1] = tableFn(idx + j);
                    }
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            // make sure the current code point has been converted, so that
            // nLeft tells how many output code points it produces
            if (!nLeft)
                front;
            assert(nLeft);
            --nLeft;
            // advance the source only once the whole mapping was consumed
            if (!nLeft)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                // copies nLeft and buf as well, only r needs an explicit save
                auto ret = this;
                ret.r = r.save;
                return ret;
            }
        }

      private:
        Range r;             // source range of dchars
        uint nLeft;          // number of converted code points still pending in buf
        dchar[3] buf = void; // pending code points, stored in reverse order
    }

    return ToCaserImpl(str);
}
// explicitly undocumented
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    // forward to the range overload, instantiated with the string type
    // that `Range` converts to
    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}

// explicitly undocumented
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    // forward to the range overload, instantiated with the string type
    // that `Range` converts to
    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
@safe unittest
{
    import std.array : array;

    // a saved copy must replay the same sequence
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // UTF-16 and UTF-32 input; the results used to be computed and
    // thrown away without being checked
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
lwr.empty : !nLeft && r.empty; 9493 } 9494 9495 @property auto front() 9496 { 9497 if (lower) 9498 return lwr.front; 9499 9500 if (!nLeft) 9501 { 9502 immutable dchar c = r.front; 9503 const idx = indexFnUpper(c); 9504 if (idx == ushort.max) 9505 { 9506 buf[0] = c; 9507 nLeft = 1; 9508 } 9509 else if (idx < maxIdxUpper) 9510 { 9511 buf[0] = tableFnUpper(idx); 9512 nLeft = 1; 9513 } 9514 else 9515 { 9516 immutable val = tableFnUpper(idx); 9517 // unpack length + codepoint 9518 nLeft = val >> 24; 9519 if (nLeft == 0) 9520 nLeft = 1; 9521 assert(nLeft <= buf.length); 9522 buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF); 9523 foreach (j; 1 .. nLeft) 9524 buf[nLeft - j - 1] = tableFnUpper(idx + j); 9525 } 9526 } 9527 return buf[nLeft - 1]; 9528 } 9529 9530 void popFront() 9531 { 9532 if (lower) 9533 lwr.popFront(); 9534 else 9535 { 9536 if (!nLeft) 9537 front; 9538 assert(nLeft); 9539 --nLeft; 9540 if (!nLeft) 9541 { 9542 r.popFront(); 9543 lwr = r.asLowerCase(); 9544 lower = true; 9545 } 9546 } 9547 } 9548 9549 static if (isForwardRange!Range) 9550 { 9551 @property auto save() 9552 { 9553 auto ret = this; 9554 ret.r = r.save; 9555 ret.lwr = lwr.save; 9556 return ret; 9557 } 9558 } 9559 9560 private: 9561 Range r; 9562 typeof(r.asLowerCase) lwr; // range representing the lower case rest of string 9563 bool lower = false; // false for first character, true for rest of string 9564 dchar[3] buf = void; 9565 uint nLeft = 0; 9566 } 9567 9568 return ToCapitalizerImpl(str); 9569 } 9570 9571 /********************* 9572 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) 9573 * or string, meaning convert the first 9574 * character to upper case and subsequent characters to lower case. 9575 * 9576 * Does not allocate memory. 9577 * Characters in UTF-8 or UTF-16 format that cannot be decoded 9578 * are treated as $(REF replacementDchar, std,utf). 
auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar, use it directly
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        import std.utf : byDchar;

        // narrower code units have to be decoded first
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
cases.length) 9653 { 9654 import std.utf : byChar; 9655 9656 auto r = cases[i][0].asCapitalized.byChar.array; 9657 auto result = cases[i][1]; 9658 assert(r == result); 9659 } 9660 9661 // Don't call r.front 9662 for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront()) 9663 { 9664 } 9665 9666 import std.algorithm.comparison : equal; 9667 9668 "HELLo"w.asCapitalized.equal("Hello"d); 9669 "hElLO"w.asCapitalized.equal("Hello"d); 9670 "hello"d.asCapitalized.equal("Hello"d); 9671 "HELLO"d.asCapitalized.equal("Hello"d); 9672 9673 import std.utf : byChar; 9674 assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array); 9675 } 9676 9677 // TODO: helper, I wish std.utf was more flexible (and stright) 9678 private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9679 { 9680 if (c <= 0x7F) 9681 { 9682 buf[idx] = cast(char) c; 9683 idx++; 9684 } 9685 else if (c <= 0x7FF) 9686 { 9687 buf[idx] = cast(char)(0xC0 | (c >> 6)); 9688 buf[idx+1] = cast(char)(0x80 | (c & 0x3F)); 9689 idx += 2; 9690 } 9691 else if (c <= 0xFFFF) 9692 { 9693 buf[idx] = cast(char)(0xE0 | (c >> 12)); 9694 buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9695 buf[idx+2] = cast(char)(0x80 | (c & 0x3F)); 9696 idx += 3; 9697 } 9698 else if (c <= 0x10FFFF) 9699 { 9700 buf[idx] = cast(char)(0xF0 | (c >> 18)); 9701 buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 9702 buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 9703 buf[idx+3] = cast(char)(0x80 | (c & 0x3F)); 9704 idx += 4; 9705 } 9706 else 9707 assert(0); 9708 return idx; 9709 } 9710 9711 @safe unittest 9712 { 9713 char[] s = "abcd".dup; 9714 size_t i = 0; 9715 i = encodeTo(s, i, 'X'); 9716 assert(s == "Xbcd"); 9717 9718 i = encodeTo(s, i, cast(dchar)'\u00A9'); 9719 assert(s == "X\xC2\xA9d"); 9720 } 9721 9722 // TODO: helper, I wish std.utf was more flexible (and stright) 9723 private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure 9724 { 9725 import std.utf : 
UTFException; 9726 if (c <= 0xFFFF) 9727 { 9728 if (0xD800 <= c && c <= 0xDFFF) 9729 throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c); 9730 buf[idx] = cast(wchar) c; 9731 idx++; 9732 } 9733 else if (c <= 0x10FFFF) 9734 { 9735 buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 9736 buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00); 9737 idx += 2; 9738 } 9739 else 9740 assert(0); 9741 return idx; 9742 } 9743 9744 private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc 9745 { 9746 buf[idx] = c; 9747 idx++; 9748 return idx; 9749 } 9750 9751 private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure 9752 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9753 { 9754 import std.utf : decode, codeLength; 9755 size_t curIdx = 0; 9756 size_t destIdx = 0; 9757 alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn); 9758 size_t lastUnchanged = 0; 9759 // in-buffer move of bytes to a new start index 9760 // the trick is that it may not need to copy at all 9761 static size_t moveTo(C[] str, size_t dest, size_t from, size_t to) 9762 { 9763 // Interestingly we may just bump pointer for a while 9764 // then have to copy if a re-cased char was smaller the original 9765 // later we may regain pace with char that got bigger 9766 // In the end it sometimes flip-flops between the 2 cases below 9767 if (dest == from) 9768 return to; 9769 // got to copy 9770 foreach (C c; str[from .. 
to]) 9771 str[dest++] = c; 9772 return dest; 9773 } 9774 while (curIdx != s.length) 9775 { 9776 size_t startIdx = curIdx; 9777 immutable ch = decode(s, curIdx); 9778 // TODO: special case for ASCII 9779 immutable caseIndex = indexFn(ch); 9780 if (caseIndex == ushort.max) // unchanged, skip over 9781 { 9782 continue; 9783 } 9784 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9785 { 9786 // previous cased chars had the same length as uncased ones 9787 // thus can just adjust pointer 9788 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9789 lastUnchanged = curIdx; 9790 immutable cased = tableFn(caseIndex); 9791 immutable casedLen = codeLength!C(cased); 9792 if (casedLen + destIdx > curIdx) // no place to fit cased char 9793 { 9794 // switch to slow codepath, where we allocate 9795 return slowToCase(s, startIdx, destIdx); 9796 } 9797 else 9798 { 9799 destIdx = encodeTo(s, destIdx, cased); 9800 } 9801 } 9802 else // 1:m codepoint mapping, slow codepath 9803 { 9804 destIdx = moveTo(s, destIdx, lastUnchanged, startIdx); 9805 lastUnchanged = curIdx; 9806 return slowToCase(s, startIdx, destIdx); 9807 } 9808 assert(destIdx <= curIdx); 9809 } 9810 if (lastUnchanged != s.length) 9811 { 9812 destIdx = moveTo(s, destIdx, lastUnchanged, s.length); 9813 } 9814 s = s[0 .. 
destIdx]; 9815 } 9816 9817 // helper to precalculate size of case-converted string 9818 private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn) 9819 { 9820 size_t toCaseLength(C)(const scope C[] str) 9821 { 9822 import std.utf : decode, codeLength; 9823 size_t codeLen = 0; 9824 size_t lastNonTrivial = 0; 9825 size_t curIdx = 0; 9826 while (curIdx != str.length) 9827 { 9828 immutable startIdx = curIdx; 9829 immutable ch = decode(str, curIdx); 9830 immutable ushort caseIndex = indexFn(ch); 9831 if (caseIndex == ushort.max) 9832 continue; 9833 else if (caseIndex < maxIdx) 9834 { 9835 codeLen += startIdx - lastNonTrivial; 9836 lastNonTrivial = curIdx; 9837 immutable cased = tableFn(caseIndex); 9838 codeLen += codeLength!C(cased); 9839 } 9840 else 9841 { 9842 codeLen += startIdx - lastNonTrivial; 9843 lastNonTrivial = curIdx; 9844 immutable val = tableFn(caseIndex); 9845 immutable len = val >> 24; 9846 immutable dchar cased = val & 0xFF_FFFF; 9847 codeLen += codeLength!C(cased); 9848 foreach (j; caseIndex+1 .. caseIndex+len) 9849 codeLen += codeLength!C(tableFn(j)); 9850 } 9851 } 9852 if (lastNonTrivial != str.length) 9853 codeLen += str.length - lastNonTrivial; 9854 return codeLen; 9855 } 9856 } 9857 9858 @safe unittest 9859 { 9860 alias toLowerLength = toCaseLength!(LowerTriple); 9861 assert(toLowerLength("abcd") == 4); 9862 assert(toLowerLength("аБВгд456") == 10+3); 9863 } 9864 9865 // slower code path that preallocates and then copies 9866 // case-converted stuf to the new string 9867 private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn) 9868 { 9869 void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx, 9870 size_t destIdx) @trusted pure 9871 if (is(C == char) || is(C == wchar) || is(C == dchar)) 9872 { 9873 import std.utf : decode; 9874 alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn); 9875 auto trueLength = destIdx + caseLength(s[curIdx..$]); 9876 C[] ns = new C[trueLength]; 9877 ns[0 .. destIdx] = s[0 .. 
destIdx]; 9878 size_t lastUnchanged = curIdx; 9879 while (curIdx != s.length) 9880 { 9881 immutable startIdx = curIdx; // start of current codepoint 9882 immutable ch = decode(s, curIdx); 9883 immutable caseIndex = indexFn(ch); 9884 if (caseIndex == ushort.max) // skip over 9885 { 9886 continue; 9887 } 9888 else if (caseIndex < maxIdx) // 1:1 codepoint mapping 9889 { 9890 immutable cased = tableFn(caseIndex); 9891 auto toCopy = startIdx - lastUnchanged; 9892 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9893 lastUnchanged = curIdx; 9894 destIdx += toCopy; 9895 destIdx = encodeTo(ns, destIdx, cased); 9896 } 9897 else // 1:m codepoint mapping, slow codepath 9898 { 9899 auto toCopy = startIdx - lastUnchanged; 9900 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx]; 9901 lastUnchanged = curIdx; 9902 destIdx += toCopy; 9903 auto val = tableFn(caseIndex); 9904 // unpack length + codepoint 9905 immutable uint len = val >> 24; 9906 destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF)); 9907 foreach (j; caseIndex+1 .. caseIndex+len) 9908 destIdx = encodeTo(ns, destIdx, tableFn(j)); 9909 } 9910 } 9911 if (lastUnchanged != s.length) 9912 { 9913 auto toCopy = s.length - lastUnchanged; 9914 ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$]; 9915 destIdx += toCopy; 9916 } 9917 assert(ns.length == destIdx); 9918 s = ns; 9919 } 9920 } 9921 9922 /++ 9923 Converts `s` to lowercase (by performing Unicode lowercase mapping) in place. 9924 For a few characters string length may increase after the transformation, 9925 in such a case the function reallocates exactly once. 9926 If `s` does not have any uppercase characters, then `s` is unaltered. 
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // LowerTriple expands to the index function, 1:1-mapping bound and
    // table function that toCaseInPlace takes as template arguments.
    toCaseInPlace!LowerTriple(s);
}
// Concrete overloads for the three built-in character types, so that the
// common cases do not instantiate the template at every call site
// (reduces compile time).
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!char(s);
    }

    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!wchar(s);
    }

    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!dchar(s);
    }
}

/++
    Converts `s` to uppercase (by performing Unicode uppercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any lowercase characters, then `s` is unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // Same machinery as toLowerInPlace, driven by the uppercase tables.
    toCaseInPlace!UpperTriple(s);
}
// Concrete overloads for the three built-in character types
// (reduces compile time and code size).
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!char(s);
    }

    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!wchar(s);
    }

    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!dchar(s);
    }
}

/++
    If `c` is a Unicode uppercase $(CHARACTER), then its lowercase equivalent
    is returned. Otherwise `c` is returned.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    // Code points from U+00AA upwards are resolved through the
    // simple-case lookup table.
    if (c >= 0xAA)
    {
        immutable size_t slot = toLowerSimpleIndex(c);
        if (slot == ushort.max) // no simple lowercase mapping
            return c;
        return toLowerTab(slot);
    }

    // Fast path: below U+00AA only ASCII 'A' .. 'Z' is remapped.
    if ('A' <= c && c <= 'Z')
        return c + 32;
    return c;
}

/++
    Creates a new array which is identical to `s` except that all of its
    characters are converted to lowercase (by performing Unicode lowercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s` is a
    `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
              of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    // std.ascii.toLower is handed to toCase next to the Unicode tables
    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;
    return toCase!(LowerTriple, std.ascii.toLower)(s);
}

// Non-template overloads for the built-in string types; they forward to the
// template above (kept to reduce compile time for the most common cases).
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    {
        return toLower!string(s);
    }

    wstring toLower(return scope wstring s)
    {
        return toLower!wstring(s);
    }

    dstring toLower(return scope dstring s)
    {
        return toLower!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toLower(String(""));
        }
    }
}


@safe unittest
{
    static import std.ascii;
    import std.format : format;
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toLower(ch) == toLower(ch));
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');
    foreach (ch; unicode.upperCase.byCodepoint)
    {
        dchar low = ch.toLower();
        assert(low == ch || isLower(low), format("%s -> %s", ch, low));
    }
    assert(toLower("АЯ") == "ая");

    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}

// https://issues.dlang.org/show_bug.cgi?id=9629
@safe unittest
{
    wchar[] test = "hello þ world"w.dup;
    auto piece = test[6 .. 7];
    toUpperInPlace(piece);
    assert(test == "hello Þ world");
}


@safe unittest
{
    import std.algorithm.comparison : cmp;
    string s1 = "FoL";
    string s2 = toLower(s1);
    assert(cmp(s2, "fol") == 0, s2);
    assert(s2 != s1);

    char[] s3 = s1.dup;
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "A\u0100B\u0101d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0101b\u0101d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "A\u0460B\u0461d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0461b\u0461d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    s1 = "\u0130";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(s2 == "i\u0307");
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar c = 'İ'; // U+0130 LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}

@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;
    auto r1 = "FoL".byCodeUnit;
    assert(r1.toLower.cmp("fol") == 0);
    auto r2 = "A\u0460B\u0461d".byCodeUnit;
    assert(r2.toLower.cmp("a\u0461b\u0461d") == 0);
}

/++
    If `c` is a Unicode lowercase $(CHARACTER), then its uppercase equivalent
    is returned. Otherwise `c` is returned.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toUpper which takes full string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    // Code points from U+00AA upwards are resolved through the
    // simple-case lookup table.
    if (c >= 0xAA)
    {
        immutable size_t slot = toUpperSimpleIndex(c);
        if (slot == ushort.max) // no simple uppercase mapping
            return c;
        return toUpperTab(slot);
    }

    // Fast path: below U+00AA only ASCII 'a' .. 'z' is remapped.
    if ('a' <= c && c <= 'z')
        return c - 32;
    return c;
}

///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    auto abuf = appender!(char[])();
    "hello".map!toUpper.copy(abuf);
    assert(abuf.data == "HELLO");
}

@safe unittest
{
    static import std.ascii;
    import std.format : format;
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toUpper(ch) == toUpper(ch));
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');
    auto title = unicode.Titlecase_Letter;
    foreach (ch; unicode.lowerCase.byCodepoint)
    {
        dchar up = ch.toUpper();
        assert(up == ch || isUpper(up) || title[up],
            format("%x -> %x", ch, up));
    }
}

/++
    Allocates a new array which is identical to `s` except that all of its
    characters are converted to uppercase (by performing Unicode uppercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s`
    is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
              of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    // std.ascii.toUpper is handed to toCase next to the Unicode tables
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}

// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    { return toUpper!string(s); }
    wstring toUpper(return scope wstring s)
    { return toUpper!wstring(s); }
    dstring toUpper(return scope dstring s)
    { return toUpper!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toUpper(String(""));
        }
    }
}

@safe unittest
{
    import std.algorithm.comparison : cmp;

    string s1 = "FoL";
    string s2;
    char[] s3;

    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2, s3);
    assert(cmp(s2, "FOL") == 0);
    assert(s2 !is s1);

    s1 = "a\u0100B\u0101d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0100B\u0100D") == 0);
    assert(s2 !is s1);

    s1 = "a\u0460B\u0461d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0460B\u0460D") == 0);
    assert(s2 !is s1);
}

@safe unittest
{
    // Checks the allocating and the in-place conversions of `s` against the
    // expected upper- and lowercase forms.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three specifiers: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // `diff` needs three arguments; with only two, a failing assert
        // would raise a FormatException instead of showing the message.
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // these two must be converted without reallocating
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
            foreach (j; i .. options.length)
                foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}

// test random access ranges
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;
    auto s1 = "FoL".byCodeUnit;
    assert(s1.toUpper.cmp("FOL") == 0);
    auto s2 = "a\u0460B\u0461d".byCodeUnit;
    assert(s2.toUpper.cmp("A\u0460B\u0460D") == 0);
}

/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // optimization: below U+00AA only the ASCII letters are reported
    if (c < 0xAA)
    {
        return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
    }

    return alphaTrie[c];
}

@safe unittest
{
    auto alpha = unicode("Alphabetic");
    foreach (ch; alpha.byCodepoint)
        assert(isAlpha(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in alpha) == isAlpha(ch));
}


/++
    Returns whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    return markTrie[c];
}

@safe unittest
{
    auto mark = unicode("Mark");
    foreach (ch; mark.byCodepoint)
        assert(isMark(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in mark) == isMark(ch));
}

/++
    Returns whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // Non-ASCII code points are looked up in the trie.
    if (c > 0x7F)
        return numberTrie[c];

    // ASCII fast path: only the decimal digits qualify.
    return '0' <= c && c <= '9';
}

@safe unittest
{
    auto n = unicode("N");
    foreach (ch; n.byCodepoint)
        assert(isNumber(ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in n) == isNumber(ch));
}

/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // Outside ASCII, combine the two Unicode predicates.
    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // ASCII fast path
    return std.ascii.isAlphaNum(c);
}

@safe unittest
{
    auto n = unicode("N");
    auto alpha = unicode("Alphabetic");

    foreach (ch; n.byCodepoint)
        assert(isAlphaNum(ch));

    foreach (ch; alpha.byCodepoint)
        assert(isAlphaNum(ch));

    foreach (ch; 0 .. 0x4000)
    {
        assert(((ch in n) || (ch in alpha)) == isAlphaNum(ch));
    }
}

/++
    Returns whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // Non-ASCII code points are looked up in the trie.
    if (c > 0x7F)
        return punctuationTrie[c];

    // ASCII fast path delegates to std.ascii.
    // NOTE(review): std.ascii.isPunctuation may classify some ASCII symbols
    // differently from the Unicode P categories - confirm if that matters.
    return std.ascii.isPunctuation(c);
}

@safe unittest
{
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));
    foreach (ch; unicode("P").byCodepoint)
        assert(isPunctuation(ch));
}

/++
    Returns whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    return symbolTrie[c];
}

@safe unittest
{
    import std.format : format;
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));
    foreach (ch; unicode("S").byCodepoint)
        assert(isSymbol(ch), format("%04x", ch));
}

/++
    Returns whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    import std.internal.unicode_tables : isSpaceGen; // generated file
    return isSpaceGen(c);
}

@safe unittest
{
    assert(isSpace('\u0020'));
    auto space = unicode.Zs;
    foreach (ch; space.byCodepoint)
        assert(isSpace(ch));
    foreach (ch; 0 .. 0x1000)
        assert(isSpace(ch) == space[ch]);
}


/++
    Returns whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    return graphicalTrie[c];
}


@safe unittest
{
    auto set = unicode("Graphical");
    import std.format : format;
    foreach (ch; set.byCodepoint)
        assert(isGraphical(ch), format("%4x", ch));
    foreach (ch; 0 .. 0x4000)
        assert((ch in set) == isGraphical(ch));
}


/++
    Returns whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    import std.internal.unicode_tables : isControlGen; // generated file
    return isControlGen(c);
}

@safe unittest
{
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));
    auto cc = unicode.Cc;
    foreach (ch; cc.byCodepoint)
        assert(isControl(ch));
    foreach (ch; 0 .. 0x1000)
        assert(isControl(ch) == cc[ch]);
}


/++
    Returns whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    import std.internal.unicode_tables : isFormatGen; // generated file
    return isFormatGen(c);
}


@safe unittest
{
    assert(isFormat('\u00AD'));
    foreach (ch; unicode("Format").byCodepoint)
        assert(isFormat(ch));
}

// code points for private use, surrogates are not likely to change in near future
// if need be they can be generated from unicode data as well

/++
    Returns whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // The three ranges this function treats as private use.
    immutable inFirstRange = 0x00_E000 <= c && c <= 0x00_F8FF;
    immutable inSecondRange = 0x0F_0000 <= c && c <= 0x0F_FFFD;
    immutable inThirdRange = 0x10_0000 <= c && c <= 0x10_FFFD;
    return inFirstRange || inSecondRange || inThirdRange;
}

/++
    Returns whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    return c >= 0xD800 && c <= 0xDFFF;
}

/++
    Returns whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    return c >= 0xD800 && c <= 0xDBFF;
}

/++
    Returns whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    return c >= 0xDC00 && c <= 0xDFFF;
}

/++
    Returns whether `c` is a Unicode non-character i.e.
    a $(CODEPOINT) with no assigned abstract character.
    (general Unicode category: Cn)
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    return nonCharacterTrie[c];
}

@safe unittest
{
    auto set = unicode("Cn");
    foreach (ch; set.byCodepoint)
        assert(isNonCharacter(ch));
}

private:
// load static data from pre-generated tables into usable datastructures


// Builds a CodepointSet from compressed interval data.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    return CodepointSet.fromIntervals(decompressIntervals(compressed));
}

// Wraps the offsets, sizes and data of a TrieEntry in a const CodepointTrie.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    return const(CodepointTrie!T)(e.offsets, e.sizes, e.data);
}

@safe pure nothrow @nogc @property
{
    // It's important to use auto return here, so that the compiler
    // only runs semantic on the return type if the function gets
    // used. Also these are functions rather than templates to not
    // increase the object size of the caller.
    auto lowerCaseTrie() { static immutable res = asTrie(lowerCaseTrieEntries); return res; }
    auto upperCaseTrie() { static immutable res = asTrie(upperCaseTrieEntries); return res; }
    auto simpleCaseTrie() { static immutable res = asTrie(simpleCaseTrieEntries); return res; }
    auto fullCaseTrie() { static immutable res = asTrie(fullCaseTrieEntries); return res; }
    auto alphaTrie() { static immutable res = asTrie(alphaTrieEntries); return res; }
    auto markTrie() { static immutable res = asTrie(markTrieEntries); return res; }
    auto numberTrie() { static immutable res = asTrie(numberTrieEntries); return res; }
    auto punctuationTrie() { static immutable res = asTrie(punctuationTrieEntries); return res; }
    auto symbolTrie() { static immutable res = asTrie(symbolTrieEntries); return res; }
    auto graphicalTrie() { static immutable res = asTrie(graphicalTrieEntries); return res; }
    auto nonCharacterTrie() { static immutable res = asTrie(nonCharacterTrieEntries); return res; }

    //normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;
        static immutable res = asTrie(nfcQCTrieEntries);
        return res;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;
        static immutable res = asTrie(nfdQCTrieEntries);
        return res;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;
        static immutable res = asTrie(nfkcQCTrieEntries);
        return res;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;
        static immutable res = asTrie(nfkdQCTrieEntries);
        return res;
    }

    //grapheme breaking algorithm tables
    auto spacingMarkTrie()
    {
        import std.internal.unicode_grapheme : spacingMarkTrieEntries;
        static immutable res = asTrie(spacingMarkTrieEntries);
        return res;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
        static immutable res = asTrie(graphemeExtendTrieEntries);
        return res;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;
        static immutable res = asTrie(hangulLVTrieEntries);
        return res;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;
        static immutable res = asTrie(hangulLVTTrieEntries);
        return res;
    }

    auto prependTrie()
    {
        import std.internal.unicode_grapheme : prependTrieEntries;
        static immutable res = asTrie(prependTrieEntries);
        return res;
    }

    auto graphemeControlTrie()
    {
        import std.internal.unicode_grapheme : controlTrieEntries;
        static immutable res = asTrie(controlTrieEntries);
        return res;
    }

    auto xpictoTrie()
    {
        import std.internal.unicode_grapheme : Extended_PictographicTrieEntries;
        static immutable res = asTrie(Extended_PictographicTrieEntries);
        return res;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;
        static immutable res = asTrie(combiningClassTrieEntries);
        return res;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;
        static immutable res = asTrie(compatMappingTrieEntries);
        return res;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;
        static immutable res = asTrie(canonMappingTrieEntries);
        return res;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;
        static immutable res = asTrie(compositionJumpTrieEntries);
        return res;
    }

    //case conversion tables
    auto toUpperIndexTrie() { static immutable res = asTrie(toUpperIndexTrieEntries); return res; }
    auto toLowerIndexTrie() { static immutable res = asTrie(toLowerIndexTrieEntries); return res; }
    auto toTitleIndexTrie() { static immutable res = asTrie(toTitleIndexTrieEntries); return res; }
    //simple case conversion tables
    auto toUpperSimpleIndexTrie() { static immutable res = asTrie(toUpperSimpleIndexTrieEntries); return res; }
    auto toLowerSimpleIndexTrie() { static immutable res = asTrie(toLowerSimpleIndexTrieEntries); return res; }
    auto toTitleSimpleIndexTrie() { static immutable res = asTrie(toTitleSimpleIndexTrieEntries); return res; }

}

}// version (!std_uni_bootstrap)