std.uni source code

1 // Written in the D programming language.
2 
3 /++
4     $(P The `std.uni` module provides an implementation
5     of fundamental Unicode algorithms and data structures.
6     This doesn't include UTF encoding and decoding primitives,
7     see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8     for this functionality. )
9 
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(DIVC quickindex,
12 $(BOOKTABLE,
13 $(TR $(TH Category) $(TH Functions))
14 $(TR $(TD Decode) $(TD
15     $(LREF byCodePoint)
16     $(LREF byGrapheme)
17     $(LREF decodeGrapheme)
18     $(LREF graphemeStride)
19     $(LREF popGrapheme)
20 ))
21 $(TR $(TD Comparison) $(TD
22     $(LREF icmp)
23     $(LREF sicmp)
24 ))
25 $(TR $(TD Classification) $(TD
26     $(LREF isAlpha)
27     $(LREF isAlphaNum)
28     $(LREF isCodepointSet)
29     $(LREF isControl)
30     $(LREF isFormat)
31     $(LREF isGraphical)
32     $(LREF isIntegralPair)
33     $(LREF isMark)
34     $(LREF isNonCharacter)
35     $(LREF isNumber)
36     $(LREF isPrivateUse)
37     $(LREF isPunctuation)
38     $(LREF isSpace)
39     $(LREF isSurrogate)
40     $(LREF isSurrogateHi)
41     $(LREF isSurrogateLo)
42     $(LREF isSymbol)
43     $(LREF isWhite)
44 ))
45 $(TR $(TD Normalization) $(TD
46     $(LREF NFC)
47     $(LREF NFD)
48     $(LREF NFKD)
49     $(LREF NormalizationForm)
50     $(LREF normalize)
51 ))
52 $(TR $(TD Decompose) $(TD
53     $(LREF decompose)
54     $(LREF decomposeHangul)
55     $(LREF UnicodeDecomposition)
56 ))
57 $(TR $(TD Compose) $(TD
58     $(LREF compose)
59     $(LREF composeJamo)
60 ))
61 $(TR $(TD Sets) $(TD
62     $(LREF CodepointInterval)
63     $(LREF CodepointSet)
64     $(LREF InversionList)
65     $(LREF unicode)
66 ))
67 $(TR $(TD Trie) $(TD
68     $(LREF codepointSetTrie)
69     $(LREF CodepointSetTrie)
70     $(LREF codepointTrie)
71     $(LREF CodepointTrie)
72     $(LREF toTrie)
73     $(LREF toDelegate)
74 ))
75 $(TR $(TD Casing) $(TD
76     $(LREF asCapitalized)
77     $(LREF asLowerCase)
78     $(LREF asUpperCase)
79     $(LREF isLower)
80     $(LREF isUpper)
81     $(LREF toLower)
82     $(LREF toLowerInPlace)
83     $(LREF toUpper)
84     $(LREF toUpperInPlace)
85 ))
86 $(TR $(TD Utf8Matcher) $(TD
87     $(LREF isUtfMatcher)
88     $(LREF MatcherConcept)
89     $(LREF utfMatcher)
90 ))
91 $(TR $(TD Separators) $(TD
92     $(LREF lineSep)
93     $(LREF nelSep)
94     $(LREF paraSep)
95 ))
96 $(TR $(TD Building blocks) $(TD
97     $(LREF allowedIn)
98     $(LREF combiningClass)
99     $(LREF Grapheme)
100 ))
101 ))
102 
103     $(P All primitives listed operate on Unicode characters and
104         sets of characters. For functions which operate on ASCII characters
105         and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
106         For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
107         used throughout this module see the $(S_LINK Terminology, terminology) section
108         below.
109     )
110     $(P The focus of this module is the core needs of developing Unicode-aware
111         applications. To that effect it provides the following optimized primitives:
112     )
113     $(UL
114         $(LI Character classification by category and common properties:
115             $(LREF isAlpha), $(LREF isWhite) and others.
116         )
117         $(LI
118             Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
119         )
120         $(LI
121             Converting text to any of the four normalization forms via $(LREF normalize).
122         )
123         $(LI
124             Decoding ($(LREF decodeGrapheme))  and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
125             by user-perceived characters, that is by $(LREF Grapheme) clusters.
126         )
127         $(LI
128             Decomposing and composing of individual character(s) according to canonical
129             or compatibility rules, see $(LREF compose) and $(LREF decompose),
130             including the specific version for Hangul syllables $(LREF composeJamo)
131             and $(LREF decomposeHangul).
132         )
133     )
134     $(P It's recognized that an application may need further enhancements
135         and extensions, such as less commonly known algorithms,
136         or tailoring existing ones for region specific needs. To help users
137         with building any extra functionality beyond the core primitives,
138         the module provides:
139     )
140     $(UL
141         $(LI
142             $(LREF CodepointSet), a type for easy manipulation of sets of characters.
143             Besides the typical set algebra it provides an unusual feature:
144             a D source code generator for detection of $(CODEPOINTS) in this set.
145             This is a boon for meta-programming parser frameworks,
146             and is used internally to power classification in small
147             sets like $(LREF isWhite).
148         )
149         $(LI
150             A way to construct optimal packed multi-stage tables also known as a
151             special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
152             The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
153             construct custom tries that map dchar to value.
154             The end result is a fast and predictable $(BIGOH 1) lookup that powers
155             functions like $(LREF isAlpha) and $(LREF combiningClass),
156             but for user-defined data sets.
157         )
158         $(LI
159             A useful technique for Unicode-aware parsers that perform
160             character classification of encoded $(CODEPOINTS)
161             is to avoid unnecessary decoding at all costs.
162             $(LREF utfMatcher) provides an improvement over the usual workflow
163             of decode-classify-process, combining the decoding and classification
164             steps. By extracting necessary bits directly from encoded
165             $(S_LINK Code unit, code units) matchers achieve
166             significant performance improvements. See $(LREF MatcherConcept) for
167             the common interface of UTF matchers.
168         )
169         $(LI
170             Generally useful building blocks for customized normalization:
171             $(LREF combiningClass) for querying combining class
172             and $(LREF allowedIn) for testing the Quick_Check
173             property of a given normalization form.
174         )
175         $(LI
176             Access to a large selection of commonly used sets of $(CODEPOINTS).
177             $(S_LINK Unicode properties, Supported sets) include Script,
178             Block and General Category. The exact contents of a set can be
179             observed in the CLDR utility, on the
180             $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
181             of the Unicode website.
182             See $(LREF unicode) for easy and (optionally) compile-time checked set
183             queries.
184         )
185     )
186     $(SECTION Synopsis)
187     ---
188     import std.uni;
189     void main()
190     {
191         // initialize code point sets using script/block or property name
192         // now 'set' contains code points from both scripts.
193         auto set = unicode("Cyrillic") | unicode("Armenian");
194         // same thing but simpler and checked at compile-time
195         auto ascii = unicode.ASCII;
196         auto currency = unicode.Currency_Symbol;
197 
198         // easy set ops
199         auto a = set & ascii;
200         assert(a.empty); // as it has no intersection with ascii
201         a = set | ascii;
202         auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
203 
204         // some properties of code point sets
205         assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
206         // testing presence of a code point in a set
207         // is just fine, it is O(logN)
208         assert(!b['$']);
209         assert(!b['\u058F']); // Armenian dram sign
210         assert(b['¥']);
211 
212         // building fast lookup tables, these guarantee O(1) complexity
213         // 1-level Trie lookup table essentially a huge bit-set ~262Kb
214         auto oneTrie = toTrie!1(b);
215         // 2-level far more compact but typically slightly slower
216         auto twoTrie = toTrie!2(b);
217         // 3-level even smaller, and a bit slower yet
218         auto threeTrie = toTrie!3(b);
219         assert(oneTrie['£']);
220         assert(twoTrie['£']);
221         assert(threeTrie['£']);
222 
223         // build the trie with the most sensible trie level
224         // and bind it as a functor
225         auto cyrillicOrArmenian = toDelegate(set);
226         auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
227         assert(balance == "ընկեր!");
228         // compatible with bool delegate(dchar)
229         bool delegate(dchar) bindIt = cyrillicOrArmenian;
230 
231         // Normalization
232         string s = "Plain ascii (and not only), is always normalized!";
233         assert(s is normalize(s));// is the same string
234 
235         string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis (decomposed form)
236         auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
237         assert(nS == "Äffin");
238         assert(nS != nonS);
239         string composed = "Äffin";
240 
241         assert(normalize!NFD(composed) == "A\u0308ffin");
242         // to NFKD, compatibility decomposition useful for fuzzy matching/searching
243         assert(normalize!NFKD("2¹⁰") == "210");
244     }
245     ---
246     $(SECTION Terminology)
247     $(P The following is a list of important Unicode notions
248     and definitions. Any conventions used specifically in this
249     module alone are marked as such. The descriptions are based on the formal
250     definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
251     chapter three of The Unicode Standard Core Specification.)
252     )
253     $(P $(DEF Abstract character) A unit of information used for the organization,
254         control, or representation of textual data.
255         Note that:
256         $(UL
257             $(LI When representing data, the nature of that data
258                 is generally symbolic as opposed to some other
259                 kind of data (for example, visual).
260             )
261              $(LI An abstract character has no concrete form
262                 and should not be confused with a $(S_LINK Glyph, glyph).
263             )
264             $(LI An abstract character does not necessarily
265                 correspond to what a user thinks of as a “character”
266                 and should not be confused with a $(LREF Grapheme).
267             )
268             $(LI The abstract characters encoded (see Encoded character)
269                 are known as Unicode abstract characters.
270             )
271             $(LI Abstract characters not directly
272                 encoded by the Unicode Standard can often be
273                 represented by the use of combining character sequences.
274             )
275         )
276     )
277     $(P $(DEF Canonical decomposition)
278         The decomposition of a character or character sequence
279         that results from recursively applying the canonical
280         mappings found in the Unicode Character Database
281         and those described in Conjoining Jamo Behavior
282         (section 12 of
283         $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
284     )
285     $(P $(DEF Canonical composition)
286         The precise definition of the Canonical composition
287         is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
288         Unicode Conformance) section 11.
289         Informally it's the process that does the reverse of the canonical
290         decomposition with the addition of certain rules
291         that e.g. prevent legacy characters from appearing in the composed result.
292     )
293     $(P $(DEF Canonical equivalent)
294         Two character sequences are said to be canonical equivalents if
295         their full canonical decompositions are identical.
296     )
297     $(P $(DEF Character) Typically differs by context.
298         For the purpose of this documentation the term $(I character)
299         implies $(I encoded character), that is, a code point having
300         an assigned abstract character (a symbolic meaning).
301     )
302     $(P $(DEF Code point) Any value in the Unicode codespace;
303         that is, the range of integers from 0 to 10FFFF (hex).
304         Not all code points are assigned to encoded characters.
305     )
306     $(P $(DEF Code unit) The minimal bit combination that can represent
307         a unit of encoded text for processing or interchange.
308         Depending on the encoding this could be:
309         8-bit code units in the UTF-8 (`char`),
310         16-bit code units in the UTF-16 (`wchar`),
311         and 32-bit code units in the UTF-32 (`dchar`).
312         $(I Note that in UTF-32, a code unit is a code point
313         and is represented by the D `dchar` type.)
314     )
315     $(P $(DEF Combining character) A character with the General Category
316         of Combining Mark(M).
317         $(UL
318             $(LI All characters with non-zero canonical combining class
319             are combining characters, but the reverse is not the case:
320             there are combining characters with a zero combining class.
321             )
322             $(LI These characters are not normally used in isolation
323             unless they are being described. They include such characters
324             as accents, diacritics, Hebrew points, Arabic vowel signs,
325             and Indic matras.
326             )
327         )
328     )
329     $(P $(DEF Combining class)
330         A numerical value used by the Unicode Canonical Ordering Algorithm
331         to determine which sequences of combining marks are to be
332         considered canonically equivalent and  which are not.
333     )
334     $(P $(DEF Compatibility decomposition)
335         The decomposition of a character or character sequence that results
336         from recursively applying both the compatibility mappings and
337         the canonical mappings found in the Unicode Character Database, and those
338         described in Conjoining Jamo Behavior, until no characters
339         can be further decomposed.
340     )
341     $(P $(DEF Compatibility equivalent)
342         Two character sequences are said to be compatibility
343         equivalents if their full compatibility decompositions are identical.
344     )
345     $(P $(DEF Encoded character) An association (or mapping)
346         between an abstract character and a code point.
347     )
348     $(P $(DEF Glyph) The actual, concrete image of a glyph representation
349         having been rasterized or otherwise imaged onto some display surface.
350     )
351     $(P $(DEF Grapheme base) A character with the property
352         Grapheme_Base, or any standard Korean syllable block.
353     )
354     $(P $(DEF Grapheme cluster) Defined as the text between
355         grapheme boundaries  as specified by Unicode Standard Annex #29,
356         $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
357         Important general properties of a grapheme:
358         $(UL
359             $(LI The grapheme cluster represents a horizontally segmentable
360             unit of text, consisting of some grapheme base (which may
361             consist of a Korean syllable) together with any number of
362             nonspacing marks applied to it.
363             )
364             $(LI  A grapheme cluster typically starts with a grapheme base
365             and then extends across any subsequent sequence of nonspacing marks.
366             A grapheme cluster is most directly relevant to text rendering and
367             processes such as cursor placement and text selection in editing,
368             but may also be relevant to comparison and searching.
369             )
370             $(LI For many processes, a grapheme cluster behaves as if it was a
371             single character with the same properties as its grapheme base.
372             Effectively, nonspacing marks apply $(I graphically) to the base,
373             but do not change its properties.
374             )
375         )
376         $(P This module defines a number of primitives that work with graphemes:
377         $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
378         All of them are using $(I extended grapheme) boundaries
379         as defined in the aforementioned standard annex.
380         )
381     )
382     $(P $(DEF Nonspacing mark) A combining character with the
383         General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
384     )
385     $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
386     )
387     $(SECTION Normalization)
388     $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
389         or $(S_LINK Compatibility equivalent, compatibility equivalent)
390         characters in the Unicode Standard make it necessary to have a full, formal
391         definition of equivalence for Unicode strings.
392         String equivalence is determined by a process called normalization,
393         whereby strings are converted into forms which are compared
394         directly for identity. This is the primary goal of the normalization process,
395         see the function $(LREF normalize) to convert into any of
396         the four defined forms.
397     )
398     $(P A very important attribute of the Unicode Normalization Forms
399         is that they must remain stable between versions of the Unicode Standard.
400         A Unicode string normalized to a particular Unicode Normalization Form
401         in one version of the standard is guaranteed to remain in that Normalization
402         Form for implementations of future versions of the standard.
403     )
404     $(P The Unicode Standard specifies four normalization forms.
405         Informally, two of these forms are defined by maximal decomposition
406         of equivalent sequences, and two of these forms are defined
407         by maximal $(I composition) of equivalent sequences.
408             $(UL
409             $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
410                 canonical decomposition) of a character sequence.)
411             $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
412                 compatibility decomposition) of a character sequence.)
413             $(LI Normalization Form C (NFC): The canonical composition of the
414                 $(S_LINK Canonical decomposition, canonical decomposition)
415                 of a coded character sequence.)
416             $(LI Normalization Form KC (NFKC): The canonical composition
417             of the $(S_LINK Compatibility decomposition,
418                 compatibility decomposition) of a character sequence.)
419             )
420     )
421     $(P The choice of the normalization form depends on the particular use case.
422         NFC is the best form for general text, since it's more compatible with
423         strings converted from legacy encodings. NFKC is the preferred form for
424         identifiers, especially where there are security concerns. NFD and NFKD
425         are the most useful for internal processing.
426     )
427     $(SECTION Construction of lookup tables)
428     $(P The Unicode standard describes a set of algorithms that
429         depend on having the ability to quickly look up various properties
430         of a code point. Given the codespace of about 1 million $(CODEPOINTS),
431         it is not a trivial task to provide a space-efficient solution for
432         the multitude of properties.
433     )
434     $(P Common approaches such as hash-tables or binary search over
435         sorted code point intervals (as in $(LREF InversionList)) are insufficient.
436         Hash-tables have enormous memory footprint and binary search
437         over intervals is not fast enough for some heavy-duty algorithms.
438     )
439     $(P The recommended solution (see Unicode Implementation Guidelines)
440         is using multi-stage tables that are an implementation of the
441         $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
442         keys and a fixed number of stages. For the remainder of the section
443         this will be called a fixed trie. The following describes a particular
444         implementation that is aimed for the speed of access at the expense
445         of ideal size savings.
446     )
447     $(P Taking a 2-level Trie as an example the principle of operation is as follows.
448         Split the number of bits in a key (code point, 21 bits) into 2 components
449         (e.g. 13 and 8).  The first is the number of bits in the index of the trie
450          and the other is number of bits in each page of the trie.
451         The layout of the trie is then an array of size 2^^bits-of-index followed
452         by an array of memory chunks of size 2^^bits-of-page/bits-per-element.
453     )
454     $(P The number of pages is variable (but not less than 1)
455         unlike the number of entries in the index. The slots of the index
456         all have to contain a number of a page that is present. The lookup is then
457         just a couple of operations - slice the upper bits,
458         lookup an index for these, take a page at this index and use
459         the lower bits as an offset within this page.
460 
461         Assuming that pages are laid out consecutively
462         in one array at `pages`, the pseudo-code is:
463     )
464     ---
465     auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
466     pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
467     ---
468     $(P Where if `elemsPerPage` is a power of 2 the whole process is
469         a handful of simple instructions and 2 array reads. Subsequent levels
470         of the trie are introduced by recursing on this notion - the index array
471         is treated as values. The number of bits in index is then again
472         split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
473     )
474 
475     $(P For completeness a level 1 trie is simply an array.
476         The current implementation takes advantage of bit-packing values
477         when the range is known to be limited in advance (such as `bool`).
478         See also $(LREF BitPacked) for enforcing it manually.
479         The major size advantage however comes from the fact
480         that multiple $(B identical pages on every level are merged) by construction.
481     )
482     $(P The process of constructing a trie is more involved and is hidden from
483         the user in a form of the convenience functions $(LREF codepointTrie),
484         $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
485         In general a set or built-in AA with `dchar` type
486         can be turned into a trie. The trie object in this module
487         is read-only (immutable); it's effectively frozen after construction.
488     )
489     $(SECTION Unicode properties)
490     $(P This is a full list of Unicode properties accessible through $(LREF unicode)
491         with specific helpers per category nested within. Consult the
492         $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
493         when in doubt about the contents of a particular set.
494     )
495     $(P General category sets listed below are only accessible with the
496         $(LREF unicode) shorthand accessor.)
497         $(BOOKTABLE $(B General category ),
498              $(TR $(TH Abb.) $(TH Long form)
499                 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
500             $(TR $(TD L) $(TD Letter)
501                 $(TD Cn) $(TD Unassigned)  $(TD Po) $(TD Other_Punctuation))
502             $(TR $(TD Ll) $(TD Lowercase_Letter)
503                 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
504             $(TR $(TD Lm) $(TD Modifier_Letter)
505                 $(TD Cs) $(TD Surrogate)   $(TD S) $(TD Symbol))
506             $(TR $(TD Lo) $(TD Other_Letter)
507                 $(TD N) $(TD Number)  $(TD Sc) $(TD Currency_Symbol))
508             $(TR $(TD Lt) $(TD Titlecase_Letter)
509               $(TD Nd) $(TD Decimal_Number)  $(TD Sk) $(TD Modifier_Symbol))
510             $(TR $(TD Lu) $(TD Uppercase_Letter)
511               $(TD Nl) $(TD Letter_Number)   $(TD Sm) $(TD Math_Symbol))
512             $(TR $(TD M) $(TD Mark)
513               $(TD No) $(TD Other_Number)    $(TD So) $(TD Other_Symbol))
514             $(TR $(TD Mc) $(TD Spacing_Mark)
515               $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
516             $(TR $(TD Me) $(TD Enclosing_Mark)
517               $(TD Pc) $(TD Connector_Punctuation)   $(TD Zl) $(TD Line_Separator))
518             $(TR $(TD Mn) $(TD Nonspacing_Mark)
519               $(TD Pd) $(TD Dash_Punctuation)    $(TD Zp) $(TD Paragraph_Separator))
520             $(TR $(TD C) $(TD Other)
521               $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
522             $(TR $(TD Cc) $(TD Control) $(TD Pf)
523               $(TD Final_Punctuation)   $(TD -) $(TD Any))
524             $(TR $(TD Cf) $(TD Format)
525               $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
526     )
527     $(P Sets for other commonly useful properties that are
528         accessible with $(LREF unicode):)
529         $(BOOKTABLE $(B Common binary properties),
530             $(TR $(TH Name) $(TH Name) $(TH Name))
531             $(TR $(TD Alphabetic)  $(TD Ideographic) $(TD Other_Uppercase))
532             $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
533             $(TR $(TD Bidi_Control)    $(TD ID_Start)    $(TD Pattern_White_Space))
534             $(TR $(TD Cased)   $(TD IDS_Trinary_Operator)    $(TD Quotation_Mark))
535             $(TR $(TD Case_Ignorable)  $(TD Join_Control)    $(TD Radical))
536             $(TR $(TD Dash)    $(TD Logical_Order_Exception) $(TD Soft_Dotted))
537             $(TR $(TD Default_Ignorable_Code_Point)    $(TD Lowercase)   $(TD STerm))
538             $(TR $(TD Deprecated)  $(TD Math)    $(TD Terminal_Punctuation))
539             $(TR $(TD Diacritic)   $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
540             $(TR $(TD Extender)    $(TD Other_Alphabetic)    $(TD Uppercase))
541             $(TR $(TD Grapheme_Base)   $(TD Other_Default_Ignorable_Code_Point)  $(TD Variation_Selector))
542             $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend)   $(TD White_Space))
543             $(TR $(TD Grapheme_Link)   $(TD Other_ID_Continue)   $(TD XID_Continue))
544             $(TR $(TD Hex_Digit)   $(TD Other_ID_Start)  $(TD XID_Start))
545             $(TR $(TD Hyphen)  $(TD Other_Lowercase) )
546             $(TR $(TD ID_Continue) $(TD Other_Math)  )
547     )
548     $(P Below is the table with block names accepted by $(LREF unicode.block).
549         Note that the shorthand version $(LREF unicode) requires "In"
550         to be prepended to the names of blocks so as to disambiguate
551         scripts and blocks.
552     )
553     $(BOOKTABLE $(B Blocks),
554         $(TR $(TD Aegean Numbers)    $(TD Ethiopic Extended) $(TD Mongolian))
555         $(TR $(TD Alchemical Symbols)    $(TD Ethiopic Extended-A)   $(TD Musical Symbols))
556         $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement)   $(TD Myanmar))
557         $(TR $(TD Ancient Greek Musical Notation)    $(TD General Punctuation)   $(TD Myanmar Extended-A))
558         $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes)  $(TD New Tai Lue))
559         $(TR $(TD Ancient Symbols)   $(TD Georgian)  $(TD NKo))
560         $(TR $(TD Arabic)    $(TD Georgian Supplement)   $(TD Number Forms))
561         $(TR $(TD Arabic Extended-A) $(TD Glagolitic)    $(TD Ogham))
562         $(TR $(TD Arabic Mathematical Alphabetic Symbols)    $(TD Gothic)    $(TD Ol Chiki))
563         $(TR $(TD Arabic Presentation Forms-A)   $(TD Greek and Coptic)  $(TD Old Italic))
564         $(TR $(TD Arabic Presentation Forms-B)   $(TD Greek Extended)    $(TD Old Persian))
565         $(TR $(TD Arabic Supplement) $(TD Gujarati)  $(TD Old South Arabian))
566         $(TR $(TD Armenian)  $(TD Gurmukhi)  $(TD Old Turkic))
567         $(TR $(TD Arrows)    $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
568         $(TR $(TD Avestan)   $(TD Hangul Compatibility Jamo) $(TD Oriya))
569         $(TR $(TD Balinese)  $(TD Hangul Jamo)   $(TD Osmanya))
570         $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A)    $(TD Phags-pa))
571         $(TR $(TD Bamum Supplement)  $(TD Hangul Jamo Extended-B)    $(TD Phaistos Disc))
572         $(TR $(TD Basic Latin)   $(TD Hangul Syllables)  $(TD Phoenician))
573         $(TR $(TD Batak) $(TD Hanunoo)   $(TD Phonetic Extensions))
574         $(TR $(TD Bengali)   $(TD Hebrew)    $(TD Phonetic Extensions Supplement))
575         $(TR $(TD Block Elements)    $(TD High Private Use Surrogates)   $(TD Playing Cards))
576         $(TR $(TD Bopomofo)  $(TD High Surrogates)   $(TD Private Use Area))
577         $(TR $(TD Bopomofo Extended) $(TD Hiragana)  $(TD Rejang))
578         $(TR $(TD Box Drawing)   $(TD Ideographic Description Characters)    $(TD Rumi Numeral Symbols))
579         $(TR $(TD Brahmi)    $(TD Imperial Aramaic)  $(TD Runic))
580         $(TR $(TD Braille Patterns)  $(TD Inscriptional Pahlavi) $(TD Samaritan))
581         $(TR $(TD Buginese)  $(TD Inscriptional Parthian)    $(TD Saurashtra))
582         $(TR $(TD Buhid) $(TD IPA Extensions)    $(TD Sharada))
583         $(TR $(TD Byzantine Musical Symbols) $(TD Javanese)  $(TD Shavian))
584         $(TR $(TD Carian)    $(TD Kaithi)    $(TD Sinhala))
585         $(TR $(TD Chakma)    $(TD Kana Supplement)   $(TD Small Form Variants))
586         $(TR $(TD Cham)  $(TD Kanbun)    $(TD Sora Sompeng))
587         $(TR $(TD Cherokee)  $(TD Kangxi Radicals)   $(TD Spacing Modifier Letters))
588         $(TR $(TD CJK Compatibility) $(TD Kannada)   $(TD Specials))
589         $(TR $(TD CJK Compatibility Forms)   $(TD Katakana)  $(TD Sundanese))
590         $(TR $(TD CJK Compatibility Ideographs)  $(TD Katakana Phonetic Extensions)  $(TD Sundanese Supplement))
591         $(TR $(TD CJK Compatibility Ideographs Supplement)   $(TD Kayah Li)  $(TD Superscripts and Subscripts))
592         $(TR $(TD CJK Radicals Supplement)   $(TD Kharoshthi)    $(TD Supplemental Arrows-A))
593         $(TR $(TD CJK Strokes)   $(TD Khmer) $(TD Supplemental Arrows-B))
594         $(TR $(TD CJK Symbols and Punctuation)   $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
595         $(TR $(TD CJK Unified Ideographs)    $(TD Lao)   $(TD Supplemental Punctuation))
596         $(TR $(TD CJK Unified Ideographs Extension A)    $(TD Latin-1 Supplement)    $(TD Supplementary Private Use Area-A))
597         $(TR $(TD CJK Unified Ideographs Extension B)    $(TD Latin Extended-A)  $(TD Supplementary Private Use Area-B))
598         $(TR $(TD CJK Unified Ideographs Extension C)    $(TD Latin Extended Additional) $(TD Syloti Nagri))
599         $(TR $(TD CJK Unified Ideographs Extension D)    $(TD Latin Extended-B)  $(TD Syriac))
600         $(TR $(TD Combining Diacritical Marks)   $(TD Latin Extended-C)  $(TD Tagalog))
601         $(TR $(TD Combining Diacritical Marks for Symbols)   $(TD Latin Extended-D)  $(TD Tagbanwa))
602         $(TR $(TD Combining Diacritical Marks Supplement)    $(TD Lepcha)    $(TD Tags))
603         $(TR $(TD Combining Half Marks)  $(TD Letterlike Symbols)    $(TD Tai Le))
604         $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
605         $(TR $(TD Control Pictures)  $(TD Linear B Ideograms)    $(TD Tai Viet))
606         $(TR $(TD Coptic)    $(TD Linear B Syllabary)    $(TD Tai Xuan Jing Symbols))
607         $(TR $(TD Counting Rod Numerals) $(TD Lisu)  $(TD Takri))
608         $(TR $(TD Cuneiform) $(TD Low Surrogates)    $(TD Tamil))
609         $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian)    $(TD Telugu))
610         $(TR $(TD Currency Symbols)  $(TD Lydian)    $(TD Thaana))
611         $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
612         $(TR $(TD Cyrillic)  $(TD Malayalam) $(TD Tibetan))
613         $(TR $(TD Cyrillic Extended-A)   $(TD Mandaic)   $(TD Tifinagh))
614         $(TR $(TD Cyrillic Extended-B)   $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
615         $(TR $(TD Cyrillic Supplement)   $(TD Mathematical Operators)    $(TD Ugaritic))
616         $(TR $(TD Deseret)   $(TD Meetei Mayek)  $(TD Unified Canadian Aboriginal Syllabics))
617         $(TR $(TD Devanagari)    $(TD Meetei Mayek Extensions)   $(TD Unified Canadian Aboriginal Syllabics Extended))
618         $(TR $(TD Devanagari Extended)   $(TD Meroitic Cursive)  $(TD Vai))
619         $(TR $(TD Dingbats)  $(TD Meroitic Hieroglyphs)  $(TD Variation Selectors))
620         $(TR $(TD Domino Tiles)  $(TD Miao)  $(TD Variation Selectors Supplement))
621         $(TR $(TD Egyptian Hieroglyphs)  $(TD Miscellaneous Mathematical Symbols-A)  $(TD Vedic Extensions))
622         $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B)  $(TD Vertical Forms))
623         $(TR $(TD Enclosed Alphanumerics)    $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
624         $(TR $(TD Enclosed Alphanumeric Supplement)  $(TD Miscellaneous Symbols and Arrows)  $(TD Yi Radicals))
625         $(TR $(TD Enclosed CJK Letters and Months)   $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
626         $(TR $(TD Enclosed Ideographic Supplement)   $(TD Miscellaneous Technical)   )
627         $(TR $(TD Ethiopic)  $(TD Modifier Tone Letters) )
628     )
629     $(P Below is the table with script names accepted by $(LREF unicode.script)
630         and by the shorthand version $(LREF unicode):)
631         $(BOOKTABLE $(B Scripts),
632             $(TR $(TD Arabic)  $(TD Hanunoo) $(TD Old_Italic))
633             $(TR $(TD Armenian)    $(TD Hebrew)  $(TD Old_Persian))
634             $(TR $(TD Avestan) $(TD Hiragana)    $(TD Old_South_Arabian))
635             $(TR $(TD Balinese)    $(TD Imperial_Aramaic)    $(TD Old_Turkic))
636             $(TR $(TD Bamum)   $(TD Inherited)   $(TD Oriya))
637             $(TR $(TD Batak)   $(TD Inscriptional_Pahlavi)   $(TD Osmanya))
638             $(TR $(TD Bengali) $(TD Inscriptional_Parthian)  $(TD Phags_Pa))
639             $(TR $(TD Bopomofo)    $(TD Javanese)    $(TD Phoenician))
640             $(TR $(TD Brahmi)  $(TD Kaithi)  $(TD Rejang))
641             $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
642             $(TR $(TD Buginese)    $(TD Katakana)    $(TD Samaritan))
643             $(TR $(TD Buhid)   $(TD Kayah_Li)    $(TD Saurashtra))
644             $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi)  $(TD Sharada))
645             $(TR $(TD Carian)  $(TD Khmer)   $(TD Shavian))
646             $(TR $(TD Chakma)  $(TD Lao) $(TD Sinhala))
647             $(TR $(TD Cham)    $(TD Latin)   $(TD Sora_Sompeng))
648             $(TR $(TD Cherokee)    $(TD Lepcha)  $(TD Sundanese))
649             $(TR $(TD Common)  $(TD Limbu)   $(TD Syloti_Nagri))
650             $(TR $(TD Coptic)  $(TD Linear_B)    $(TD Syriac))
651             $(TR $(TD Cuneiform)   $(TD Lisu)    $(TD Tagalog))
652             $(TR $(TD Cypriot) $(TD Lycian)  $(TD Tagbanwa))
653             $(TR $(TD Cyrillic)    $(TD Lydian)  $(TD Tai_Le))
654             $(TR $(TD Deseret) $(TD Malayalam)   $(TD Tai_Tham))
655             $(TR $(TD Devanagari)  $(TD Mandaic) $(TD Tai_Viet))
656             $(TR $(TD Egyptian_Hieroglyphs)    $(TD Meetei_Mayek)    $(TD Takri))
657             $(TR $(TD Ethiopic)    $(TD Meroitic_Cursive)    $(TD Tamil))
658             $(TR $(TD Georgian)    $(TD Meroitic_Hieroglyphs)    $(TD Telugu))
659             $(TR $(TD Glagolitic)  $(TD Miao)    $(TD Thaana))
660             $(TR $(TD Gothic)  $(TD Mongolian)   $(TD Thai))
661             $(TR $(TD Greek)   $(TD Myanmar) $(TD Tibetan))
662             $(TR $(TD Gujarati)    $(TD New_Tai_Lue) $(TD Tifinagh))
663             $(TR $(TD Gurmukhi)    $(TD Nko) $(TD Ugaritic))
664             $(TR $(TD Han) $(TD Ogham)   $(TD Vai))
665             $(TR $(TD Hangul)  $(TD Ol_Chiki)    $(TD Yi))
666     )
667     $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
668         $(BOOKTABLE $(B Hangul syllable type),
669             $(TR $(TH Abb.) $(TH Long form))
670             $(TR $(TD L)   $(TD Leading_Jamo))
671             $(TR $(TD LV)  $(TD LV_Syllable))
672             $(TR $(TD LVT) $(TD LVT_Syllable) )
673             $(TR $(TD T)   $(TD Trailing_Jamo))
674             $(TR $(TD V)   $(TD Vowel_Jamo))
675     )
676     References:
677         $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
678         $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
679         $(HTTP www.unicode.org, The Unicode Consortium),
680         $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
        $(HTTP www.unicode.org/uni2book/ch05.pdf,
            Unicode Implementation Guidelines),
        $(HTTP www.unicode.org/uni2book/ch03.pdf,
            Unicode Conformance)
686     Trademarks:
687         Unicode(tm) is a trademark of Unicode, Inc.
688 
689     Copyright: Copyright 2013 -
690     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
691     Authors:   Dmitry Olshansky
692     Source:    $(PHOBOSSRC std/uni/package.d)
693     Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
694 
695 Macros:
696 
697 SECTION = <h3><a id="$1">$0</a></h3>
698 DEF = <div><a id="$1"><i>$0</i></a></div>
699 S_LINK = <a href="#$1">$+</a>
700 CODEPOINT = $(S_LINK Code point, code point)
701 CODEPOINTS = $(S_LINK Code point, code points)
702 CHARACTER = $(S_LINK Character, character)
703 CHARACTERS = $(S_LINK Character, characters)
704 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
705 +/
706 module std.uni;
707 
708 import std.meta : AliasSeq;
709 import std.range.primitives : back, ElementEncodingType, ElementType, empty,
710     front, hasLength, hasSlicing, isForwardRange, isInputRange,
711     isRandomAccessRange, popFront, put, save;
712 import std.traits : isAutodecodableString, isConvertibleToString, isIntegral,
713     isSomeChar, isSomeString, Unqual, isDynamicArray;
714 // debug = std_uni;
715 
716 import std.internal.unicode_tables; // generated file
717 
718 debug(std_uni) import std.stdio; // writefln, writeln
719 
720 private:
721 
722 
// Copies `src` into `dest` element by element, starting at the highest index
// and walking down to index 0. Both slices must have the same length.
// Walking downwards makes the copy safe when `dest` overlaps `src` at a
// higher position in the same buffer.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
729 
// Copies `src` into `dest` element by element, starting at index 0 and
// walking up. Both slices must have the same length.
// Walking upwards makes the copy safe when `dest` overlaps `src` at a
// lower position in the same buffer.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
736 
// Compile-time flag telling whether the target is known to tolerate unaligned
// memory reads. Only the architectures listed below are treated as such;
// every other target conservatively gets `false`.
// TODO: update to reflect all major CPUs supporting unaligned reads
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry
746 
// Separator code points exposed as part of the public API; they are marked
// `public` explicitly because they sit inside the module's `private:` section.
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep  = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.
750 
// test the intro example
// (covers set construction and set algebra, trie lookup tables, binding a
// set as a predicate, and normalization)
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // 'A' followed by combining diaeresis (U+0308), i.e. not yet composed
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    // NFD splits the precomposed letter back into base + combining mark
    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
809 
// Highest code point value in the Unicode code space (U+10FFFF).
enum lastDchar = 0x10FFFF;
811 
// Narrowing/widening conversion to the integral type `T`.
// The value is range-checked by an assertion (active only in builds that keep
// asserts) and then converted with a plain cast.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(T.min <= from && from <= T.max);
    return cast(T) from;
}
818 
// Conversion to a bit-packed type `T`.
// Asserts that the value fits into `bitSizeOf!T` bits, then wraps it in `T`
// after casting to the underlying storage type.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    enum largest = 2^^bitSizeOf!T-1;
    assert(from <= largest);
    return T(cast(TypeOfBitPacked!T) from);
}
825 
// Identity overload: source and target types coincide, so the value is
// handed back untouched.
auto force(T, F)(F from)
if (is(T == F))
{
    auto same = from;
    return same;
}
831 
// Repeat `times` times the bit-pattern in `val`, assuming its length is `bits`.
//
// Params:
//   times = number of copies to produce (compile-time)
//   bits  = width in bits of the pattern held in the low bits of `val`
//   val   = the pattern itself; bits above `bits` are expected to be zero
// Returns: a word in which the pattern occupies `times` consecutive
//   `bits`-wide fields starting from the least significant bit.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        // a single bit replicated N times is simply N ones (or all zeros)
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
            // shift a size_t, not an int literal: `1 << times` is a 32-bit
            // shift and is out of range for times >= 32 on 64-bit targets
            return val ? (size_t(1) << times) - 1 : 0;
    }
    else static if (times % 2)
        // odd count: peel off one copy, replicate the remaining even count
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern width and halve the count
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
849 
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 give 3*i consecutive ones, i.e. 2^^(3*i) - 1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set every even bit below 2*i: the sum of 4^^a for a in [0, i)
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
862 
// multiple arrays squashed into one memory block
//
// Each of the `Types` gets its own packed array; all of them live back to back
// in a single `size_t[]` (`storage`). `offsets[i]` is the word index at which
// array `i` starts and `sz[i]` is its length in elements.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zero-initialized storage for the given per-array element counts.
    // Exactly one size per type must be supplied.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid out data (offsets, sizes and the raw words) without
    // copying the words; `data` is referenced, not duplicated.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware packed view over the n-th array.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Unchecked packed pointer to the start of the n-th array.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of elements in the n-th array.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes the n-th array to `new_size` elements, shifting every array
        // that follows it so that the contents of all other arrays (and the
        // retained prefix of this one) are preserved.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                // words to add; rounded up from the element delta, so the
                // array may end up with more words than strictly required
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length +=  delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // Release only the words that stop being needed. Using
                // spaceFor(removed element count) would round up and could
                // drop a word that still holds live elements.
                immutable delta = spaceFor!(bitSizeOf!(Types[n]))(sz[n])
                    - spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                // move all data past this array down by delta words,
                // forward direction (destination precedes source)
                static if (n != dim-1)
                {
                    if (delta != 0)
                    {
                        immutable tail = offsets[n+1];
                        copyForward(storage[tail .. $],
                            storage[tail - delta .. $ - delta]);

                        // adjust offsets last, they affect raw_slice
                        foreach (i; n+1 .. dim)
                            offsets[i] -= delta;
                    }
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of the n-th array's region, or of the whole storage when
    // no index is given.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and the raw words to `sink` as three
    // comma-separated bracketed lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
    if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first word of the n-th array.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
990 
// Exercises MultiArray resizing: each level is grown in turn and the contents
// of the other levels are re-checked to make sure they were relocated intact.
// The whole scenario is a lambda so it can be run both in CTFE and at run time.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // expects level k to hold 1, 2, ..., n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // expects level k to hold n, n-1, ..., 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // writes 1, 2, ..., n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // writes n, n-1, ..., 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed element types; level 0 is shrunk, level 1 is grown,
        // and both are refilled before being checked
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // compile-time evaluation
    auto rt = dg(); // run-time evaluation
}
1063 
@system unittest
{// more bitpacking tests
    import std.conv : text;

    // five levels with 3-, 4-, 3- and 6-bit packed elements plus a bool level
    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // bit-field extractors splitting a 16-bit key into per-level indices
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    // expects every element of level lvl to equal its own index
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // stores each index as the element value on level lvl
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // grow levels from last to first, starting from a default-initialized array
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    // writes to level 0 must not have disturbed the other levels
    check!3(m1);
    check!2(m1);
    check!1(m1);
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1128 
// Number of machine words (`size_t`) required to hold `new_len` elements of
// `_bits` bits each, where the per-element width is first widened the same
// way PackedArrayView does it.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum wordBits = 8*size_t.sizeof;
    enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView
    static if (bits <= wordBits)
    {
        // several (or exactly one) elements per word
        enum perWord = wordBits/bits;
        return (new_len + perWord - 1)/perWord; // rounded up
    }
    else
    {
        // one element spans a whole number of words
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
}
1144 
// True when `T` may be stored in a packed array: a BitPacked wrapper, a
// built-in integral type, `bool`, or a character type.
template isBitPackableType(T)
{
    enum isBitPackableType = isBitPacked!T
        || isIntegral!T || is(T == bool) || isSomeChar!T;
}
1150 
1151 //============================================================================
// Picks the PackedArrayViewImpl instantiation for `T`. The per-element width
// passed on is 1 for single-bit types and `nextPow2(bits - 1)` otherwise
// (presumably the smallest power of two not below `bits` - see std.math.algebraic).
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
1160 
//unsafe and fast access to a chunk of RAM as if it contains packed values
// Picks the PackedPtrImpl instantiation for `T`, using the same element-width
// computation as PackedArrayView (1 for single-bit types, otherwise
// `nextPow2(bits - 1)`).
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
1170 
// Unchecked indexed access to elements of `bits` bits each, packed into an
// array of machine words starting at `origin`. No bounds are tracked here.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    // Wraps a pointer to the first word; nothing is copied.
    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: select word n / factor, then shift and mask out field n % factor.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear field n % factor of word n / factor and OR in `val`.
    // For integral T the value must fit in `bits` bits.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Fast path: the element width matches a native integer type `U`, so the
    // words can be reinterpreted as a `U` array and indexed directly.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // Direct `U` access is used only on little-endian targets at run time;
        // CTFE and other targets go through the shift-and-mask path.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        // Same dispatch as opIndex, for writes.
        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // General path: always shift and mask.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - the low `bits` bits set
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord =  size_t.sizeof;
    size_t* origin;
}
1277 
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` elements starting at element index `ofs` relative to
// the underlying word pointer. All public indices are relative to the view.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // Creates a view of `items` elements beginning `offset` elements past `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns: true if every element in [s, e) is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // The range ends before the next word boundary: check it element by
        // element. Comparing pad_s (not s) matters - otherwise the loop below
        // that runs up to pad_s would inspect elements beyond e.
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // leading elements up to the first word boundary
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // trailing elements after the last whole word
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Reads element `idx` of the view.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Writes element `idx` of the view.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Fills [start, end) with `val`, writing whole words where possible.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for the view's offset
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= then end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over [from, to), indices relative to this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `limit` counts elements from `ofs`, so the bound is on `to` alone
        // (same convention as opIndex and opSliceAssign)
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    // Sub-view covering the whole view.
    auto opSlice(){ return opSlice(0, length); }

    // Element-wise equality; compares whole words when both views are
    // word-aligned and span a whole number of words.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    // Number of elements in the view.
    @property size_t length()const{ return limit; }

private:
    // round an element index up / down to a multiple of factor (a word boundary)
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1435 
1436 
// Random-access window [from, to) over any indexable container `T`.
// The container is referenced through a pointer, so element writes made via
// the window land in the original container.
private struct SliceOverIndexed(T)
{
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // Element `idx` of the window.
    auto opIndex(size_t idx)const
    in
    {
        assert(idx < length);
    }
    do
    {
        return (*arr)[from + idx];
    }

    // Element write, available only when the container supports it.
    static if (assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in
    {
        assert(idx < length);
    }
    do
    {
        (*arr)[from + idx] = val;
    }

    // Narrower window; `a` and `b` are relative to this window.
    auto opSlice(size_t a, size_t b)
    {
        immutable lo = from + a;
        immutable hi = from + b;
        return typeof(this)(lo, hi, arr);
    }

    // static if (assignableSlice)
    void opSliceAssign(T)(T val, size_t start, size_t end)
    {
        (*arr)[from + start .. from + end] = val;
    }

    // Window over the same range.
    auto opSlice()
    {
        return opSlice(0, length);
    }

    @property size_t length()const
    {
        return to - from;
    }

    alias opDollar = length;

    @property bool empty()const
    {
        return to == from;
    }

    @property auto front()const
    {
        return (*arr)[from];
    }

    static if (assignableIndex)
    @property void front(Item val)
    {
        (*arr)[from] = val;
    }

    @property auto back()const
    {
        return (*arr)[to - 1];
    }

    static if (assignableIndex)
    @property void back(Item val)
    {
        (*arr)[to - 1] = val;
    }

    @property auto save() inout
    {
        return this;
    }

    void popFront()
    {
        ++from;
    }

    void popBack()
    {
        --to;
    }

    // Equal when lengths match and all elements compare equal pairwise.
    bool opEquals(T)(auto ref T arr) const
    {
        if (length != arr.length)
            return false;
        foreach (i; 0 .. length)
        {
            if (this[i] != arr[i])
                return false;
        }
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}
1514 
1515 @safe pure nothrow @nogc unittest
1516 {
         // A SliceOverIndexed over a plain array must model the random-access range interface.
1517     static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
1518 }
1519 
1520 SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
1521 if (is(Unqual!T == T))
1522 {
         // Read-only window [a, b) over the container that `x` points to.
1523     return typeof(return)(a, b, x);
1524 }
1525 
1526 // BUG? inout is out of reach
1527 //...SliceOverIndexed.arr only parameters or stack based variables can be inout
1528 SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
1529 if (is(Unqual!T == T))
1530 {
         // Mutable window [a, b) over the container that `x` points to.
1531     return typeof(return)(a, b, x);
1532 }
1533 
1534 @system unittest
1535 {
         // Exercises the range primitives, indexing, slicing, comparison and slice
         // assignment of SliceOverIndexed over a mutable int[].
1536     int[] idxArray = [2, 3, 5, 8, 13];
1537     auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);
1538 
1539     assert(!sliced.empty);
1540     assert(sliced.front == 2);
1541     sliced.front = 1;
1542     assert(sliced.front == 1);
1543     assert(sliced.back == 13);
1544     sliced.popFront();
1545     assert(sliced.front == 3);
1546     assert(sliced.back == 13);
1547     sliced.back = 11;
1548     assert(sliced.back == 11);
1549     sliced.popBack();
1550 
1551     assert(sliced.front == 3);
1552     assert(sliced[$-1] == 8);
1553     sliced = sliced[];
1554     assert(sliced[0] == 3);
1555     assert(sliced.back == 8);
1556     sliced = sliced[1..$];
1557     assert(sliced.front == 5);
1558     sliced = sliced[0..$-1];
1559     assert(sliced[$-1] == 5);
1560 
1561     int[] other = [2, 5];
1562     assert(sliced[] == sliceOverIndexed(1, 2, &other));
1563     sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
1564     assert(idxArray[0 .. 2] == [-1, -1]);
         // An empty window over a null array must report empty. The original declared
         // `nullArr` but took the slice over `idxArray`, leaving the variable unused.
1565     uint[] nullArr = null;
1566     auto nullSlice = sliceOverIndexed(0, 0, &nullArr);
1567     assert(nullSlice.empty);
1568 }
1569 
1570 private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
1571 {
         // Builds a PackedArrayView!T from `ptr`, passing 0 and `items` as the remaining
         // constructor arguments; `inout` carries the pointer's qualifier over to the result.
1572     return inout(PackedArrayView!T)(ptr, 0, items);
1573 }
1574 
1575 
1576 //============================================================================
1577 // Partially unrolled binary search using Shar's method
1578 //============================================================================
1579 
1580 string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
1581 {
         // Generates D source, intended for `mixin`, containing a fall-through `switch`
         // that finishes a uniform binary search in at most bsr(size)+1 probes.
         // The mixin site must provide `range`, `needle`, `pred`, a mutable `idx` and the
         // current step `m` (0 or a power of two; steps above size/2 match no case).
         // Params:  size = power of two bounding the unrolled part (asserted below)
         // Returns: the generated code as a string
1582     import core.bitop : bsr;
1583     import std.array : replace;
1584     import std.conv : to;
1585     assert(isPow2OrZero(size));
         // `bsr(0)` is undefined, and `m` is 0 when the searched range has one element,
         // so the generated code selects `case 0` explicitly for that step.
1586     string code = `
1587     import core.bitop : bsr;
1588     auto power = m ? bsr(m)+1 : 0;
1589     switch (power){`;
1590     size_t i = bsr(size);
         // Emit one case per step size, largest first; each falls through to the next.
1591     foreach_reverse (val; 0 .. bsr(size))
1592     {
1593         auto v = 2^^val;
             // In the template below "m" becomes the literal step and "pow" the case label.
1594         code ~= `
1595         case pow:
1596             if (pred(range[idx+m], needle))
1597                 idx +=  m;
1598             goto case;
1599         `.replace("m", to!string(v))
1600         .replace("pow", to!string(i));
1601         i--;
1602     }
         // Final probe of the element at `idx` itself.
1603     code ~= `
1604         case 0:
1605             if (pred(range[idx], needle))
1606                 idx += 1;
1607             goto default;
1608         `;
1609     code ~= `
1610         default:
1611     }`;
1612     return code;
1613 }
1614 
1615 bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
1616 {
1617     // See also: std.math.isPowerOf2()
         // `sz & (sz-1)` clears the lowest set bit; nothing is left only for 0 and powers of two.
         immutable remaining = sz & (sz - 1);
1618     return !remaining;
1619 }
1620 
1621 size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
1622 if (is(T : ElementType!Range))
1623 {
         // Binary search with halving power-of-two steps over a range whose length is a
         // power of two. For a range partitioned by pred(e, needle) the result is the
         // number of leading elements for which the predicate holds.
1624     assert(isPow2OrZero(range.length));
1625     size_t pos = 0;
1626     for (size_t step = range.length/2; step != 0; step /= 2)
1627     {
1628         if (pred(range[pos+step], needle))
1629             pos += step;
1630     }
1631     // the probes above never test the element at `pos` itself
1632     return pred(range[pos], needle) ? pos + 1 : pos;
1633 }
1636 
1637 size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
1638 if (is(T : ElementType!Range))
1639 {
         // Same search as a plain halving-step loop, but once the step drops below `max`
         // the remaining probes come from the switch generated by genUnrolledSwitchSearch.
         // The local names `idx`, `m`, `pred`, `range` and `needle` are referenced by the
         // mixed-in code and must not be renamed.
1640     assert(isPow2OrZero(range.length));
1641     size_t idx = 0, m = range.length/2;
         // Steps of `max` and above are handled by the loop, smaller ones by the switch.
1642     enum max = 1 << 10;
1643     while (m >= max)
1644     {
1645         if (pred(range[idx+m], needle))
1646             idx += m;
1647         m /= 2;
1648     }
1649     mixin(genUnrolledSwitchSearch(max));
1650     return idx;
1651 }
1652 
1653 template sharMethod(alias uniLowerBound)
1654 {
         // Lifts `uniLowerBound`, which only handles power-of-two lengths, to ranges of
         // any length by searching one of two overlapping power-of-two windows.
         // Params:  _pred  = binary predicate (string or callable), "a<b" by default
         //          range  = random-access range, partitioned by pred(e, needle)
         //          needle = value to search for
         // Returns: 0 for an empty range, otherwise the index computed by `uniLowerBound`
         //          translated back into `range`.
1655     size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
1656     if (is(T : ElementType!Range))
1657     {
1658         import std.functional : binaryFun;
1659         import std.math.algebraic : nextPow2, truncPow2;
1660         alias pred = binaryFun!_pred;
1661         if (range.length == 0)
1662             return 0;
1663         if (isPow2OrZero(range.length))
1664             return uniLowerBound!pred(range, needle);
             // n: largest power of two not exceeding the length
1665         size_t n = truncPow2(range.length);
1666         if (pred(range[n-1], needle))
1667         {// search in another 2^^k area that fully covers the tail of range
1668             size_t k = nextPow2(range.length - n + 1);
                 // nextPow2 is strictly greater than its argument, so for lengths of the
                 // form 2^^j - 1 it yields 2*n > range.length and `$-k` would go out of
                 // bounds. The n-sized window always covers the tail: range.length - n < n.
                 if (k > range.length)
                     k = n;
1669             return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
1670         }
1671         else
1672             return uniLowerBound!pred(range[0 .. n], needle);
1673     }
1674 }
1675 
1676 alias sharLowerBound = sharMethod!uniformLowerBound;
1677 alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1678 
1679 @safe unittest
1680 {
         // Cross-checks both Shar-style searches against std.range's SortedRange.lowerBound.
1681     import std.array : array;
1682     import std.range : assumeSorted, iota;
1683 
         // Reference implementation: number of elements strictly below `needle`.
1684     auto stdLowerBound(T)(T[] range, T needle)
1685     {
1686         return assumeSorted(range).lowerBound(needle).length;
1687     }
1688     immutable MAX = 5*1173;
         // Multiples of 5 from 5 up to (but excluding) MAX.
1689     auto arr = array(iota(5, MAX, 5));
1690     assert(arr.length == MAX/5-1);
         // Probe every value from below the first element to past the last one.
1691     foreach (i; 0 .. MAX+5)
1692     {
1693         auto st = stdLowerBound(arr, i);
1694         assert(st == sharLowerBound(arr, i));
1695         assert(st == sharSwitchLowerBound(arr, i));
1696     }
         // The empty range must be handled as well.
1697     arr = [];
1698     auto st = stdLowerBound(arr, 33);
1699     assert(st == sharLowerBound(arr, 33));
1700     assert(st == sharSwitchLowerBound(arr, 33));
1701 }
1702 //============================================================================
1703 
1704 @safe
1705 {
1706 // hope to see similar stuff in public interface... once Allocators are out
1707 //@@@BUG moveFront and friends? dunno, for now it's POD-only
1708 
1709 @trusted size_t genericReplace(Policy=void, T, Range)
1710     (ref T dest, size_t from, size_t to, Range stuff)
1711 {
         // Replaces dest[from .. to] with the contents of `stuff`, growing or shrinking
         // `dest` as needed. With Policy == void the size is changed through `dest.length`,
         // otherwise through Policy.realloc.
         // Returns: from + stuff.length, the index just past the inserted data.
         // (copyBackwards / copyForward are helpers defined elsewhere in this module.)
1712     import std.algorithm.mutation : copy;
1713     size_t delta = to - from;
1714     size_t stuff_end = from+stuff.length;
1715     if (stuff.length > delta)
1716     {// replace increases length
1717         delta = stuff.length - delta;// now, new is > old  by delta
1718         static if (is(Policy == void))
1719             dest.length = dest.length+delta;//@@@BUG lame @property
1720         else
1721             dest = Policy.realloc(dest, dest.length+delta);
             // Move the old tail (everything from `to` on) up by delta to open the gap,
             // then write `stuff` into [from, stuff_end).
1722         copyBackwards(dest[to .. dest.length-delta],
1723             dest[to+delta .. dest.length]);
1724         copyForward(stuff, dest[from .. stuff_end]);
1725     }
1726     else if (stuff.length == delta)
1727     {
             // Same size: overwrite in place.
1728         copy(stuff, dest[from .. to]);
1729     }
1730     else
1731     {// replace decreases length by delta
1732         delta = delta - stuff.length;
1733         copy(stuff, dest[from .. stuff_end]);
             // Close the gap by moving the tail down, then trim `dest` by delta.
1734         copyForward(dest[to .. dest.length],
1735             dest[stuff_end .. dest.length-delta]);
1736         static if (is(Policy == void))
1737             dest.length = dest.length - delta;//@@@BUG lame @property
1738         else
1739             dest = Policy.realloc(dest, dest.length-delta);
1740     }
1741     return stuff_end;
1742 }
1743 
1744 
1745 // Simple storage manipulation policy: every operation uses built-in (GC) dynamic arrays
1746 @safe private struct GcPolicy
1747 {
1748     import std.traits : isDynamicArray;
1749 
         // Returns a fresh mutable copy of `arr`.
1750     static T[] dup(T)(const T[] arr)
1751     {
1752         return arr.dup;
1753     }
1754 
         // Allocates a new array of `size` elements.
1755     static T[] alloc(T)(size_t size)
1756     {
1757         return new T[size];
1758     }
1759 
         // Resizes `arr` to `sz` elements through `.length` and returns it.
1760     static T[] realloc(T)(T[] arr, size_t sz)
1761     {
1762         arr.length = sz;
1763         return arr;
1764     }
1765 
         // Replaces dest[from .. to] with `stuff` (replaceInPlace is defined elsewhere).
1766     static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
1767     {
1768         replaceInPlace(dest, from, to, stuff);
1769     }
1770 
         // Appends a single value, converted to T by `force!T` (defined elsewhere).
1771     static void append(T, V)(ref T[] arr, V value)
1772     if (!isInputRange!V)
1773     {
1774         arr ~= force!T(value);
1775     }
1776 
         // Appends every element of an input range at the end of `arr`.
1777     static void append(T, V)(ref T[] arr, V value)
1778     if (isInputRange!V)
1779     {
1780         insertInPlace(arr, arr.length, value);
1781     }
1782 
         // Drops the reference to a mutable array; debug builds first overwrite the
         // elements with a 0xdead_beef-derived pattern.
1783     static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
1784     if (isDynamicArray!T && is(Unqual!T == T))
1785     {
1786         debug
1787         {
1788             arr[] = cast(typeof(T.init[0]))(0xdead_beef);
1789         }
1790         arr = null;
1791     }
1792 
         // Qualified (e.g. const) arrays cannot be overwritten, so only the reference is cleared.
1793     static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
1794     if (isDynamicArray!T && !is(Unqual!T == T))
1795     {
1796         arr = null;
1797     }
1798 }
1799 
1800 // ditto: storage policy built on enforceMalloc / enforceRealloc / pureFree instead of GC arrays
1801 @safe struct ReallocPolicy
1802 {
1803     import std.range.primitives : hasLength;
1804 
         // Allocates a new block and copies `arr` into it.
1805     static T[] dup(T)(const T[] arr)
1806     {
1807         auto result = alloc!T(arr.length);
1808         result[] = arr[];
1809         return result;
1810     }
1811 
         // Allocates room for `size` elements; the contents are not initialised here.
1812     static T[] alloc(T)(size_t size) @trusted
1813     {
1814         import std.internal.memory : enforceMalloc;
1815 
1816         import core.checkedint : mulu;
             // Byte count with overflow detection; an overflow halts via assert(0).
1817         bool overflow;
1818         size_t nbytes = mulu(size, T.sizeof, overflow);
1819         if (overflow) assert(0);
1820 
1821         auto ptr = cast(T*) enforceMalloc(nbytes);
1822         return ptr[0 .. size];
1823     }
1824 
         // Resizes the block behind `arr` to `size` elements and returns the new slice.
         // A size of 0 frees the block and returns null.
1825     static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
1826     {
1827         import std.internal.memory : enforceRealloc;
1828         if (!size)
1829         {
1830             destroy(arr);
1831             return null;
1832         }
1833 
1834         import core.checkedint : mulu;
1835         bool overflow;
1836         size_t nbytes = mulu(size, T.sizeof, overflow);
1837         if (overflow) assert(0);
1838 
1839         auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
1840         return ptr[0 .. size];
1841     }
1842 
         // Replaces dest[from .. to] with `stuff`, resizing through this policy.
1843     static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
1844     {
1845         genericReplace!(ReallocPolicy)(dest, from, to, stuff);
1846     }
1847 
         // Appends one value: grows the block by one element and stores the value last.
1848     static void append(T, V)(ref T[] arr, V value)
1849     if (!isInputRange!V)
1850     {
             // guards the `arr.length+1` below against wrap-around
1851         if (arr.length == size_t.max) assert(0);
1852         arr = realloc(arr, arr.length+1);
1853         arr[$-1] = force!T(value);
1854     }
1855 
1856     pure @safe unittest
1857     {
1858         int[] arr;
1859         ReallocPolicy.append(arr, 3);
1860 
1861         import std.algorithm.comparison : equal;
1862         assert(equal(arr, [3]));
1863     }
1864 
         // Appends a whole range of known length: grows once, then copies into the new tail.
1865     static void append(T, V)(ref T[] arr, V value)
1866     if (isInputRange!V && hasLength!V)
1867     {
1868         import core.checkedint : addu;
1869         bool overflow;
1870         size_t nelems = addu(arr.length, value.length, overflow);
1871         if (overflow) assert(0);
1872 
1873         arr = realloc(arr, nelems);
1874 
1875         import std.algorithm.mutation : copy;
1876         copy(value, arr[$-value.length..$]);
1877     }
1878 
1879     pure @safe unittest
1880     {
1881         int[] arr;
1882         ReallocPolicy.append(arr, [1,2,3]);
1883 
1884         import std.algorithm.comparison : equal;
1885         assert(equal(arr, [1,2,3]));
1886     }
1887 
         // Frees the block behind `arr` (if any) and resets the slice to null.
1888     static void destroy(T)(scope ref T[] arr) @trusted
1889     {
1890         import core.memory : pureFree;
1891         if (arr.ptr)
1892             pureFree(arr.ptr);
1893         arr = null;
1894     }
1895 }
1896 
1897 //build hack
1898 alias _RealArray = CowArray!ReallocPolicy;
1899 
1900 pure @safe unittest
1901 {
         // Checks ReallocPolicy.replaceImpl for growing, shrinking and same-position replaces.
1902     import std.algorithm.comparison : equal;
1903 
1904     with(ReallocPolicy)
1905     {
             // Replaces orig[from .. to] with `toReplace`, compares against `result`
             // and frees `orig` on the way out. `file` and `line` are not used in the body.
1906         bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
1907                    string file = __FILE__, size_t line = __LINE__)
1908         {
1909             {
1910                 replaceImpl(orig, from, to, toReplace);
1911                 scope(exit) destroy(orig);
1912                 if (!equal(orig, result))
1913                     return false;
1914             }
1915             return true;
1916         }
             // Copies the arguments into memory owned by the policy (via `dup`).
1917         static T[] arr(T)(T[] args... )
1918         {
1919             return dup(args);
1920         }
1921 
1922         assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
1923         assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
1924         assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
1925         assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
1926         assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
1927     }
1928 }
1929 
1930 /**
1931     Tests if `T` is some kind of set of code points, that is an instance of
         $(LREF InversionList). Intended for template constraints.
1932 */
1933 public template isCodepointSet(T)
1934 {
1935     enum isCodepointSet = is(T == InversionList!(Args), Args...);
1936 }
1940 
1941 /**
1942     Tests if `T` is a pair of integers that implicitly convert to `V`.
1943     The following code must compile for any pair `T`:
1944     ---
1945     (T x){ V a = x[0]; V b = x[1];}
1946     ---
1947     The following must not compile:
1948      ---
1949     (T x){ V c = x[2];}
1950     ---
1951 */
1952 public template isIntegralPair(T, V=uint)
1953 {
         // indices 0 and 1 must be readable as V ...
1954     private enum hasFirstTwo = is(typeof((T x){ V a = x[0]; V b = x[1];}));
         // ... while index 2 must be rejected at compile time
1955     private enum hasThird = is(typeof((T x){ V c = x[2]; }));
         enum isIntegralPair = hasFirstTwo && !hasThird;
1956 }
1957 
1958 
1959 /**
1960     The recommended default type for set of $(CODEPOINTS).
1961     For details, see the current implementation: $(LREF InversionList).
1962 */
1963 public alias CodepointSet = InversionList!GcPolicy;
1964 
1965 
1966 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1967 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1968 // hence below doesn't seem to work
1969 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1970 
1971 /**
1972     The recommended type of $(REF Tuple, std,_typecons)
1973     to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
1974     Any interval type should pass $(LREF isIntegralPair) trait.
1975 */
1976 public struct CodepointInterval
1977 {
1978 pure:
         // Storage: [0] is the lower bound `a`, [1] the upper bound `b`.
         // `alias this` makes the struct indexable like a uint[2].
1979     uint[2] _tuple;
1980     alias _tuple this;
1981 
1982 @safe pure nothrow @nogc:
1983 
         // Stores `low` as the first and `high` as the second bound; no ordering check is made.
1984     this(uint low, uint high)
1985     {
1986         _tuple[0] = low;
1987         _tuple[1] = high;
1988     }
         // Equal to anything indexable whose elements 0 and 1 match both bounds.
1989     bool opEquals(T)(T val) const
1990     {
1991         return this[0] == val[0] && this[1] == val[1];
1992     }
         // Named accessors returning the bounds by reference.
1993     @property ref inout(uint) a() return inout { return _tuple[0]; }
1994     @property ref inout(uint) b() return inout { return _tuple[1]; }
1995 }
1996 
1997 /**
1998     $(P
1999     `InversionList` is a set of $(CODEPOINTS)
2000     represented as an array of open-right [a, b$(RPAREN)
2001     intervals (see $(LREF CodepointInterval) above).
2002     The name comes from the way the representation reads left to right.
2003     For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2004     plus a singular value 60 looks like this:
2005     )
2006     ---
2007     10, 50, 60, 61, 80, 90
2008     ---
2009     $(P
2010     The way to read this is: start with negative meaning that all numbers
2011     smaller than the next one are not present in this set (and positive -
2012     the contrary). Then switch positive/negative after each
2013     number passed from left to right.
2014     )
2015     $(P This way negative spans until 10, then positive until 50,
2016     then negative until 60, then positive until 61, and so on.
2017     As seen this provides a space-efficient storage of highly redundant data
2018     that comes in long runs. A description which Unicode $(CHARACTER)
2019     properties fit nicely. The technique itself could be seen as a variation
2020     on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2021     )
2022 
2023     $(P Sets are value types (just like `int` is) thus they
2024         are never aliased.
2025     )
2026         Example:
2027         ---
2028         auto a = CodepointSet('a', 'z'+1);
2029         auto b = CodepointSet('A', 'Z'+1);
2030         auto c = a;
2031         a = a | b;
2032         assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2033         assert(a != c);
2034         ---
2035     $(P See also $(LREF unicode) for simpler construction of sets
2036         from predefined ones.
2037     )
2038 
2039     $(P Memory usage is 8 bytes per each contiguous interval in a set.
2040     The value semantics are achieved by using the
2041     $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2042     and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2043     )
2044 
2045     Note:
2046     $(P It's not recommended to rely on the template parameters
2047     or the exact type of a current $(CODEPOINT) set in `std.uni`.
2048     The type and parameters may change when the standard
2049     allocators design is finalized.
2050     Use $(LREF isCodepointSet) with templates or just stick with the default
2051     alias $(LREF CodepointSet) throughout the whole code base.
2052     )
2053 */
2054 public struct InversionList(SP=GcPolicy)
2055 {
2056     import std.range : assumeSorted;
2057 
2058     /**
2059         Construct from another code point set of any type.
2060     */
2061     this(Set)(Set set) pure
2062     if (isCodepointSet!Set)
2063     {
             // Flatten the source intervals into the [a0, b0, a1, b1, ...] layout.
2064         uint[] arr;
2065         foreach (v; set.byInterval)
2066         {
                 // Grow through the storage policy, as the interval-range constructor does,
                 // instead of GC `~=`: the buffer is handed to CowArray!(SP) below and is
                 // presumably resized/freed by SP afterwards - TODO confirm against CowArray.
2067             SP.append(arr, v.a);
2068             SP.append(arr, v.b);
2069         }
             // The source is already a valid set, so no sanitize() pass is needed.
2070         data = CowArray!(SP).reuse(arr);
2071     }
2072 
2073     /**
2074         Construct a set from a forward range of code point intervals.
2075     */
2076     this(Range)(Range intervals) pure
2077     if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
2078     {
             // Flatten the intervals into [a0, b0, a1, b1, ...], allocating via the policy.
2079         uint[] arr;
2080         foreach (v; intervals)
2081         {
2082             SP.append(arr, v.a);
2083             SP.append(arr, v.b);
2084         }
2085         data = CowArray!(SP).reuse(arr);
2086         sanitize(); //enforce invariant: sort intervals etc.
2087     }
2088 
2089     //helper function that avoids sanity check to be CTFE-friendly
         // The caller is responsible for passing intervals that are already well-formed,
         // since no sanitize() pass runs here.
2090     private static fromIntervals(Range)(Range intervals) pure
2091     {
2092         import std.algorithm.iteration : map;
2093         import std.range : roundRobin;
             // Interleave the lower and upper bounds: a0, b0, a1, b1, ...
2094         auto flattened = roundRobin(intervals.save.map!"a[0]"(),
2095             intervals.save.map!"a[1]"());
2096         InversionList set;
2097         set.data = CowArray!(SP)(flattened);
2098         return set;
2099     }
2100     //ditto until sort is CTFE-able
         // Takes the bounds as a flat list a0, b0, a1, b1, ... and stores them as given;
         // only the contract below validates them.
2101     private static fromIntervals()(uint[] intervals...) pure
2102     in
2103     {
2104         import std.conv : text;
2105         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2106         for (uint i = 0; i < intervals.length; i += 2)
2107         {
2108             auto a = intervals[i], b = intervals[i+1];
                 // the check also rejects empty intervals (a == b), so report ">="
2109             assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
2110         }
2111     }
2112     do
2113     {
2114         InversionList set;
2115         set.data = CowArray!(SP)(intervals);
2116         return set;
2117     }
2118 
2119     /**
2120         Construct a set from plain values of code point intervals.
2121     */
2122     this()(uint[] intervals...)
2123     in
2124     {
2125         import std.conv : text;
2126         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2127         for (uint i = 0; i < intervals.length; i += 2)
2128         {
2129             auto a = intervals[i], b = intervals[i+1];
                 // the check also rejects empty intervals (a == b), so report ">="
2130             assert(a < b, text("illegal interval [a, b): ", a, " >= ", b));
2131         }
2132     }
2133     do
2134     {
             // Bounds arrive as a flat list a0, b0, a1, b1, ... in any order.
2135         data = CowArray!(SP)(intervals);
2136         sanitize(); //enforce invariant: sort intervals etc.
2137     }
2138 
2139     ///
2140     pure @safe unittest
2141     {
2142         import std.algorithm.comparison : equal;
2143 
             // Latin lowercase interval followed by the Cyrillic one
2144         auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
2145         foreach (v; 'a'..'z'+1)
2146             assert(set[v]);
2147         // Cyrillic lowercase interval
2148         foreach (v; 'а'..'я'+1)
2149             assert(set[v]);
2150         //specific order is not required, intervals may intersect
2151         auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
2152         //the same end result
2153         assert(set2.byInterval.equal(set.byInterval));
2154         // test constructor this(Range)(Range intervals)
2155         auto chessPiecesWhite = CodepointInterval(9812, 9818);
2156         auto chessPiecesBlack = CodepointInterval(9818, 9824);
2157         auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
2158         foreach (v; '♔'..'♟'+1)
2159             assert(set3[v]);
2160     }
2161 
2162     /**
2163         Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
2164     */
2165     @property auto byInterval() scope
2166     {
2167         // TODO: change this to data[] once the -dip1000 errors have been fixed
2168         // see e.g. https://github.com/dlang/phobos/pull/6638
             // Until then the bounds are first copied out with std.array.array.
2169         import std.array : array;
2170         return Intervals!(typeof(data.array))(data.array);
2171     }
2172 
2173     @safe unittest
2174     {
2175         import std.algorithm.comparison : equal;
2176         import std.typecons : tuple;
2177 
2178         auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);
2179 
             // intervals are open on the right, hence 'E' and 'e' as the upper bounds
2180         assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
2181     }
2182 
2183     package(std) @property const(CodepointInterval)[] intervals() const
2184     {
             // Materialises all intervals of this set into a newly built array.
2185         import std.array : array;
2186         return Intervals!(typeof(data[]))(data[]).array;
2187     }
2188 
2189     /**
2190         Tests the presence of code point `val` in this set.
2191     */
2192     bool opIndex(uint val) const
2193     {
2194         // the <= ensures that searching in  interval of [a, b) for 'a' you get .length == 1
2195         // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
             // An odd count of bounds that are <= val means val lies inside an interval.
2196         return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
2197     }
2198 
2199     ///
2200     pure @safe unittest
2201     {
             // membership test through the index operator
2202         auto gothic = unicode.Gothic;
2203         // Gothic letter ahsa
2204         assert(gothic['\U00010330']);
2205         // no ascii in Gothic obviously
2206         assert(!gothic['$']);
2207     }
2208 
2209 
2210     // Linear scan for `ch`. Useful only for small sets.
2211     // TODO:
2212     // used internally in std.regex
2213     // should be properly exposed in a public API ?
2214     package(std) auto scanFor()(dchar ch) const
2215     {
2216         // Walk the bounds until the first one above `ch`; its position is odd
2217         // exactly when `ch` falls inside an interval.
2218         size_t pos = 0;
2219         immutable total = data.length;
2220         while (pos < total)
2221         {
                 if (ch < data[pos])
                     return pos & 1;
                 ++pos;
             }
             return 0;
         }
2222 
2223     /// Number of $(CODEPOINTS) in this set
2224     @property size_t length()
2225     {
2226         // add up the width (b - a) of every open-right interval
2227         size_t total = 0;
2228         for (auto spans = byInterval; !spans.empty; spans.popFront())
2229         {
2230             auto span = spans.front;
2231             total += span.b - span.a;
2232         }
             return total;
         }
2233 
2234 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2235 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2236 //============================================================================
2237 public:
2238     /**
2239         $(P Sets support natural syntax for set algebra, namely: )
2240         $(BOOKTABLE ,
2241             $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2242             $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2243             $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2244             $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2245             $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2246         )
2247     */
2248     This opBinary(string op, U)(U rhs)
2249     if (isCodepointSet!U || is(U:dchar))
2250     {
             // Every branch delegates to the matching `op=` on a copy, so `this` is unchanged.
2251         static if (op == "&" || op == "|" || op == "~")
2252         {// symmetric ops thus can swap arguments to reuse r-value
2253             static if (is(U:dchar))
2254             {
                     // single code point on the right: work on a copy of this set
2255                 auto tmp = this;
2256                 mixin("tmp "~op~"= rhs; ");
2257                 return tmp;
2258             }
2259             else
2260             {
2261                 static if (is(Unqual!U == U))
2262                 {
2263                     // try hard to reuse r-value
                         // `rhs` was passed by value and is unqualified, so it is updated in place
2264                     mixin("rhs "~op~"= this;");
2265                     return rhs;
2266                 }
2267                 else
2268                 {
                         // qualified rhs cannot be modified: fall back to a copy of this set
2269                     auto tmp = this;
2270                     mixin("tmp "~op~"= rhs;");
2271                     return tmp;
2272                 }
2273             }
2274         }
2275         else static if (op == "-") // anti-symmetric
2276         {
                 // operand order matters here, so always start from a copy of the left side
2277             auto tmp = this;
2278             tmp -= rhs;
2279             return tmp;
2280         }
2281         else
2282             static assert(0, "no operator "~op~" defined for Set");
2283     }
2284 
2285     ///
2286     pure @safe unittest
2287     {
2288         import std.algorithm.comparison : equal;
2289         import std.range : iota;
2290 
             // predefined sets used as operands below
2291         auto lower = unicode.LowerCase;
2292         auto upper = unicode.UpperCase;
2293         auto ascii = unicode.ASCII;
2294 
2295         assert((lower & upper).empty); // no intersection
2296         auto lowerASCII = lower & ascii;
2297         assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
2298         // throw away all of the lowercase ASCII
2299         assert((ascii - lower).length == 128 - 26);
2300 
             // symmetric difference: in exactly one of the two sets
2301         auto onlyOneOf = lower ~ ascii;
2302         assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
2303         assert(onlyOneOf['$']); // ASCII and not lowercase
2304         assert(!onlyOneOf['a']); // ASCII and lowercase
2305         assert(onlyOneOf['я']); // not ASCII but lowercase
2306 
2307         // throw away all cased letters from ASCII
2308         auto noLetters = ascii - (lower | upper);
2309         assert(noLetters.length == 128 - 26*2);
2310     }
2311 
2312     /// The 'op=' versions of the above overloaded operators.
2313     ref This opOpAssign(string op, U)(U rhs)
2314     if (isCodepointSet!U || is(U:dchar))
2315     {
             // Each branch modifies this set in place and returns it by reference.
2316         static if (op == "|")    // union
2317         {
2318             static if (is(U:dchar))
2319             {
                     // a single code point is the one-element interval [rhs, rhs+1)
2320                 this.addInterval(rhs, rhs+1);
2321                 return this;
2322             }
2323             else
2324                 return this.add(rhs);
2325         }
2326         else static if (op == "&")   // intersection
2327                 return this.intersect(rhs);// overloaded
2328         else static if (op == "-")   // set difference
2329                 return this.sub(rhs);// overloaded
2330         else static if (op == "~")   // symmetric set difference
2331         {
                 // (this | rhs) - (this & rhs); the intersection must be taken before
                 // `this` is modified by the union below
2332             auto copy = this & rhs;
2333             this |= rhs;
2334             this -= copy;
2335             return this;
2336         }
2337         else
2338             static assert(0, "no operator "~op~" defined for Set");
2339     }
2340 
2341     /**
2342         Membership test written as `ch in set`; gives the same answer as
2343         indexing the set with `ch` (see $(LREF opIndex)).
2344     */
2345     bool opBinaryRight(string op: "in", U)(U ch) const
2346     if (is(U : dchar))
2347     {
2348         return opIndex(ch);
2349     }
2350 
2351     ///
2352     pure @safe unittest
2353     {
             // `in` accepts a code point on the left and a set on the right
2354         assert('я' in unicode.Cyrillic);
2355         assert(!('z' in unicode.Cyrillic));
2356     }
2357 
2358 
2359 
2360     /**
2361      * Obtains a set that is the inversion of this set.
2362      *
2363      * See_Also: $(LREF inverted)
2364      */
2365     auto opUnary(string op: "!")()
2366     {
             // `!set` is shorthand for the `inverted` property
2367         return this.inverted;
2368     }
2369 
2370     /**
2371         A range that spans each $(CODEPOINT) in this set.
2372     */
2373     @property auto byCodepoint()
2374     {
             // Input range that walks the intervals of a set and yields every code point
             // inside them, in the order the intervals are stored.
2375         static struct CodepointRange
2376         {
                 // Starts at the lower bound of the first interval, if there is one.
2377             this(This set)
2378             {
2379                 r = set.byInterval;
2380                 if (!r.empty)
2381                     cur = r.front.a;
2382             }
2383 
2384             @property dchar front() const
2385             {
2386                 return cast(dchar) cur;
2387             }
2388 
                 // Empty once all intervals have been consumed.
2389             @property bool empty() const
2390             {
2391                 return r.empty;
2392             }
2393 
                 // Steps to the next code point; when the end of the current interval is
                 // reached, moves on to the lower bound of the next interval.
                 // Must not be called on an empty range (it reads r.front).
2394             void popFront()
2395             {
2396                 cur++;
2397                 while (cur >= r.front.b)
2398                 {
2399                     r.popFront();
2400                     if (r.empty)
2401                         break;
2402                     cur = r.front.a;
2403                 }
2404             }
2405         private:
                 // current code point
2406             uint cur;
                 // intervals not yet fully consumed; front is the one `cur` belongs to
2407             typeof(This.init.byInterval) r;
2408         }
2409 
2410         return CodepointRange(this);
2411     }
2412 
2413     ///
2414     pure @safe unittest
2415     {
2416         import std.algorithm.comparison : equal;
2417         import std.range : iota;
2418 
2419         auto set = unicode.ASCII;
             // the comparison result was previously discarded, so nothing was checked
2420         assert(set.byCodepoint.equal(iota(0, 0x80)));
2421     }
2422 
2423     /**
2424         $(P Obtain textual representation of this set in the form of
2425         open-right intervals and feed it to `sink`.
2426         )
2427         $(P Used by various standard formatting facilities such as
2428          $(REF formattedWrite, std,format), $(REF write, std,stdio),
2429          $(REF writef, std,stdio), $(REF to, std,conv) and others.
2430         )
2431         Example:
2432         ---
2433         import std.conv;
2434         assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2435         ---
2436     */
2437 
2438     private import std.format.spec : FormatSpec;
2439 
2440     /***************************************
2441      * Obtain a textual representation of this InversionList
2442      * in form of open-right intervals.
2443      *
2444      * The formatting flag is applied individually to each value, for example:
2445      * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2446      * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2447      * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2448      */
2449     void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
2450     {
2451         import std.format.write : formatValue;
2452 
             // every interval is written as "[a..b)", both bounds formatted with
             // `fmt`; a single space separates neighbouring intervals
2453         bool needSeparator = false;
2454         foreach (ival; byInterval)
2455         {
2456             if (needSeparator)
2457                 put(sink, " ");
2458             needSeparator = true;
2459 
2460             put(sink, "[");
2461             formatValue(sink, ival.a, fmt);
2462             put(sink, "..");
2463             formatValue(sink, ival.b, fmt);
2464             put(sink, ")");
2465         }
2466     }
2470 
2471     ///
2472     pure @safe unittest
2473     {
2474         import std.conv : to;
2475         import std.format : format;
2476         import std.uni : unicode;
2477 
2478         // This example originally used the Cyrillic script.
2479         // Unfortunately that is a pretty active range for changes,
2480         // and hence the example broke in an update.
2481         // Therefore the Basic Latin block is used instead, as it
2482         // is unlikely to ever change.
2483 
2484         assert(unicode.InBasic_latin.to!string == "[0..128)");
2485 
2486         // The specs '%s' and '%d' are equivalent to the to!string call above.
2487         assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string);
2488 
             // With the '#' flag the hex bound carries a 0x / 0X prefix
             // (the zero bound is printed bare).
2489         assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)");
2490         assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)");
2491     }
2492 
2493     pure @safe unittest
2494     {
2495         import std.exception : assertThrown;
2496         import std.format : format, FormatException;
             // formatting a set with the '%z' specifier must raise a FormatException
2497         assertThrown!FormatException(format("%z", unicode.ASCII));
2498     }
2499 
2500 
2501     /**
2502         Add an interval [a, b$(RPAREN) to this set.
2503     */
2504     ref add()(uint a, uint b)
2505     {
             // the marker returned by addInterval is not needed here;
             // `this` is returned instead so that calls can be chained
2506         addInterval(a, b);
2507         return this;
2508     }
2509 
2510     ///
2511     pure @safe unittest
2512     {
2513         CodepointSet someSet;
             // add returns the set itself, so the calls can be chained
2514         someSet.add('0', '5').add('A','Z'+1);
             // this interval starts right where ['0', '5') ended
2515         someSet.add('5', '9'+1);
2516         assert(someSet['0']);
2517         assert(someSet['5']);
2518         assert(someSet['9']);
2519         assert(someSet['Z']);
2520     }
2521 
2522 private:
2523 
2524   package(std)  // used from: std.regex.internal.parser
2525     ref intersect(U)(U rhs)
2526     if (isCodepointSet!U)
2527     {
             // for every interval of rhs: drop what this set holds before the
             // interval starts, then step over what it holds inside of it
2528         Marker pos;
2529         foreach (ival; rhs.byInterval)
2530             pos = skipUpTo(ival.b, dropUpTo(ival.a, pos));
             // whatever is left after the last interval of rhs is dropped too
2531         dropUpTo(uint.max, pos);
2532         return this;
2533     }
2537 
2538     ref intersect()(dchar ch)
2539     {
             // a member ch leaves the singleton {ch}, anything else the empty set
2540         foreach (ival; byInterval)
2541             if (ch >= ival.a && ch < ival.b)
2542                 return this = This.init.add(ch, ch+1);
2543         return this = This.init;
2544     }
2546 
2547     pure @safe unittest
2548     {
             // '-' is not part of the Cyrillic set, so the intersection has no intervals
2549         assert(unicode.Cyrillic.intersect('-').byInterval.empty);
2550     }
2551 
2552     ref sub()(dchar ch)
2553     {
             // single code point overload: the work is done by subChar
2554         return subChar(ch);
2555     }
2556 
2557     // same as intersect(U) except that the skip & drop parts are swapped
2558   package(std)  // used from: std.regex.internal.parser
2559     ref sub(U)(U rhs)
2560     if (isCodepointSet!U)
2561     {
             // for every interval of rhs: step over what this set holds before
             // the interval starts, then drop what it holds inside of it
2562         Marker pos;
2563         foreach (ival; rhs.byInterval)
2564             pos = dropUpTo(ival.b, skipUpTo(ival.a, pos));
2565         return this;
2566     }
2570 
2571   package(std)  // used from: std.regex.internal.parser
2572     ref add(U)(U rhs)
2573     if (isCodepointSet!U)
2574     {
             // the marker returned for one interval serves as the search hint
             // when the next interval is added
2575         Marker hint;
2576         foreach (ival; rhs.byInterval)
2577             hint = addInterval(ival.a, ival.b, hint);
2578         return this;
2579     }
2582 
2583 // end of mixin-able part
2584 //============================================================================
2585 public:
2586     /**
2587         Obtains a set that is the inversion of this set.
2588 
2589         See the '!' $(LREF opUnary) for the same but using operators.
2590     */
2591     @property auto inverted()
2592     {
2593         InversionList result = this;
2594         if (result.data.length == 0) // empty set: the inverse is everything
2595         {
2596             result.addInterval(0, lastDchar+1);
2597             return result;
2598         }
             // lower end: a leading 0 boundary is removed, a missing one is inserted
2599         if (result.data[0] == 0)
2600             genericReplace(result.data, 0, 1, cast(uint[]) null);
2601         else
2602             genericReplace(result.data, 0, 0, [0]);
             // upper end: the same toggle for the lastDchar+1 boundary
2603         immutable tail = result.data.length;
2604         if (data[data.length-1] == lastDchar+1)
2605             genericReplace(result.data, tail-1, tail, cast(uint[]) null);
2606         else
2607             genericReplace(result.data, tail, tail, [lastDchar+1]);
2608 
2609         return result;
2610     }
2612 
2613     ///
2614     pure @safe unittest
2615     {
2616         auto set = unicode.ASCII;
2617         // the union with the inverse yields every Unicode code point
2618         assert((set | set.inverted).length == 0x110000);
2619         // there is no intersection with the inverse
2620         assert((set & set.inverted).empty);
2621     }
2622 
2623     package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
2624     {
             // Builds the source text of a `bool f(dchar ch)` function that tests
             // membership in the given intervals. The nested helpers emit either
             // a linear chain of comparisons or a bisection over the intervals.
2625         import std.algorithm.searching : countUntil;
2626         import std.format : format;
             // interval lists shorter than this are emitted as a linear chain
2627         enum maxBinary = 3;
             // emits a braced block that checks the intervals one after another
2628         static string linearScope(R)(R ivals, string indent)
2629         {
2630             string result = indent~"{\n";
2631             string deeper = indent~"    ";
2632             foreach (ival; ivals)
2633             {
2634                 immutable span = ival[1] - ival[0];
2635                 assert(span != 0);
2636                 if (span == 1)
2637                 {
2638                     result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
2639                 }
2640                 else if (span == 2)
2641                 {
2642                     result ~= format("%sif (ch == %s || ch == %s) return true;\n",
2643                         deeper, ival[0], ival[0]+1);
2644                 }
2645                 else
2646                 {
                         // the early `return false` presumably relies on the
                         // intervals arriving in ascending order
2647                     if (ival[0] != 0) // dchar is unsigned, so a `< 0` test would be useless
2648                         result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
2649                     result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
2650                 }
2651             }
2652             result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
2653             return result;
2654         }
2655 
             // picks between the linear chain and a further bisection at the
             // middle interval, depending on how many intervals are left
2656         static string binaryScope(R)(R ivals, string indent) @safe
2657         {
2658             // time to do unrolled comparisons?
2659             if (ivals.length < maxBinary)
2660                 return linearScope(ivals, indent);
2661             else
2662                 return bisect(ivals, ivals.length/2, indent);
2663         }
2664 
2665         // not used yet: if/else binary search is far better with DMD as of 2.061
2666         // and GDC is doing a fine job either way
2667         static string switchScope(R)(R ivals, string indent)
2668         {
2669             string result = indent~"switch (ch){\n";
2670             string deeper = indent~"    ";
2671             foreach (ival; ivals)
2672             {
                     // single code point: plain case label, otherwise a case range
2673                 if (ival[0]+1 == ival[1])
2674                 {
2675                     result ~= format("%scase %s: return true;\n",
2676                         deeper, ival[0]);
2677                 }
2678                 else
2679                 {
2680                     result ~= format("%scase %s: .. case %s: return true;\n",
2681                          deeper, ival[0], ival[1]-1);
2682                 }
2683             }
2684             result ~= deeper~"default: return false;\n"~indent~"}\n";
2685             return result;
2686         }
2687 
             // emits an if / else if / else ladder around the interval at `idx`;
             // the intervals before and after it are handled by binaryScope
2688         static string bisect(R)(R range, size_t idx, string indent)
2689         {
2690             string deeper = indent ~ "    ";
2691             // bisect on one [a, b) interval at idx
2692             string result = indent~"{\n";
2693             // less branch, < a
2694             result ~= format("%sif (ch < %s)\n%s",
2695                 deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
2696             // middle point,  >= a && < b
2697             result ~= format("%selse if (ch < %s) return true;\n",
2698                 deeper, range[idx][1]);
2699             // greater or equal branch,  >= b
2700             result ~= format("%selse\n%s",
2701                 deeper, binaryScope(range[idx+1..$], deeper));
2702             return result~indent~"}\n";
2703         }
2704 
             // an empty funcName is replaced by the word "function" in the header
2705         string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
2706             funcName.empty ? "function" : funcName);
2707         // special case first bisection to be on ASCII vs beyond
             // (index of the first interval whose lower bound is above 0x80)
2708         auto tillAscii = countUntil!"a[0] > 0x80"(range);
2709         if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
2710             code ~= binaryScope(range, "");
2711         else
2712             code ~= bisect(range, tillAscii, "");
2713         return code;
2714     }
2715 
2716     /**
2717         Generates string with D source code of unary function with name of
2718         `funcName` taking a single `dchar` argument. If `funcName` is empty
2719         the code is adjusted to be a lambda function.
2720 
2721         The function generated tests if the $(CODEPOINT) passed
2722         belongs to this set or not. The result is to be used with string mixin.
2723         The intended usage area is aggressive optimization via meta programming
2724         in parser generators and the like.
2725 
2726         Note: Use with care for relatively small or regular sets. It
2727         could end up being slower than just using multi-staged tables.
2728 
2729         Example:
2730         ---
2731         import std.stdio;
2732 
2733         // construct set directly from [a, b$(RPAREN) intervals
2734         auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2735         writeln(set);
2736         writeln(set.toSourceCode("func"));
2737         ---
2738 
2739         The above outputs something along the lines of:
2740         ---
2741         bool func(dchar ch)  @safe pure nothrow @nogc
2742         {
2743             if (ch < 45)
2744             {
2745                 if (ch == 10 || ch == 11) return true;
2746                 return false;
2747             }
2748             else if (ch < 65) return true;
2749             else
2750             {
2751                 if (ch < 100) return false;
2752                 if (ch < 200) return true;
2753                 return false;
2754             }
2755         }
2756         ---
2757     */
2758     string toSourceCode(string funcName="")
2759     {
2760         import std.array : array;
             // materialize the intervals and hand them to the static overload
2761         return toSourceCode(byInterval.array(), funcName);
2762     }
2764 
2765     /**
2766         True if this set doesn't contain any $(CODEPOINTS).
2767     */
2768     @property bool empty() const
2769     {
2770         return !data.length; // no stored boundaries, hence no intervals
2771     }
2772 
2773     ///
2774     pure @safe unittest
2775     {
             // a default-constructed set holds no code points
2776         CodepointSet emptySet;
2777         assert(emptySet.length == 0);
2778         assert(emptySet.empty);
2779     }
2780 
2781 private:
2782     alias This = typeof(this);
2783     alias Marker = size_t;
2784 
2785     // a random-access range of integral pairs
2786     static struct Intervals(Range)
2787     {
2788         import std.range.primitives : hasAssignableElements;
2789 
             // Presents `sp` as a sequence of CodepointInterval values: two
             // consecutive elements form the bounds of one interval.
             // `start` and `end` are element offsets into `slice` that delimit
             // the part of it this range currently covers.
2790         this(Range sp) scope
2791         {
2792             slice = sp;
2793             start = 0;
2794             end = sp.length;
2795         }
2796 
             // same, restricted to the elements s .. e of sp
2797         this(Range sp, size_t s, size_t e) scope
2798         {
2799             slice = sp;
2800             start = s;
2801             end = e;
2802         }
2803 
2804         @property auto front()const
2805         {
2806             immutable a = slice[start];
2807             immutable b = slice[start+1];
2808             return CodepointInterval(a, b);
2809         }
2810 
2811         //may break sorted property - but we need std.sort to access it
2812         //hence package(std) protection attribute
2813         static if (hasAssignableElements!Range)
2814         package(std) @property void front(CodepointInterval val)
2815         {
2816             slice[start] = val.a;
2817             slice[start+1] = val.b;
2818         }
2819 
2820         @property auto back()const
2821         {
2822             immutable a = slice[end-2];
2823             immutable b = slice[end-1];
2824             return CodepointInterval(a, b);
2825         }
2826 
2827         //ditto about package
2828         static if (hasAssignableElements!Range)
2829         package(std) @property void back(CodepointInterval val)
2830         {
2831             slice[end-2] = val.a;
2832             slice[end-1] = val.b;
2833         }
2834 
2835         void popFront()
2836         {
2837             start += 2;
2838         }
2839 
2840         void popBack()
2841         {
2842             end -= 2;
2843         }
2844 
             // idx counts intervals relative to the current front
2845         auto opIndex(size_t idx) const
2846         {
2847             immutable a = slice[start+idx*2];
2848             immutable b = slice[start+idx*2+1];
2849             return CodepointInterval(a, b);
2850         }
2851 
2852         //ditto about package
2853         static if (hasAssignableElements!Range)
2854         package(std) void opIndexAssign(CodepointInterval val, size_t idx)
2855         {
2856             slice[start+idx*2] = val.a;
2857             slice[start+idx*2+1] = val.b;
2858         }
2859 
             // s and e count intervals relative to the current front;
             // the result shares `slice` with this range
2860         auto opSlice(size_t s, size_t e)
2861         {
2862             return Intervals(slice, s*2+start, e*2+start);
2863         }
2864 
             // Number of intervals left in the range. It has to be derived from
             // start/end rather than from slice.length: popFront, popBack and
             // opSlice narrow the window without touching `slice`, and `empty`
             // is defined in terms of start/end as well.
2865         @property size_t length()const {  return (end - start)/2; }
2866 
2867         @property bool empty()const { return start == end; }
2868 
2869         @property auto save(){ return this; }
2870     private:
2871         size_t start, end;
2872         Range slice;
2873     }
2874 
2875     // called after construction from intervals
2876     // to make sure invariants hold
2877     void sanitize()
2878     {
             // Sorts the stored intervals by their lower bound, merges the ones
             // that overlap or touch, and truncates `data` to the merged result.
2879         import std.algorithm.comparison : max;
2880         import std.algorithm.mutation : SwapStrategy;
2881         import std.algorithm.sorting : sort;
2882         if (data.length == 0)
2883             return;
2884         alias Ival = CodepointInterval;
2885         //intervals wrapper for a _range_ over packed array
2886         auto ivals = Intervals!(typeof(data[]))(data[]);
2887         //@@@BUG@@@ can't use "a.a < b.a" see
2888         // https://issues.dlang.org/show_bug.cgi?id=12265
2889         sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
2890         // what follows is a variation on stable remove
2891         // differences:
2892         // - predicate is binary, and is tested against
2893         //   the last kept element (at 'i').
2894         // - predicate mutates lhs (merges rhs into lhs)
2895         size_t len = ivals.length;
             // i: last interval kept so far, j: next candidate to merge or keep
2896         size_t i = 0;
2897         size_t j = 1;
2898         while (j < len)
2899         {
                 // candidate starts at or before the end of the kept interval:
                 // extend the kept interval instead of storing the candidate
2900             if (ivals[i].b >= ivals[j].a)
2901             {
2902                 ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
2903                 j++;
2904             }
2905             else //unmergable
2906             {
2907                 // check if there is a hole after merges
2908                 // (in the best case we do 0 writes to ivals)
2909                 if (j != i+1)
2910                     ivals[i+1] = ivals[j]; //copy over
2911                 i++;
2912                 j++;
2913             }
2914         }
             // number of intervals that survived the merge
2915         len = i + 1;
             // sanity check: kept intervals are non-empty and strictly separated
             // (the last interval itself is not covered by this loop)
2916         for (size_t k=0; k + 1 < len; k++)
2917         {
2918             assert(ivals[k].a < ivals[k].b);
2919             assert(ivals[k].b < ivals[k+1].a);
2920         }
             // two stored values per interval
2921         data.length = len * 2;
2922     }
2923 
2924     // special case for normal InversionList
2925     ref subChar(dchar ch)
2926     {
             // Removes the single code point ch from the set, if it is present.
             //
             // skipUpTo returns the start of an interval. When ch is a member,
             // that interval begins exactly at ch: either ch already was the
             // first code point of its interval, or skipUpTo has just split the
             // interval at ch. In every other case the marker is data.length or
             // points at a boundary different from ch.
2927         immutable mark = skipUpTo(ch);
2928         if (mark == data.length || data[mark] != ch)
2929             return this;
             // ch is the first code point of [data[mark], data[mark+1])
2930         if (data[mark+1] == ch+1)
                 // it was also the only one: drop the interval instead of
                 // leaving a zero-width entry behind
2931             genericReplace(data, mark, mark+2, cast(uint[]) null);
2932         else
                 // otherwise the interval simply starts one code point later
2933             data[mark] = ch+1;
2934         return this;
2935     }
2936 
2937     //
2938     Marker addInterval(int a, int b, Marker hint=Marker.init) scope
2939     in
2940     {
2941         assert(a <= b);
2942     }
2943     do
2944     {
             // Merges the interval [a, b) into the set. `hint` is an index into
             // `data` from which the search for `a` starts; the returned marker
             // is what add(U) passes as the hint for the following interval.
             //
             // `data` holds the interval boundaries in ascending order. An odd
             // insertion index therefore lies inside an existing interval
             // ("positive"), an even one lies in a gap ("negative").
2945         import std.range : assumeSorted, SearchPolicy;
2946         auto range = assumeSorted(data[]);
2947         size_t pos;
             // index of the first boundary at or after hint that is not below a
2948         size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
2949         if (a_idx == range.length)
2950         {
2951             //  [---+++----++++----++++++]
2952             //  [                         a  b]
2953             data.append(a, b);
2954             return data.length-1;
2955         }
             // same for b, searching from a_idx on
2956         size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
             // replacement boundaries for data[a_idx .. ]; only the first
             // `to_insert` entries are ever read
2957         uint[3] buf = void;
2958         uint to_insert;
2959         debug(std_uni)
2960         {
2961             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2962         }
2963         if (b_idx == range.length)
2964         {
2965             //  [-------++++++++----++++++-]
2966             //  [      s     a                 b]
2967             if (a_idx & 1)// a in positive
2968             {
2969                 buf[0] = b;
2970                 to_insert = 1;
2971             }
2972             else// a in negative
2973             {
2974                 buf[0] = a;
2975                 buf[1] = b;
2976                 to_insert = 2;
2977             }
2978             pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
2979             return pos - 1;
2980         }
2981 
             // first existing boundary that is not below b
2982         uint top = data[b_idx];
2983 
2984         debug(std_uni)
2985         {
2986             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2987             writefln("a=%s; b=%s; top=%s;", a, b, top);
2988         }
2989         if (a_idx & 1)
2990         {// a in positive
2991             if (b_idx & 1)// b in positive
2992             {
2993                 //  [-------++++++++----++++++-]
2994                 //  [       s    a        b    ]
2995                 buf[0] = top;
2996                 to_insert = 1;
2997             }
2998             else // b in negative
2999             {
3000                 //  [-------++++++++----++++++-]
3001                 //  [       s    a   b         ]
                     // b touches the start of the next interval: merge with it
3002                 if (top == b)
3003                 {
3004                     assert(b_idx+1 < data.length);
3005                     buf[0] = data[b_idx+1];
3006                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
3007                     return pos - 1;
3008                 }
3009                 buf[0] = b;
3010                 buf[1] = top;
3011                 to_insert = 2;
3012             }
3013         }
3014         else
3015         { // a in negative
3016             if (b_idx & 1) // b in positive
3017             {
3018                 //  [----------+++++----++++++-]
3019                 //  [     a     b              ]
3020                 buf[0] = a;
3021                 buf[1] = top;
3022                 to_insert = 2;
3023             }
3024             else// b in negative
3025             {
3026                 //  [----------+++++----++++++-]
3027                 //  [  a       s      b        ]
                     // b touches the start of the next interval: merge with it
3028                 if (top == b)
3029                 {
3030                     assert(b_idx+1 < data.length);
3031                     buf[0] = a;
3032                     buf[1] = data[b_idx+1];
3033                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
3034                     return pos - 1;
3035                 }
3036                 buf[0] = a;
3037                 buf[1] = b;
3038                 buf[2] = top;
3039                 to_insert = 3;
3040             }
3041         }
3042         pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
3043         debug(std_uni)
3044         {
                 // one specifier per argument: the old format string had only two
                 // and so labelled data[pos] as the length
3045             writefln("marker idx: %d; value=%d; length=%d", pos, data[pos], data.length);
3046             writeln("inserting ", buf[0 .. to_insert]);
3047         }
3048         return pos - 1;
3049     }
3050 
3051     //
3052     Marker dropUpTo(uint a, Marker pos=Marker.init)
3053     in
3054     {
3055         assert(pos % 2 == 0); // at start of interval
3056     }
3057     do
3058     {
             // Removes from the set everything between the marker `pos` and the
             // code point `a`; the returned marker is where processing continues.
3059         auto tail = assumeSorted!"a <= b"(data[pos .. data.length]);
3060         if (tail.empty)
3061             return pos;
3062         size_t idx = pos + tail.lowerBound(a).length;
3063 
3064         debug(std_uni)
3065         {
3066             writeln("dropUpTo full length=", data.length);
3067             writeln(pos,"~~~", idx);
3068         }
             // nothing is stored beyond a: everything from pos on goes away
3069         if (idx == data.length)
3070             return genericReplace(data, pos, idx, cast(uint[])[]);
             // odd idx: a lies inside an interval, which now has to start at a;
             // even idx: a lies in a gap, the boundaries before it are just removed
3071         if (idx & 1)
3072             genericReplace(data, pos, idx, [a]);
3073         else
3074             genericReplace(data, pos, idx, cast(uint[])[]);
3075         return pos;
3076     }
3086 
3087     //
3088     Marker skipUpTo(uint a, Marker pos=Marker.init)
3089     out(result)
3090     {
3091         assert(result % 2 == 0);// always start of interval
3092         //(may be  0-width after-split)
3093     }
3094     do
3095     {
             // Advances from the marker `pos` to the code point `a` without
             // removing anything. If `a` falls strictly inside an interval, that
             // interval is split in two at `a`. The returned marker is always an
             // even index (see the out contract) or data.length.
3096         assert(data.length % 2 == 0);
3097         auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
3098         size_t idx = pos+range.lowerBound(a).length;
3099 
3100         if (idx >= data.length) // could have Marker point to recently removed stuff
3101             return data.length;
3102 
3103         if (idx & 1)// inside of interval, check for split
3104         {
3105 
                 // top is the end, start the beginning of the interval around idx
3106             immutable top = data[idx];
                 // NOTE(review): with the "a <= b" predicate, lowerBound presumably
                 // leaves only boundaries greater than a at idx, which would make
                 // this branch unreachable - confirm
3107             if (top == a)// no need to split, it's end
3108                 return idx+1;
3109             immutable start = data[idx-1];
                 // a is the first code point of the interval: no split needed either
3110             if (a == start)
3111                 return idx-1;
3112             // split it up
                 // [start, top) becomes [start, a) followed by [a, top)
3113             genericReplace(data, idx, idx+1, [a, a, top]);
3114             return idx+1;        // avoid odd index
3115         }
3116         return idx;
3117     }
3118 
3119     CowArray!SP data;
3120 }
3121 
3122 pure @safe unittest
3123 {
3124     import std.conv : to;
         // the ASCII set is the single interval [0, 128)
3125     assert(to!string(unicode.ASCII) == "[0..128)");
3126 }
3127 
3128 // pedantic version for ctfe, and aligned-access only architectures
3129 @system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
3130 {
         // element idx occupies the three bytes starting at idx*3;
         // they are read one by one and combined according to the byte order
3131     immutable base = idx * 3;
3132     immutable uint first = ptr[base], second = ptr[base+1], third = ptr[base+2];
3133     version (LittleEndian)
3134         return first | (second << 8) | (third << 16);
3135     else
3136         return (first << 16) | (second << 8) | third;
3137 }
3139 
3140 // ditto
3141 @system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
3142 {
         // element idx occupies the three bytes starting at idx*3; only the
         // low 24 bits of val are stored, in the platform's byte order
3143     immutable base = idx * 3;
3144     immutable ubyte low = val & 0xFF, middle = (val >> 8) & 0xFF, high = (val >> 16) & 0xFF;
3145     version (LittleEndian)
3146     {
3147         ptr[base] = low;
3148         ptr[base+1] = middle;
3149         ptr[base+2] = high;
3150     }
3151     else
3152     {
3153         ptr[base] = high;
3154         ptr[base+1] = middle;
3155         ptr[base+2] = low;
3156     }
3157 }
3157 
3158 // unaligned x86-like read/write functions
3159 @system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
3160 {
         // loads a whole 32-bit word at byte offset 3*idx (i.e. one byte more
         // than the element itself) and strips the byte that does not belong
3161     immutable uint word = *cast(const(uint)*)(ptr + 3*idx);
3162     version (LittleEndian)
3163         return word & 0xFF_FFFF;
3164     else
3165         return word >> 8;
3166 }
3167 
3168 // ditto
3169 @system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
3170 {
         // Stores the low 24 bits of val at byte offset 3*idx with a single
         // 32-bit read-modify-write; the fourth byte of the word (not part of
         // this element) is written back unchanged.
3171     uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
3172     version (LittleEndian)
             // val is masked so that bits above the 24th cannot leak into the
             // preserved byte; the shift in the other branch and safeWrite24
             // discard those bits as well
3173         *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
3174     else
3175         *dest = (val << 8) | (*dest & 0xFF);
3176 }
3177 
3178 @system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
3179 {
         // the unaligned variant is used only at run time and only where
         // hasUnalignedReads is set; everything else takes the byte-wise path
3180     static if (hasUnalignedReads)
3181         if (!__ctfe)
3182             return unalignedRead24(ptr, idx);
3183     return safeRead24(ptr, idx);
3184 }
3185 
3186 @system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
3187 {
         // the unaligned variant is used only at run time and only where
         // hasUnalignedReads is set; everything else takes the byte-wise path
3188     static if (hasUnalignedReads)
3189         if (!__ctfe)
3190             return unalignedWrite24(ptr, val, idx);
3191     safeWrite24(ptr, val, idx);
3192 }
3193 
3194 struct CowArray(SP=GcPolicy)
3195 {
         // Array of uint with copy-on-write sharing. The reference count is
         // kept in one extra slot at the very end of `data`; `length` and the
         // slicing operators hide that slot. Copies share the storage until one
         // of them is written to or resized. Memory is obtained and released
         // through the policy SP.
3196     import std.range.primitives : hasLength;
3197 
3198   @safe:
         // wraps `arr`; the slot for the reference count is appended via SP.append
3199     static auto reuse(uint[] arr)
3200     {
3201         CowArray cow;
3202         cow.data = arr;
3203         SP.append(cow.data, 1);
3204         assert(cow.refCount == 1);
3205         assert(cow.length == arr.length);
3206         return cow;
3207     }
3208 
3209     this(Range)(Range range)
3210     if (isInputRange!Range && hasLength!Range)
3211     {
3212         import std.algorithm.mutation : copy;
             // the length setter allocates the storage including the count slot
3213         length = range.length;
3214         copy(range, data[0..$-1]);
3215     }
3216 
3217     this(Range)(Range range)
3218     if (isForwardRange!Range && !hasLength!Range)
3219     {
3220         import std.algorithm.mutation : copy;
3221         import std.range.primitives : walkLength;
             // no length available: count the elements on a saved copy first
3222         immutable len = walkLength(range.save);
3223         length = len;
3224         copy(range, data[0..$-1]);
3225     }
3226 
         // a copy shares the storage and registers itself in the count
3227     this(this)
3228     {
3229         if (!empty)
3230         {
3231             refCount = refCount + 1;
3232         }
3233     }
3234 
         // the last reference releases the storage, any other one only deregisters
3235     ~this()
3236     {
3237         if (!empty)
3238         {
3239             immutable cnt = refCount;
3240             if (cnt == 1)
3241                 SP.destroy(data);
3242             else
3243                 refCount = cnt - 1;
3244         }
3245     }
3246 
3247     // no ref-count for empty U24 array
3248     @property bool empty() const { return data.length == 0; }
3249 
3250     // report one less than the actual size
3251     @property size_t length() const
3252     {
3253         return data.length ? data.length - 1 : 0;
3254     }
3255 
3256     //+ an extra slot for ref-count
3257     @property void length(size_t len)
3258     {
3259         import std.algorithm.comparison : min;
3260         import std.algorithm.mutation : copy;
             // shrinking to nothing just gives up this reference
3261         if (len == 0)
3262         {
3263             if (!empty)
3264                 freeThisReference();
3265             return;
3266         }
3267         immutable total = len + 1; // including ref-count
             // nothing allocated yet: fresh storage with a count of one
3268         if (empty)
3269         {
3270             data = SP.alloc!uint(total);
3271             refCount = 1;
3272             return;
3273         }
3274         immutable cur_cnt = refCount;
3275         if (cur_cnt != 1) // have more references to this memory
3276         {
                 // leave the shared storage to the other owners and continue
                 // with a private copy of the requested size
3277             refCount = cur_cnt - 1;
3278             auto new_data = SP.alloc!uint(total);
3279             // take shrinking into account
3280             auto to_copy = min(total, data.length) - 1;
3281             copy(data[0 .. to_copy], new_data[0 .. to_copy]);
3282             data = new_data; // before setting refCount!
3283             refCount = 1;
3284         }
3285         else // 'this' is the only reference
3286         {
3287             // use the realloc (hopefully in-place operation)
3288             data = SP.realloc(data, total);
3289             refCount = 1; // setup a ref-count in the new end of the array
3290         }
3291     }
3292 
3293     alias opDollar = length;
3294 
3295     uint opIndex()(size_t idx)const
3296     {
3297         return data[idx];
3298     }
3299 
         // a write to shared storage first detaches this instance
3300     void opIndexAssign(uint val, size_t idx)
3301     {
3302         auto cnt = refCount;
3303         if (cnt != 1)
3304             dupThisReference(cnt);
3305         data[idx] = val;
3306     }
3307 
3308     //
         // the mutable slice gives write access to the elements, so shared
         // storage is detached before it is handed out
3309     auto opSlice(size_t from, size_t to)
3310     {
3311         if (!empty)
3312         {
3313             auto cnt = refCount;
3314             if (cnt != 1)
3315                 dupThisReference(cnt);
3316         }
3317         return data[from .. to];
3318 
3319     }
3320 
3321     //
3322     auto opSlice(size_t from, size_t to) const
3323     {
3324         return data[from .. to];
3325     }
3326 
3327     // length slices before the ref count
3328     auto opSlice()
3329     {
3330         return opSlice(0, length);
3331     }
3332 
3333     // ditto
3334     auto opSlice() const
3335     {
3336         return opSlice(0, length);
3337     }
3338 
         // grows the array by range.length and copies the range into the new tail
3339     void append(Range)(Range range)
3340     if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
3341     {
3342         size_t nl = length + range.length;
3343         length = nl;
3344         copy(range, this[nl-range.length .. nl]);
3345     }
3346 
         // same for values passed directly; the last slot of `data` is the
         // reference count, hence the `-1` in both slice bounds
3347     void append()(uint[] val...)
3348     {
3349         length = length + val.length;
3350         data[$-val.length-1 .. $-1] = val[];
3351     }
3352 
         // compares the elements only, never the reference counts
3353     bool opEquals()(auto ref const CowArray rhs) const
3354     {
3355         if (empty ^ rhs.empty)
3356             return false; // one is empty and the other isn't
3357         return empty || data[0..$-1] == rhs.data[0..$-1];
3358     }
3359 
3360 private:
3361     // ref-count is right after the data
3362     @property uint refCount() const
3363     {
3364         return data[$-1];
3365     }
3366 
3367     @property void refCount(uint cnt)
3368     {
3369         data[$-1] = cnt;
3370     }
3371 
         // gives up this instance's share of the storage and leaves it empty
3372     void freeThisReference()
3373     {
3374         immutable count = refCount;
3375         if (count != 1) // have more references to this memory
3376         {
3377             // dec shared ref-count
3378             refCount = count - 1;
3379             data = [];
3380         }
3381         else
3382             SP.destroy(data);
3383         assert(!data.ptr);
3384     }
3385 
         // detaches this instance from shared storage by moving to a private copy
3386     void dupThisReference(uint count)
3387     in
3388     {
3389         assert(!empty && count != 1 && count == refCount);
3390     }
3391     do
3392     {
3393         import std.algorithm.mutation : copy;
3394         // dec shared ref-count
3395         refCount = count - 1;
3396         // copy to the new chunk of RAM
3397         auto new_data = SP.alloc!uint(data.length);
3398         // bit-blit old stuff except the counter
3399         copy(data[0..$-1], new_data[0..$-1]);
3400         data = new_data; // before setting refCount!
3401         refCount = 1; // so that this updates the right one
3402     }
3403 
         // the elements, followed by one slot that holds the reference count
3404     uint[] data;
3405 }
3406 
3407 pure @safe unittest// CowArray tests (the `u24`/`U24A` names below denote CowArray instances)
3408 {
3409     import std.algorithm.comparison : equal;
3410     import std.algorithm.mutation : copy;
3411     import std.conv : text;
3412     import std.range : iota, chain;
3413     import std.range.primitives : isBidirectionalRange, isOutputRange;
3414     void funcRef(T)(ref T u24)
3415     {
3416         u24.length = 2;
3417         u24[1] = 1024;
3418         T u24_c = u24;
3419         assert(u24[1] == 1024);
3420         u24.length = 0;
3421         assert(u24.empty);
3422         u24.append([1, 2]);
3423         assert(equal(u24[], [1, 2]));
3424         u24.append(111);
3425         assert(equal(u24[], [1, 2, 111]));
3426         assert(!u24_c.empty && u24_c[1] == 1024);
3427         u24.length = 3;
3428         copy(iota(0, 3), u24[]);
3429         assert(equal(u24[], iota(0, 3)));
3430         assert(u24_c[1] == 1024);
3431     }
3432 
3433     void func2(T)(T u24)
3434     {
3435         T u24_2 = u24;
3436         T u24_3;
3437         u24_3 = u24_2;
3438         assert(u24_2 == u24_3);
3439         assert(equal(u24[], u24_2[]));
3440         assert(equal(u24_2[], u24_3[]));
3441         funcRef(u24_3);
3442 
3443         assert(equal(u24_3[], iota(0, 3)));
3444         assert(!equal(u24_2[], u24_3[]));
3445         assert(equal(u24_2[], u24[]));
3446         u24_2 = u24_3;
3447         assert(equal(u24_2[], iota(0, 3)));
3448         // to test that passed arg is intact outside
3449         // plus try out opEquals
3450         u24 = u24_3;
3451         u24 = T.init;
3452         u24_3 = T.init;
3453         assert(u24.empty);
3454         assert(u24 == u24_3);
3455         assert(u24 != u24_2);
3456     }
3457 
3458     static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
3459     {{
3460         alias Range = typeof(CowArray!Policy.init[]);
3461         alias U24A = CowArray!Policy;
3462         static assert(isForwardRange!Range);
3463         static assert(isBidirectionalRange!Range);
3464         static assert(isOutputRange!(Range, uint));
3465         static assert(isRandomAccessRange!(Range));
3466 
3467         auto arr = U24A([42u, 36, 100]);
3468         assert(arr[0] == 42);
3469         assert(arr[1] == 36);
3470         arr[0] = 72;
3471         arr[1] = 0xFE_FEFE;
3472         assert(arr[0] == 72);
3473         assert(arr[1] == 0xFE_FEFE);
3474         assert(arr[2] == 100);
3475         U24A arr2 = arr;
3476         assert(arr2[0] == 72);
3477         arr2[0] = 11;
3478         // test COW-ness
3479         assert(arr[0] == 72);
3480         assert(arr2[0] == 11);
3481         // set this to about 100M to stress-test COW memory management
3482         foreach (v; 0 .. 10_000)
3483             func2(arr);
3484         assert(equal(arr[], [72, 0xFE_FEFE, 100]));
3485 
3486         auto r2 = U24A(iota(0, 100));
3487         assert(equal(r2[], iota(0, 100)), text(r2[]));
3488         copy(iota(10, 170, 2), r2[10 .. 90]);
3489         assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
3490                , text(r2[]));
3491     }}
3492 }
3493 
3494 pure @safe unittest// core set primitives test
3495 {
3496     import std.conv : text;
3497     alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
3498     foreach (CodeList; AllSets)
3499     {
3500         CodeList a;
3501         //"plug a hole" test
3502         a.add(10, 20).add(25, 30).add(15, 27);
3503         assert(a == CodeList(10, 30), text(a));
3504 
3505         auto x = CodeList.init;
3506         x.add(10, 20).add(30, 40).add(50, 60);
3507 
3508         a = x;
3509         a.add(20, 49);//[10, 49) [50, 60)
3510         assert(a == CodeList(10, 49, 50 ,60));
3511 
3512         a = x;
3513         a.add(20, 50);
3514         assert(a == CodeList(10, 60), text(a));
3515 
3516         // simple unions, mostly edge effects
3517         x = CodeList.init;
3518         x.add(10, 20).add(40, 60);
3519 
3520         a = x;
3521         a.add(10, 25); //[10, 25) [40, 60)
3522         assert(a == CodeList(10, 25, 40, 60));
3523 
3524         a = x;
3525         a.add(5, 15); //[5, 20) [40, 60)
3526         assert(a == CodeList(5, 20, 40, 60));
3527 
3528         a = x;
3529         a.add(0, 10); // [0, 20) [40, 60)
3530         assert(a == CodeList(0, 20, 40, 60));
3531 
3532         a = x;
3533         a.add(0, 5); // prepend
3534         assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));
3535 
3536         a = x;
3537         a.add(5, 20);
3538         assert(a == CodeList(5, 20, 40, 60));
3539 
3540         a = x;
3541         a.add(3, 37);
3542         assert(a == CodeList(3, 37, 40, 60));
3543 
3544         a = x;
3545         a.add(37, 65);
3546         assert(a == CodeList(10, 20, 37, 65));
3547 
3548         // some tests on helpers for set intersection
3549         x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
3550         a = x;
3551 
3552         auto m = a.skipUpTo(60);
3553         a.dropUpTo(110, m);
3554         assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));
3555 
3556         a = x;
3557         a.dropUpTo(100);
3558         assert(a == CodeList(100, 120), text(a.data[]));
3559 
3560         a = x;
3561         m = a.skipUpTo(50);
3562         a.dropUpTo(140, m);
3563         assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
3564         a = x;
3565         a.dropUpTo(60);
3566         assert(a == CodeList(100, 120), text(a.data[]));
3567     }
3568 }
3569 
3570 
3571 // test that the constructor works with any order of intervals
3572 pure @safe unittest
3573 {
3574     import std.algorithm.comparison : equal;
3575     import std.conv : text, to;
3576     import std.range : chain, iota;
3577     import std.typecons : tuple;
3578     //ensure constructor handles bad ordering and overlap
3579     auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
3580     foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
3581         assert(ch in c1, to!string(ch));
3582 
3583     //contiguous
3584     assert(CodepointSet(1000, 1006, 1006, 1009)
3585         .byInterval.equal([tuple(1000, 1009)]));
3586     //contains
3587     assert(CodepointSet(900, 1200, 1000, 1100)
3588         .byInterval.equal([tuple(900, 1200)]));
3589     //intersect left
3590     assert(CodepointSet(900, 1100, 1000, 1200)
3591         .byInterval.equal([tuple(900, 1200)]));
3592     //intersect right
3593     assert(CodepointSet(1000, 1200, 900, 1100)
3594         .byInterval.equal([tuple(900, 1200)]));
3595 
3596     //ditto with extra items at end
3597     assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
3598         .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
3599     assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
3600         .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
3601 
3602     //"plug a hole" test
3603     auto c2 = CodepointSet(20, 40,
3604         60, 80, 100, 140, 150, 200,
3605         40, 60, 80, 100, 140, 150
3606     );
3607     assert(c2.byInterval.equal([tuple(20, 200)]));
3608 
3609     auto c3 = CodepointSet(
3610         20, 40, 60, 80, 100, 140, 150, 200,
3611         0, 10, 15, 100, 10, 20, 200, 220);
3612     assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
3613 }
3614 
3615 
3616 pure @safe unittest
3617 {   // full set operations
3618     import std.conv : text;
3619     alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
3620     foreach (CodeList; AllSets)
3621     {
3622         CodeList a, b, c, d;
3623 
3624         //"plug a hole"
3625         a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3626         b.add(40, 60).add(80, 100).add(140, 150);
3627         c = a | b;
3628         d = b | a;
3629         assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
3630         assert(c == d, text(c," vs ", d));
3631 
3632         b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
3633         c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
3634         d = b | a;
3635         assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
3636         assert(c == d, text(c," vs ", d));
3637 
3638         b = CodeList.init.add(10, 20).add(30,100).add(145,200);
3639         c = a | b;//[10, 140) [145, 200)
3640         d = b | a;
3641         assert(c == CodeList(10, 140, 145, 200));
3642         assert(c == d, text(c," vs ", d));
3643 
3644         b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
3645         c = a | b;//[0, 140) [150, 220)
3646         d = b | a;
3647         assert(c == CodeList(0, 140, 150, 220));
3648         assert(c == d, text(c," vs ", d));
3649 
3650 
3651         a = CodeList.init.add(20, 40).add(60, 80);
3652         b = CodeList.init.add(25, 35).add(65, 75);
3653         c = a & b;
3654         d = b & a;
3655         assert(c == CodeList(25, 35, 65, 75), text(c));
3656         assert(c == d, text(c," vs ", d));
3657 
3658         a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3659         b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
3660         c = a & b;
3661         d = b & a;
3662         assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
3663         assert(c == d, text(c," vs ", d));
3664 
3665         a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3666         b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
3667         c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
3668         d = b & a;
3669 
3670         assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
3671         assert(c == d, text(c, " vs ",d));
3672         assert((c & a) == c);
3673         assert((d & b) == d);
3674         assert((c & d) == d);
3675 
3676         b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
3677         c = a & b;
3678         d = b & a;
3679         assert(c == CodeList(150, 200), text(c));
3680         assert(c == d, text(c, " vs ",d));
3681         assert((c & a) == c);
3682         assert((d & b) == d);
3683         assert((c & d) == d);
3684 
3685         assert((a & a) == a);
3686         assert((b & b) == b);
3687 
3688         a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
3689         b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
3690         c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
3691         d = b - a;// [40, 60) [80, 100) [200, 300)
3692         assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
3693         assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
3694         assert(c - d == c, text(c-d, " vs ", c));
3695         assert(d - c == d, text(d-c, " vs ", d));
3696         assert(c - c == CodeList.init);
3697         assert(d - d == CodeList.init);
3698 
3699         a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150,            200);
3700         b = CodeList.init.add(10,  50).add(60,                           160).add(190, 300);
3701         c = a - b;// [160, 190)
3702         d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
3703         assert(c == CodeList(160, 190), text(c));
3704         assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
3705         assert(c - d == c, text(c-d, " vs ", c));
3706         assert(d - c == d, text(d-c, " vs ", d));
3707         assert(c - c == CodeList.init);
3708         assert(d - d == CodeList.init);
3709 
3710         a = CodeList.init.add(20,    40).add(60, 80).add(100,      140).add(150,  200);
3711         b = CodeList.init.add(10, 30).add(45,         100).add(130,             190);
3712         c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
3713         d = b ~ a;
3714         assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
3715                text(c));
3716         assert(c == d, text(c, " vs ", d));
3717     }
3718 }
3719 
3720 }
3721 
3722 pure @safe unittest// set operations against a single dchar
3723 {
3724     import std.conv : text;
3725     CodepointSet a = CodepointSet(10, 100, 120, 200);
3726     assert(a - 'A' == CodepointSet(10, 65, 66, 100, 120, 200), text(a - 'A')); // 'A' is code point 65
3727     assert((a & 'B') == CodepointSet(66, 67)); // 'B' is code point 66
3728 }
3729 
3730 pure @safe unittest// iteration & opIndex
3731 {
3732     import std.algorithm.comparison : equal;
3733     import std.conv : text;
3734     import std.typecons : tuple, Tuple;
3735 
3736     static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
3737     {{
3738         auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
3739         auto a = CodeList('A','N','a', 'n');
3740         assert(equal(a.byInterval,
3741                 [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
3742             ), text(a.byInterval));
3743 
3744         // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
3745         version (bug8949)
3746         {
3747             import std.range : retro;
3748             assert(equal(retro(a.byInterval),
3749                 [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
3750             ), text(retro(a.byInterval)));
3751         }
3752         auto achr = a.byCodepoint;
3753         assert(equal(achr, arr), text(a.byCodepoint));
3754         foreach (ch; a.byCodepoint)
3755             assert(a[ch]); // every code point yielded by iteration must test as a member
3756         auto x = CodeList(100, 500, 600, 900, 1200, 1500);
3757         assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
3758         foreach (ch; x.byCodepoint)
3759             assert(x[ch]);
3760         static if (is(CodeList == CodepointSet))
3761         {
3762             auto y = CodeList(x.byInterval);
3763             assert(equal(x.byInterval, y.byInterval));
3764         }
3765         assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
3766         assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
3767     }}
3768 }
3769 
3770 //============================================================================
3771 // Generic Trie template and various ways to build it
3772 //============================================================================
3773 
3774 // Debug helper: text form of `x`, abbreviated to its first and last 16 items when longer than 32.
3775 auto arrayRepr(T)(T x)
3776 {
3777     import std.conv : text;
3778     if (x.length <= 32)
3779         return text(x); // short enough to print in full
3780     else
3781     {
3782         return text(x[0 .. 16],"~...~", x[x.length-16 .. x.length]);
3783     }
3784 }
3785 
3786 /**
3787     Maps `Key` to a suitable integer index within the range of `size_t`.
3788     The mapping is constructed by applying predicates from `Prefix` left to right
3789     and concatenating the resulting bits.
3790 
3791     The first (leftmost) predicate defines the most significant bits of
3792     the resulting index.
3793  */
3794 template mapTrieIndex(Prefix...)
3795 {
3796     size_t mapTrieIndex(Key)(Key key)
3797     if (isValidPrefixForTrie!(Key, Prefix))
3798     {
3799         alias p = Prefix;
3800         size_t idx;
3801         foreach (i, v; p[0..$-1]) // every predicate except the last one
3802         {
3803             idx |= p[i](key); // merge in the bits produced by predicate i
3804             idx <<= p[i+1].bitSize; // make room for the bits of the next predicate
3805         }
3806         idx |= p[$-1](key); // the last predicate supplies the least significant bits
3807         return idx;
3808     }
3809 }
3810 
3811 /*
3812     `TrieBuilder` is a type used for incremental construction
3813     of $(LREF Trie)s.
3814 
3815     See $(LREF buildTrie) for generic helpers built on top of it.
3816 */
3817 @trusted private struct TrieBuilder(Value, Key, Args...)
3818 if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
3819 {
3820     import std.exception : enforce;
3821 
3822 private:
3823     // last index is not stored in table, it is used as an offset to values in a block.
3824     static if (is(Value == bool))// always pack bool
3825         alias V = BitPacked!(Value, 1);
3826     else
3827         alias V = Value;
3828     static auto deduceMaxIndex(Preds...)() // product of 2^^bitSize over all predicates in `Preds`
3829     {
3830         size_t idx = 1;
3831         foreach (v; Preds)
3832             idx *= 2^^v.bitSize; // each predicate multiplies the index space by its page size
3833         return idx;
3834     }
3835 
3836     static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
3837     {
3838         alias Prefix = Args[1..$];
3839         enum lastPageSize = 2^^Prefix[$-1].bitSize;
3840         enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
3841         enum roughedMaxIndex = // translatedMaxIndex rounded up to a multiple of lastPageSize
3842             (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
3843         // check wrap around - if wrapped, use the default deduction rule
3844         enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
3845             deduceMaxIndex!(Prefix)() : roughedMaxIndex;
3846     }
3847     else
3848     {
3849         alias Prefix = Args; // no explicit upper bound: all of Args are predicates
3850         enum maxIndex = deduceMaxIndex!(Prefix)();
3851     }
3852 
3853     alias getIndex = mapTrieIndex!(Prefix);
3854 
3855     enum lastLevel = Prefix.length-1;
3856     struct ConstructState
3857     {
3858         size_t idx_zeros, idx_ones;
3859     }
3860     // iteration over levels of Trie, each indexes its own level and thus a shortened domain
3861     size_t[Prefix.length] indices;
3862     // default filler value to use
3863     Value defValue;
3864     // this is a full-width index of next item
3865     size_t curIndex;
3866     // all-zeros page index, all-ones page index (+ indicator if there is such a page)
3867     ConstructState[Prefix.length] state;
3868     // the table being constructed
3869     MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;
3870 
3871     @disable this();
3872 
3873     //shortcut for index variable at level 'level'
3874     @property ref idx(size_t level)(){ return indices[level]; }
3875 
3876     // Writes `numVals` copies of `val` at the current position of level `level`.
3877     // This function assumes no holes in the input, so indices are going one by one.
3878     void addValue(size_t level, T)(T val, size_t numVals)
3879     {
3880         alias j = idx!level;
3881         enum pageSize = 1 << Prefix[level].bitSize; // number of slots in one page of this level
3882         if (numVals == 0)
3883             return;
3884         auto ptr = table.slice!(level);
3885         if (numVals == 1)
3886         {
3887             static if (level == Prefix.length-1)
3888                 ptr[j] = val;
3889             else
3890             {// can incur narrowing conversion
3891                 assert(j < ptr.length);
3892                 ptr[j] = force!(typeof(ptr[j]))(val);
3893             }
3894             j++;
3895             if (j % pageSize == 0) // just completed a page
3896                 spillToNextPage!level(ptr);
3897             return;
3898         }
3899         // longer row of values
3900         // get to the next page boundary
3901         immutable nextPB = (j + pageSize) & ~(pageSize-1);
3902         immutable n =  nextPB - j;// can fill right in this page
3903         if (numVals < n) //fits in current page
3904         {
3905             ptr[j .. j+numVals]  = val;
3906             j += numVals;
3907             return;
3908         }
3909         static if (level != 0)//on the first level it always fits
3910         {
3911             numVals -= n;
3912             //write till the end of current page
3913             ptr[j .. j+n]  = val;
3914             j += n;
3915             //spill to the next page
3916             spillToNextPage!level(ptr);
3917             // page at once loop
3918             if (state[level].idx_zeros != size_t.max && val == T.init)
3919             {
3920                 // a page of T.init values is already known at this level:
3921                 // point the previous level at it instead of writing whole pages
3922                 alias NextIdx = typeof(table.slice!(level-1)[0]);
3923                 addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
3924                     numVals/pageSize);
3925                 ptr = table.slice!level; //table structure might have changed
3926                 numVals %= pageSize;
3927             }
3928             else
3929             {
3930                 while (numVals >= pageSize) // write one whole page per iteration
3931                 {
3932                     numVals -= pageSize;
3933                     ptr[j .. j+pageSize]  = val;
3934                     j += pageSize;
3935                     spillToNextPage!level(ptr);
3936                 }
3937             }
3938             if (numVals)
3939             {
3940                 // the leftovers, an incomplete page
3941                 ptr[j .. j+numVals]  = val;
3942                 j += numVals;
3943             }
3944         }
3945     }
3944 
3945     void spillToNextPage(size_t level, Slice)(ref Slice ptr)
3946     {
3947         // level 0 (i.e. topmost) has 1 "page"
3948         // thus there is no upper level that would need a new page entry
3949         static if (level != 0)
3950             spillToNextPageImpl!(level)(ptr); // for level 0 this function does nothing
3951     }
3952 
3953     // Called when a page of `level` has just been completed: it either re-uses an earlier
3954     // identical page or allocates a new one, and records the page index in the previous level.
3955     void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
3956     {
3957         alias NextIdx = typeof(table.slice!(level-1)[0]);
3958         NextIdx next_lvl_index;
3959         enum pageSize = 1 << Prefix[level].bitSize;
3960         assert(idx!level % pageSize == 0);
3961         immutable last = idx!level-pageSize; // start offset of the page that was just completed
3962         const slice = ptr[idx!level - pageSize .. idx!level];
3963         size_t j;
3964         for (j=0; j<last; j+=pageSize) // compare against every earlier page
3965         {
3966             if (ptr[j .. j+pageSize] == slice)
3967             {
3968                 // get index to it, reuse ptr space for the next block
3969                 next_lvl_index = force!NextIdx(j/pageSize);
3970                 version (none)
3971                 {
3972                 import std.stdio : writefln, writeln;
3973                 writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
3974                         ,level
3975                         ,indices[level-1], pageSize, j, j+pageSize);
3976                 writeln("LEVEL(", level
3977                         , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
3978                 writeln("LEVEL(", level
3979                         , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
3980                 }
3981                 idx!level -= pageSize; // reuse this page, it is duplicate
3982                 break;
3983             }
3984         }
3985         if (j == last) // no earlier duplicate was found
3986         {
3987     L_allocate_page: // NOTE(review): no goto in this function targets this label
3988             next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
3989             if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
3990             {
3991                 state[level].idx_zeros = next_lvl_index; // remember the index of this page for later re-use
3992             }
3993             // allocate next page
3994             version (none)
3995             {
3996             import std.stdio : writefln;
3997             writefln("LEVEL(%s) page allocated: %s"
3998                      , level, arrayRepr(slice[0 .. pageSize]));
3999             writefln("LEVEL(%s) index: %s ; page at this index %s"
4000                      , level
4001                      , next_lvl_index
4002                      , arrayRepr(
4003                          table.slice!(level)
4004                           [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
4005                         ));
4006             }
4007             table.length!level = table.length!level + pageSize;
4008         }
4009     L_know_index: // NOTE(review): no goto in this function targets this label
4010         // for the previous level, values are indices to the pages in the current level
4011         addValue!(level-1)(next_lvl_index, 1);
4012         ptr = table.slice!level; //re-load the slice after moves
4013     }
4014 
4015     // Stores `v` at full-width index `idx` (a full-width index is not the same thing as a key);
4016     // every slot in [curIndex, idx) is padded with the default filler first.
4017     void putAt(size_t idx, Value v)
4018     {
4019         assert(curIndex <= idx);
4020         // pad the gap, if any (addValue returns immediately for a zero count)
4021         addValue!lastLevel(defValue, idx - curIndex);
4022         addValue!lastLevel(v, 1);
4023         curIndex = 1 + idx;
4024     }
4025 
4026     // Sets every full-width index in [idxA, idxB) to `v`, padding [curIndex, idxA) with the filler first.
4027     void putRangeAt(size_t idxA, size_t idxB, Value v)
4028     {
4029         assert(curIndex <= idxA);
4030         assert(idxA <= idxB);
4031         immutable runLength = idxB - idxA;
4032         addValue!lastLevel(defValue, idxA - curIndex);
4033         addValue!lastLevel(v, runLength);
4034         curIndex = idxB; // idxB itself is not written: the interval is open on the right
4035     }
4036 
4037     enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
4038         "duplicate key->value mapping";
4039 
4040 public:
4041     /**
4042         Creates a builder that uses `filler` as the value of
4043         every slot that is never explicitly set (the "not found" value).
4044     */
4045     this(Value filler)
4046     {
4047         defValue = filler;
4048         curIndex = 0;
4049         // size_t.max in both fields: no all-zeros / all-ones page is known yet
4050         foreach (ref levelState; state)
4051             levelState = ConstructState(size_t.max, size_t.max);
4052         table = typeof(table)(indices);
4053         // start every level with exactly one page
4054         foreach (lvl, Pred; Prefix)
4055             table.length!lvl = 1 << Pred.bitSize;
4056     }
4057 
4058     /**
4059         Store `v` in every slot of the interval that the
4060         keys `a` (inclusive) to `b` (exclusive) map to.
4061         Slots skipped before `a` receive
4062         the default filler.
4063     */
4064     void putRange(Key a, Key b, Value v)
4065     {
4066         immutable lo = getIndex(a);
4067         immutable hi = getIndex(b);
4068         enforce(lo <= hi && curIndex <= lo, errMsg); // mapped indices must never go backwards
4069         putRangeAt(lo, hi, v);
4070     }
4071 
4072     /**
4073         Store `v` in the slot that `key` maps to.
4074         Slots skipped before it receive the default filler;
4075         a key that maps below the current position is rejected by `enforce`.
4076     */
4077     void putValue(Key key, Value v)
4078     {
4079         immutable slot = getIndex(key);
4080         enforce(curIndex <= slot, errMsg);
4081         putAt(slot, v);
4082     }
4083 
4084     /// Finishes construction: pads the remaining index range with the filler and returns a `Trie` wrapping the table.
4085     auto build()
4086     {
4087         static if (maxIndex != 0) // doesn't cover full range of size_t
4088         {
4089             assert(curIndex <= maxIndex);
4090             addValue!lastLevel(defValue, maxIndex - curIndex); // fill [curIndex, maxIndex) with the filler
4091         }
4092         else
4093         {
4094             if (curIndex != 0 // couldn't wrap around
4095                 || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
4096             {
4097                 addValue!lastLevel(defValue, size_t.max - curIndex); // two calls: the total count may not fit in size_t
4098                 addValue!lastLevel(defValue, 1);
4099             }
4100             // else curIndex already completed the full range of size_t by wrapping around
4101         }
4102         return Trie!(V, Key, maxIndex, Prefix)(table);
4103     }
4104 }
4105 
4106 /**
4107     $(P A generic Trie data-structure for a fixed number of stages.
4108     The design goal is optimal speed with smallest footprint size.
4109     )
4110     $(P It's intentionally read-only and doesn't provide constructors.
4111      To construct one use a special builder,
4112      see $(LREF TrieBuilder) and $(LREF buildTrie).
4113     )
4114 
4115 */
4116 @trusted private struct Trie(Value, Key, Args...)
4117 if (isValidPrefixForTrie!(Key, Args)
4118     || (isValidPrefixForTrie!(Key, Args[1..$])
4119     && is(typeof(Args[0]) : size_t)))
4120 {
4121     import std.range.primitives : isOutputRange;
4122     static if (is(typeof(Args[0]) : size_t))
4123     {
4124         private enum maxIndex = Args[0];
4125         private enum hasBoundsCheck = true;
4126         private alias Prefix = Args[1..$];
4127     }
4128     else
4129     {
4130         private enum hasBoundsCheck = false;
4131         private alias Prefix = Args;
4132     }
4133 
4134     private this()(typeof(_table) table) // wraps an already built table
4135     {
4136         _table = table;
4137     }
4138 
4139     // only for constant Tries constructed from precompiled tables:
4140     // forwards `offsets`, `sizes` and `data` to the constructor of the underlying table type
4141     private this()(const(size_t)[] offsets, const(size_t)[] sizes,
4142         const(size_t)[] data) const
4143     {
4144         _table = typeof(_table)(offsets, sizes, data);
4145     }
4145 
4146     /**
4147         $(P Lookup the `key` in this `Trie`. )
4148 
4149         $(P The lookup always succeeds if key fits the domain
4150         provided during construction. The whole domain defined
4151         is covered so instead of not found condition
4152         the sentinel (filler) value could be used. )
4153 
4154         $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
4155         define a domain of `Trie` keys and the sentinel value. )
4156 
4157         Note:
4158         Domain range-checking is done with `assert`, so it is only active
4159         in builds that keep assertions and results in assertion failure.
4160     */
4161     TypeOfBitPacked!Value opIndex()(Key key) const
4162     {
4163         static if (hasBoundsCheck)
4164             assert(mapTrieIndex!Prefix(key) < maxIndex);
4165         size_t idx;
4166         alias p = Prefix;
4167         idx = cast(size_t) p[0](key); // the first predicate indexes the top level directly
4168         foreach (i, v; p[0..$-1]) // each level's entry selects a page in the next level; the next predicate is the offset within it
4169             idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
4170         return _table.ptr!(p.length-1)[idx]; // the last level holds the values
4171     }
4172 
4173     /// Returns `_table.bytes!n` for the underlying table. NOTE(review): the default `n = size_t.max` presumably selects all levels - confirm against `MultiArray.bytes`.
4174     @property size_t bytes(size_t n=size_t.max)() const
4175     {
4176         return _table.bytes!n;
4177     }
4178 
4179     /// Returns `bytes!n` divided by `2^^Prefix[n].bitSize`, rounded to nearest (half of the divisor is added before the integer division).
4180     @property size_t pages(size_t n)() const
4181     {
4182         return (bytes!n+2^^(Prefix[n].bitSize-1))
4183                 /2^^Prefix[n].bitSize;
4184     }
4185 
4186     /// Forwards to `_table.store(sink)`; `sink` must be an output range of `char`.
4187     void store(OutRange)(scope OutRange sink) const
4188     if (isOutputRange!(OutRange, char))
4189     {
4190         _table.store(sink);
4191     }
4192 
4193 private:
4194     MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
4195 }
4196 
4197 // Builds a tuple of `sliceBits` instances that cut the `top` bits into pieces whose widths
4198 // are listed in `sizes`, going left-to-right with the most significant bits first.
4199 template GetBitSlicing(size_t top, sizes...)
4200 {
4201     static if (sizes.length == 0)
4202         alias GetBitSlicing = AliasSeq!(); // nothing left to slice
4203     else
4204         alias GetBitSlicing =
4205             AliasSeq!(sliceBits!(top - sizes[0], top),
4206                       GetBitSlicing!(top - sizes[0], sizes[1..$]));
4207 }
4208 
4209 template callableWith(T)
4210 {
4211     template callableWith(alias Pred)
4212     {
4213         static if (is(typeof(Pred(T.init)))) // `Pred` accepts a `T`
4214         {
4215             alias Result = typeof(Pred(T.init));
4216             enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
4217         }
4218         else
4219             enum callableWith = false; // `Pred` cannot be called with a `T` at all
4220     }
4221 }
4222 
4223 /*
4224     Check if `Prefix` is a valid set of predicates
4225     for `Trie` template having `Key` as the type of keys.
4226     This requires every predicate to satisfy `callableWith!Key`,
4227     i.e. to be callable with a `Key` and return a bit-packable value.
4228 */
4229 template isValidPrefixForTrie(Key, Prefix...)
4230 {
4231     import std.meta : allSatisfy;
4232     enum isValidPrefixForTrie = allSatisfy!(callableWith!Key, Prefix); // TODO: tighten the screws
4233 }
4234 
4235 /*
4236     Check if `Args` is either a valid set of predicates, or a maximum key value
4237     followed by valid predicates, for `Trie` template having `Key` as the type of keys.
4238 */
4239 template isValidArgsForTrie(Key, Args...)
4240 {
4241     static if (Args.length > 1)
4242     {
4243         enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
4244             || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
4245     }
4246     else
4247         enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args); // pass `Key` explicitly; otherwise Args[0] is bound to the `Key` parameter and nothing is checked
4248 }
4249 
4250 @property size_t sumOfIntegerTuple(ints...)() // sum of the integer template arguments; 0 for an empty list
4251 {
4252     size_t total; // size_t.init is 0
4253     foreach (term; ints) // iterates over the compile-time sequence
4254         total += term;
4255     return total;
4256 }
4257 
4258 /**
4259     A shorthand for creating a custom multi-level fixed Trie
4260     from a `CodepointSet`. `sizes` are numbers of bits per level,
4261     with the most significant bits used first.
4262 
4263     Note: The sum of `sizes` must be equal to 21.
4264 
4265     See_Also: $(LREF toTrie), which is even simpler.
4266 
4267     Example:
4268     ---
4269     {
4270         import std.stdio;
4271         auto set = unicode("Number");
4272         auto trie = codepointSetTrie!(8, 5, 8)(set);
4273         writeln("Input code points to test:");
4274         foreach (line; stdin.byLine)
4275         {
4276             int count=0;
4277             foreach (dchar ch; line)
4278                 if (trie[ch])// is number
4279                     count++;
4280             writefln("Contains %d number code points.", count);
4281         }
4282     }
4283     ---
4284 */
4285 public template codepointSetTrie(sizes...)
4286 if (sumOfIntegerTuple!sizes == 21) // the levels together must consume exactly 21 bits
4287 {
4288     auto codepointSetTrie(Set)(Set set)
4289     if (isCodepointSet!Set)
4290     {
4291         auto trieBuilder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false); // `false` is the filler
4292         foreach (span; set.byInterval) // each interval is a pair of bounds
4293             trieBuilder.putRange(span[0], span[1], true); // code points of the set map to `true`
4294         return trieBuilder.build();
4295     }
4296 }
4297 
4298 /// Type of Trie generated by codepointSetTrie function.
4299 public template CodepointSetTrie(sizes...)
4300 if (sumOfIntegerTuple!sizes == 21) // same constraint as codepointSetTrie: `sizes` must sum to 21 bits
4301 {
4302     alias Prefix = GetBitSlicing!(21, sizes);
4303     alias CodepointSetTrie = typeof(TrieBuilder!(bool, dchar, lastDchar+1, Prefix)(false).build()); // taken from what such a builder returns
4304 }
4305 
4306 /**
4307     A slightly more general tool for building fixed `Trie`
4308     for the Unicode data.
4309 
4310     Specifically, unlike `codepointSetTrie`, it allows creating mappings
4311     of `dchar` to an arbitrary type `T`.
4312 
4313     Note: Overload taking `CodepointSet`s will naturally convert
4314     only to bool mapping `Trie`s.
4315 
4316     CodepointTrie is the type of Trie as generated by codepointTrie function.
4317 */
4318 public template codepointTrie(T, sizes...)
4319 if (sumOfIntegerTuple!sizes == 21)
4320 {
4321     alias Prefix = GetBitSlicing!(21, sizes);
4322 
4323     static if (is(TypeOfBitPacked!T == bool)) // the set overload only exists for bool-like `T`
4324     {
4325         auto codepointTrie(Set)(const scope Set set)
4326         if (isCodepointSet!Set)
4327         {
4328             return codepointSetTrie!sizes(set); // `sizes` must be forwarded: codepointSetTrie requires them to sum to 21
4329         }
4330     }
4331 
4332     /// Builds from an associative array; `defValue` is passed on to `buildTrie` as the filler.
4333     auto codepointTrie()(T[dchar] map, T defValue=T.init)
4334     {
4335         return buildTrie!(T, dchar, Prefix)(map, defValue);
4336     }
4337 
4338     // unsorted range of pairs
4339     /// Builds from a range of (value, code point) pairs; `buildTrie` is asked to sort it first.
4340     auto codepointTrie(R)(R range, T defValue=T.init)
4341     if (isInputRange!R
4342         && is(typeof(ElementType!R.init[0]) : T)
4343         && is(typeof(ElementType!R.init[1]) : dchar))
4344     {
4345         // build from unsorted array of pairs
4346         // TODO: expose index sorting functions for Trie
4347         return buildTrie!(T, dchar, Prefix)(range, defValue, true); // `true` marks the input as unsorted
4348     }
4349 }
4350 
4351 @system pure unittest
4352 {
4353     import std.algorithm.comparison : max;
4354     import std.algorithm.searching : count;
4355 
4356     // pick characters from the Greek script
4357     auto set = unicode.Greek;
4358 
4359     // a user-defined property (or an expensive function)
4360     // that we want to look up
4361     static uint luckFactor(dchar ch)
4362     {
4363         // here we consider a character lucky
4364         // if its code point has a lot of identical hex-digits
4365         // e.g. arabic letter DDAL (\u0688) has a "luck factor" of 2
4366         ubyte[6] nibbles; // 6 4-bit chunks of code point
4367         uint value = ch;
4368         foreach (i; 0 .. 6)
4369         {
4370             nibbles[i] = value & 0xF;
4371             value >>= 4;
4372         }
4373         uint luck;
4374         foreach (n; nibbles)
4375             luck = cast(uint) max(luck, count(nibbles[], n));
4376         return luck;
4377     }
4378 
4379     // only unsigned built-ins are supported at the moment
4380     alias LuckFactor = BitPacked!(uint, 3);
4381 
4382     // create a temporary associative array (AA)
4383     LuckFactor[dchar] map;
4384     foreach (ch; set.byCodepoint)
4385         map[ch] = LuckFactor(luckFactor(ch));
4386 
4387     // bits per stage are chosen randomly, feel free to optimize
4388     auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);
4389 
4390     // from now on the AA is not needed
4391     foreach (ch; set.byCodepoint)
4392         assert(trie[ch] == luckFactor(ch)); // verify
4393     // CJK is not Greek, thus it has the default value
4394     assert(trie['\u4444'] == 0);
4395     // and here is a couple of quite lucky Greek characters:
4396     // Greek small letter epsilon with dasia
4397     assert(trie['\u1F11'] == 3);
4398     // Ancient Greek metretes sign
4399     assert(trie['\U00010181'] == 3);
4400 
4401 }
4402 
4403 /// ditto
4404 public template CodepointTrie(T, sizes...)
4405 if (sumOfIntegerTuple!sizes == 21) // `sizes` must sum to 21 bits
4406 {
4407     alias Prefix = GetBitSlicing!(21, sizes);
4408     alias CodepointTrie = typeof(TrieBuilder!(T, dchar, lastDchar+1, Prefix)(T.init).build()); // taken from what such a builder returns
4409 }
4410 
4411 package(std) template cmpK0(alias Pred) // "less than" on (value, key) tuples, comparing the keys as seen through `Pred`
4412 {
4413     import std.typecons : Tuple;
4414     static bool cmpK0(Value, Key)
4415         (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
4416     {
4417         return Pred(a[1]) < Pred(b[1]); // element 1 is the key; element 0 (the value) takes no part in the ordering
4418     }
4419 }
4420 
4421 /**
4422     The most general utility for construction of `Trie`s
4423     short of using `TrieBuilder` directly.
4424 
4425     Provides a number of convenience overloads.
4426     `Args` is tuple of maximum key value followed by
4427     predicates to construct index from key.
4428 
4429     Alternatively if the first argument is not a value convertible to `Key`
4430     then the whole tuple of `Args` is treated as predicates
4431     and the maximum Key is deduced from predicates.
4432 */
4433 private template buildTrie(Value, Key, Args...)
4434 if (isValidArgsForTrie!(Key, Args))
4435 {
4436     static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
4437     {
4438         alias Prefix = Args[1..$];
4439     }
4440     else
4441         alias Prefix = Args;
4442 
4443     alias getIndex = mapTrieIndex!(Prefix);
4444 
4445     // for multi-sort
4446     template GetComparators(size_t n)
4447     {
4448         static if (n > 0)
4449             alias GetComparators =
4450                 AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
4451         else
4452             alias GetComparators = AliasSeq!();
4453     }
4454 
4455     /*
4456         Build `Trie` from a range of a Key-Value pairs,
4457         assuming it is sorted by Key as defined by the following lambda:
4458         ------
4459         (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
4460         ------
4461         Exception is thrown if it's detected that the above order doesn't hold.
4462 
4463         In other words $(LREF mapTrieIndex) should be a
4464         monotonically increasing function that maps `Key` to an integer.
4465 
4466         See_Also: $(REF sort, std,_algorithm),
4467         $(REF SortedRange, std,range),
4468         $(REF setUnion, std,_algorithm).
4469     */
4470     auto buildTrie(Range)(Range range, Value filler=Value.init)
4471     if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
4472         && is(typeof(Range.init.front[1]) : Key))
4473     {
4474         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4475         foreach (v; range)
4476             builder.putValue(v[1], v[0]);
4477         return builder.build();
4478     }
4479 
4480     /*
4481         If `Value` is bool (or BitPacked!(bool, x)) then it's possible
4482         to build `Trie` from a range of open-right intervals of `Key`s.
4483         The requirement  on the ordering of keys (and the behavior on the
4484         violation of it) is the same as for Key-Value range overload.
4485 
4486         Intervals denote ranges of !`filler` i.e. the opposite of filler.
4487         If no filler provided keys inside of the intervals map to true,
4488         and `filler` is false.
4489     */
4490     auto buildTrie(Range)(Range range, Value filler=Value.init)
4491     if (is(TypeOfBitPacked!Value ==  bool)
4492         && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
4493         && is(typeof(Range.init.front[1]) : Key))
4494     {
4495         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4496         foreach (ival; range)
4497             builder.putRange(ival[0], ival[1], !filler);
4498         return builder.build();
4499     }
4500 
4501     auto buildTrie(Range)(Range range, Value filler, bool unsorted)
4502     if (isInputRange!Range
4503         && is(typeof(Range.init.front[0]) : Value)
4504         && is(typeof(Range.init.front[1]) : Key))
4505     {
4506         import std.algorithm.sorting : multiSort;
4507         alias Comps = GetComparators!(Prefix.length);
4508         if (unsorted)
4509             multiSort!(Comps)(range);
4510         return buildTrie(range, filler);
4511     }
4512 
4513     /*
4514         If `Value` is bool (or BitPacked!(bool, x)) then it's possible
4515         to build `Trie` simply from an input range of `Key`s.
4516         The requirement  on the ordering of keys (and the behavior on the
4517         violation of it) is the same as for Key-Value range overload.
4518 
4519         Keys found in range denote !`filler` i.e. the opposite of filler.
4520         If no filler provided keys map to true, and `filler` is false.
4521     */
4522     auto buildTrie(Range)(Range range, Value filler=Value.init)
4523     if (is(TypeOfBitPacked!Value ==  bool)
4524         && isInputRange!Range && is(typeof(Range.init.front) : Key))
4525     {
4526         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4527         foreach (v; range)
4528             builder.putValue(v, !filler);
4529         return builder.build();
4530     }
4531 
4532     /*
4533         If `Key` is unsigned integer `Trie` could be constructed from array
4534         of values where array index serves as key.
4535     */
4536     auto buildTrie()(Value[] array, Value filler=Value.init)
4537     if (isUnsigned!Key)
4538     {
4539         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4540         foreach (idx, v; array)
4541             builder.putValue(idx, v);
4542         return builder.build();
4543     }
4544 
4545     /*
4546         Builds `Trie` from associative array.
4547     */
4548     auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
4549     {
4550         import std.array : array;
4551         import std.range : zip;
4552         auto range = array(zip(map.values, map.keys));
4553         return buildTrie(range, filler, true); // sort it
4554     }
4555 }
4556 
4557 // Lightweight stand-in for assumeSize: shortens mangled names and helps DMD
4558 // inline Trie functors. Despite the name, the argument is returned unmasked.
4559 struct clamp(size_t bits)
4560 {
4561     enum bitSize = bits; // declared width, in bits, of the index this level yields
4562     static size_t opCall(T)(T arg){ return arg; } // identity, converted to size_t
4563 }
4564 
4565 struct clampIdx(size_t idx, size_t bits)
4566 {
4567     enum bitSize = bits; // declared width, in bits, of the index this level yields
4568     static size_t opCall(T)(T arg){ return arg[idx]; } // element `idx` of the key, returned unmasked
4569 }
4570 
4571 /**
4572     Conceptual type that outlines the common properties of all UTF Matchers.
4573 
4574     Note: For illustration purposes only, every method
4575     call results in assertion failure.
4576     Use $(LREF utfMatcher) to obtain a concrete matcher
4577     for UTF-8 or UTF-16 encodings.
4578 */
4579 public struct MatcherConcept
4580 {
4581     /**
4582         $(P Perform a semantic equivalent 2 operations:
4583         decoding a $(CODEPOINT) at front of `inp` and testing if
4584         it belongs to the set of $(CODEPOINTS) of this matcher. )
4585 
4586         $(P The effect on `inp` depends on the kind of function called:)
4587 
4588         $(P Match. If the codepoint is found in the set then range `inp`
4589         is advanced by its size in $(S_LINK Code unit, code units),
4590         otherwise the range is not modified.)
4591 
4592         $(P Skip. The range is always advanced by the size
4593         of the tested $(CODEPOINT) regardless of the result of test.)
4594 
4595         $(P Test. The range is left unaffected regardless
4596         of the result of test.)
4597     */
4598     public bool match(Range)(ref Range inp)
4599     if (isRandomAccessRange!Range && is(ElementType!Range : char))
4600     {
4601        assert(false);
4602     }
4603 
4604     ///ditto
4605     public bool skip(Range)(ref Range inp)
4606     if (isRandomAccessRange!Range && is(ElementType!Range : char))
4607     {
4608         assert(false);
4609     }
4610 
4611     ///ditto
4612     public bool test(Range)(ref Range inp)
4613     if (isRandomAccessRange!Range && is(ElementType!Range : char))
4614     {
4615         assert(false);
4616     }
4617     ///
4618     pure @safe unittest
4619     {
4620         string truth = "2² = 4";
4621         auto m = utfMatcher!char(unicode.Number);
4622         assert(m.match(truth)); // '2' is a number all right
4623         assert(truth == "² = 4"); // skips on match
4624         assert(m.match(truth)); // so is the superscript '2'
4625         assert(!m.match(truth)); // space is not a number
4626         assert(truth == " = 4"); // unaffected on no match
4627         assert(!m.skip(truth)); // same test ...
4628         assert(truth == "= 4"); // but skips a codepoint regardless
4629         assert(!m.test(truth)); // '=' is not a number
4630         assert(truth == "= 4"); // test never affects argument
4631     }
4632 
4633     /**
4634         Advanced feature - provide direct access to a subset of matcher based on a
4635         set of known encoding lengths. Lengths are provided in
4636         $(S_LINK Code unit, code units). The sub-matcher then may do fewer
4637         operations per `test`/`match` call.
4638 
4639         Use with care as the sub-matcher won't match
4640         any $(CODEPOINTS) that have encoded length that doesn't belong
4641         to the selected set of lengths. Also the sub-matcher object references
4642         the parent matcher and must not be used past the lifetime
4643         of the latter.
4644 
4645         Another caveat of using sub-matcher is that skip is not available
4646         precisely because sub-matcher doesn't detect all lengths.
4647     */
4648     @property auto subMatcher(Lengths...)()
4649     {
4650         assert(0);
4651         return this;
4652     }
4653 
4654     pure @safe unittest
4655     {
4656         auto m = utfMatcher!char(unicode.Number);
4657         string square = "2²";
4658         // about sub-matchers
4659         assert(!m.subMatcher!(2,3,4).test(square)); // ASCII no covered
4660         assert(m.subMatcher!1.match(square)); // ASCII-only, works
4661         assert(!m.subMatcher!1.test(square)); // unicode '²'
4662         assert(m.subMatcher!(2,3,4).match(square));  //
4663         assert(square == "");
4664         wstring wsquare = "2²";
4665         auto m16 = utfMatcher!wchar(unicode.Number);
4666         // may keep ref, but the original (m16) must be kept alive
4667         auto bmp = m16.subMatcher!1;
4668         assert(bmp.match(wsquare)); // Okay, in basic multilingual plan
4669         assert(bmp.match(wsquare)); // And '²' too
4670     }
4671 }
4672 
4673 /**
4674     Test if `M` is a UTF Matcher for ranges of `C`.
4675 */
4676 public enum isUtfMatcher(M, C) = __traits(compiles, (){
4677     C[] s;
4678     auto d = s.decoder; // `decoder` is defined outside this excerpt
4679     M m;
4680     assert(is(typeof(m.match(d)) == bool)); // NOTE(review): a run-time assert of an `is` expression cannot make __traits(compiles) fail - was `static assert` intended? confirm
4681     assert(is(typeof(m.test(d)) == bool));
4682     static if (is(typeof(m.skip(d)))) // `skip` is optional: sub-matchers do not provide it
4683     {
4684         assert(is(typeof(m.skip(d)) == bool));
4685         assert(is(typeof(m.skip(s)) == bool));
4686     }
4687     assert(is(typeof(m.match(s)) == bool));
4688     assert(is(typeof(m.test(s)) == bool));
4689 });
4690 
4691 pure @safe unittest
4692 {
4693     alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
4694     alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
4695     static assert(isUtfMatcher!(CharMatcher, char));
4696     static assert(isUtfMatcher!(CharMatcher, immutable(char)));
4697     static assert(isUtfMatcher!(WcharMatcher, wchar));
4698     static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
4699 }
4700 
4701 enum Mode {
4702     alwaysSkip, // used by `skip`: the input is advanced whether or not the code point is in the set
4703     neverSkip, // used by `test`: the input is never advanced
4704     skipOnMatch // used by `match`: the input is advanced only when the code point is in the set
4705 }
4706 
4707 mixin template ForwardStrings() // lets the array overloads of match/skip/test reuse the range-based ones
4708 {
4709     private bool fwdStr(string fn, C)(ref C[] str) const @trusted
4710     {
4711         import std.utf : byCodeUnit;
4712         alias type = typeof(byCodeUnit(str)); // the range type byCodeUnit would wrap `str` in
4713         return mixin(fn~"(*cast(type*)&str)"); // calls `fn` on `str` reinterpreted in place as that range, so `fn` can advance it; NOTE(review): relies on both having the same layout - confirm
4714     }
4715 }
4716 
4717 template Utf8Matcher()
4718 {
4719     enum validSize(int sz) = sz >= 1 && sz <= 4;
4720 
4721     void badEncoding() pure @safe
4722     {
4723         import std.utf : UTFException;
4724         throw new UTFException("Invalid UTF-8 sequence");
4725     }
4726 
4727     //for 1-stage ASCII
4728     alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
4729     //for 2-stage lookup of 2 byte UTF-8 sequences
4730     alias Utf8Spec2 = AliasSeq!(bool, char[2],
4731         clampIdx!(0, 5), clampIdx!(1, 6));
4732     //ditto for 3 byte
4733     alias Utf8Spec3 = AliasSeq!(bool, char[3],
4734         clampIdx!(0, 4),
4735         clampIdx!(1, 6),
4736         clampIdx!(2, 6)
4737     );
4738     //ditto for 4 byte
4739     alias Utf8Spec4 = AliasSeq!(bool, char[4],
4740         clampIdx!(0, 3), clampIdx!(1, 6),
4741         clampIdx!(2, 6), clampIdx!(3, 6)
4742     );
4743     alias Tables = AliasSeq!(
4744         typeof(TrieBuilder!(AsciiSpec)(false).build()),
4745         typeof(TrieBuilder!(Utf8Spec2)(false).build()),
4746         typeof(TrieBuilder!(Utf8Spec3)(false).build()),
4747         typeof(TrieBuilder!(Utf8Spec4)(false).build())
4748     );
4749     alias Table(int size) = Tables[size-1];
4750 
4751     enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
4752     enum encMask(size_t size) = ((1 << size)-1)<<(8-size);
4753 
4754     char truncate()(char ch) pure @safe
4755     {
4756         // A continuation byte lies in 0x80 .. 0xBF. Subtracting 0x80 maps that range
4757         // to 0x00 .. 0x3F; any other byte ends up at 0x40 or above
4758         // (bytes below 0x80 wrap around, since `char` is unsigned).
4759         ch -= 0x80;
4760         if (ch >= 0x40)
4761         {
4762             badEncoding();
4763             return cast(char) 0; // not reached: badEncoding always throws
4764         }
4765         return ch; // the 6 payload bits
4766     }
4767 
4768     static auto encode(size_t sz)(dchar ch)
4769     if (sz > 1)
4770     {
4771         import std.utf : encodeUTF = encode;
4772         char[4] units;
4773         encodeUTF(units, ch);
4774         // strip the UTF-8 framing bits, leaving only the payload bits of each code unit
4775         units[0] &= leadMask!sz;
4776         foreach (i; 1 .. sz)
4777             units[i] &= 0x3f; //keep 6 lower bits
4778         char[sz] payload = units[0 .. sz];
4779         return payload;
4780     }
4781 
4782     auto build(Set)(Set set)
4783     {
4784         import std.algorithm.iteration : map;
4785         auto ascii = set & unicode.ASCII;
4786         auto utf8_2 = set & CodepointSet(0x80, 0x800);
4787         auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
4788         auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
4789         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
4790         auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
4791         auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
4792         auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
4793         alias Ret = Impl!(1,2,3,4);
4794         return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
4795     }
4796 
4797     // Bootstrap UTF-8 static matcher interface
4798     // from 3 primitives: tab!(size), lookup and Sizes
4799     mixin template DefMatcher()
4800     {
4801         import std.format : format;
4802         import std.meta : Erase, staticIndexOf;
4803         enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
4804         alias UniSizes = Erase!(1, Sizes);
4805 
4806         //generate dispatch code sequence for unicode parts
4807         static auto genDispatch()
4808         {
4809             string code;
4810             foreach (size; UniSizes)
4811                 code ~= format(q{
4812                     if ((ch & ~leadMask!%d) == encMask!(%d))
4813                         return lookup!(%d, mode)(inp);
4814                     else
4815                 }, size, size, size);
4816             static if (Sizes.length == 4) //covers all code unit cases
4817                 code ~= "{ badEncoding(); return false; }";
4818             else
4819                 code ~= "return false;"; //may be just fine but not covered
4820             return code;
4821         }
4822         enum dispatch = genDispatch();
4823 
4824         public bool match(Range)(ref Range inp) const
4825         if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
4826             !isDynamicArray!Range)
4827         {
4828             enum mode = Mode.skipOnMatch;
4829             assert(!inp.empty);
4830             immutable ch = inp[0];
4831             static if (hasASCII)
4832             {
4833                 if (ch < 0x80)
4834                 {
4835                     immutable r = tab!1[ch];
4836                     if (r)
4837                         inp.popFront();
4838                     return r;
4839                 }
4840                 else
4841                     mixin(dispatch);
4842             }
4843             else
4844                 mixin(dispatch);
4845         }
4846 
4847         static if (Sizes.length == 4) // can skip iff can detect all encodings
4848         {
4849             public bool skip(Range)(ref Range inp) const
4850             if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
4851                 !isDynamicArray!Range)
4852             {
4853                 enum mode = Mode.alwaysSkip;
4854                 assert(!inp.empty);
4855                 auto ch = inp[0];
4856                 static if (hasASCII)
4857                 {
4858                     if (ch < 0x80)
4859                     {
4860                         inp.popFront();
4861                         return tab!1[ch];
4862                     }
4863                     else
4864                         mixin(dispatch);
4865                 }
4866                 else
4867                     mixin(dispatch);
4868             }
4869         }
4870 
4871         public bool test(Range)(ref Range inp) const
4872         if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
4873             !isDynamicArray!Range)
4874         {
4875             enum mode = Mode.neverSkip;
4876             assert(!inp.empty);
4877             auto ch = inp[0];
4878 
4879             static if (hasASCII)
4880             {
4881                 if (ch < 0x80)
4882                     return tab!1[ch];
4883                 else
4884                     mixin(dispatch);
4885             }
4886             else
4887                 mixin(dispatch);
4888         }
4889 
4890         bool match(C)(ref C[] str) const
4891         if (isSomeChar!C)
4892         {
4893             return fwdStr!"match"(str);
4894         }
4895 
4896         bool skip(C)(ref C[] str) const
4897         if (isSomeChar!C)
4898         {
4899             return fwdStr!"skip"(str);
4900         }
4901 
4902         bool test(C)(ref C[] str) const
4903         if (isSomeChar!C)
4904         {
4905             return fwdStr!"test"(str);
4906         }
4907 
4908         mixin ForwardStrings;
4909     }
4910 
4911     struct Impl(Sizes...)
4912     {
4913         import std.meta : allSatisfy, staticMap;
4914         static assert(allSatisfy!(validSize, Sizes),
4915             "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
4916     private:
4917         //pick tables for chosen sizes
4918         alias OurTabs = staticMap!(Table, Sizes);
4919         OurTabs tables;
4920         mixin DefMatcher;
4921         //static dispatch helper UTF size ==> table
4922         alias tab(int i) = tables[i - 1];
4923 
4924         package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
4925         {
4926             return CherryPick!(Impl, SizesToPick)(&this);
4927         }
4928 
4929         bool lookup(int size, Mode mode, Range)(ref Range inp) const
4930         {
4931             import std.range : popFrontN;
4932             if (inp.length < size)
4933             {
4934                 badEncoding();
4935                 return false;
4936             }
4937             char[size] needle = void;
4938             needle[0] = leadMask!size & inp[0];
4939             static foreach (i; 1 .. size)
4940             {
4941                 needle[i] = truncate(inp[i]);
4942             }
4943             //overlong encoding checks
4944             static if (size == 2)
4945             {
4946                 //0x80-0x7FF
4947                 //got 6 bits in needle[1], must use at least 8 bits
4948                 //so needle[0] must use at least 2 bits, i.e. be >= 2
4949                 if (needle[0] < 2) badEncoding();
4950             }
4951             else static if (size == 3)
4952             {
4953                 //0x800-0xFFFF
4954                 //got 6 bits in needle[2], must use at least 12bits
4955                 //must use 6 bits in needle[1] or anything in needle[0]
4956                 if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
4957             }
4958             else static if (size == 4)
4959             {
4960                 //0x10000-0x10FFFF
4961                 //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
4962                 //must use 5 bits (or above) in needle[1] or anything in needle[0]
4963                 if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
4964             }
4965             static if (mode == Mode.alwaysSkip)
4966             {
4967                 inp.popFrontN(size);
4968                 return tab!size[needle];
4969             }
4970             else static if (mode == Mode.neverSkip)
4971             {
4972                 return tab!size[needle];
4973             }
4974             else
4975             {
4976                 static assert(mode == Mode.skipOnMatch);
4977 
4978                 if (tab!size[needle])
4979                 {
4980                     inp.popFrontN(size);
4981                     return true;
4982                 }
4983                 else
4984                     return false;
4985             }
4986         }
4987     }
4988 
4989     struct CherryPick(I, Sizes...)
4990     {
4991         import std.meta : allSatisfy;
4992         static assert(allSatisfy!(validSize, Sizes),
4993             "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
4994     private:
4995         I* m;
4996         @property auto tab(int i)() const { return m.tables[i - 1]; }
4997         bool lookup(int size, Mode mode, Range)(ref Range inp) const
4998         {
4999             return m.lookup!(size, mode)(inp);
5000         }
5001         mixin DefMatcher;
5002     }
5003 }
5004 
5005 template Utf16Matcher()
5006 {
5007     enum validSize(int sz) = sz >= 1 && sz <= 2;
5008 
5009     void badEncoding() pure @safe
5010     {
5011         import std.utf : UTFException;
5012         throw new UTFException("Invalid UTF-16 sequence");
5013     }
5014 
5015     // 1-stage ASCII
5016     alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
5017     //2-stage BMP
5018     alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
5019     //4-stage - full Unicode
5020     //assume that 0xD800 & 0xDC00 bits are cleared
5021     //thus leaving 10 bit per wchar to worry about
5022     alias UniSpec = AliasSeq!(bool, wchar[2],
5023         assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
5024         assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
5025     );
5026     alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
5027     alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
5028     alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5029 
5030     auto encode2(dchar ch) // splits a code point >= 0x1_0000 into two 10-bit halves, without the surrogate tag bits
5031     {
5032         ch -= 0x1_0000; // a code point below 0x1_0000 wraps around and trips the assert below
5033         assert(ch <= 0xF_FFFF);
5034         wchar[2] ret;
5035         //do not put surrogate bits, they are sliced off
5036         ret[0] = cast(wchar)(ch >> 10); // high 10 bits
5037         ret[1] = (ch & 0x3FF); // low 10 bits only, so that ret[1]>>6 fits the 4 bits UniSpec declares for it
5038         return ret;
5039     }
5040 
5041     auto build(Set)(Set set)
5042     {
5043         import std.algorithm.iteration : map;
5044         auto ascii = set & unicode.ASCII;
5045         auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
5046             - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
5047         auto other = set - (bmp | ascii);
5048         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
5049         auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
5050         auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
5051         alias Ret = Impl!(1,2);
5052         return Ret(asciiT, bmpT, otherT);
5053     }
5054 
5055     //bootstrap full UTF-16 matcher interface from
5056     //sizeFlags, lookupUni and ascii
5057     mixin template DefMatcher()
5058     {
5059         public bool match(Range)(ref Range inp) const
5060         if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5061             !isDynamicArray!Range)
5062         {
5063             enum mode = Mode.skipOnMatch;
5064             assert(!inp.empty);
5065             immutable ch = inp[0];
5066             static if (sizeFlags & 1)
5067             {
5068                 if (ch < 0x80)
5069                 {
5070                   if (ascii[ch])
5071                   {
5072                       inp.popFront();
5073                       return true;
5074                   }
5075                   else
5076                       return false;
5077                 }
5078                 return lookupUni!mode(inp);
5079             }
5080             else
5081                 return lookupUni!mode(inp);
5082         }
5083 
5084         static if (Sizes.length == 2)
5085         {
5086             public bool skip(Range)(ref Range inp) const
5087             if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5088                 !isDynamicArray!Range)
5089             {
5090                 enum mode = Mode.alwaysSkip;
5091                 assert(!inp.empty);
5092                 immutable ch = inp[0];
5093                 static if (sizeFlags & 1)
5094                 {
5095                     if (ch < 0x80)
5096                     {
5097                         inp.popFront();
5098                         return ascii[ch];
5099                     }
5100                     else
5101                         return lookupUni!mode(inp);
5102                 }
5103                 else
5104                     return lookupUni!mode(inp);
5105             }
5106         }
5107 
5108         public bool test(Range)(ref Range inp) const
5109         if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
5110             !isDynamicArray!Range)
5111         {
5112             enum mode = Mode.neverSkip;
5113             assert(!inp.empty);
5114             auto ch = inp[0];
5115             static if (sizeFlags & 1)
5116                 return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
5117             else
5118                 return lookupUni!mode(inp);
5119         }
5120 
5121         bool match(C)(ref C[] str) const
5122         if (isSomeChar!C)
5123         {
5124             return fwdStr!"match"(str);
5125         }
5126 
5127         bool skip(C)(ref C[] str) const
5128         if (isSomeChar!C)
5129         {
5130             return fwdStr!"skip"(str);
5131         }
5132 
5133         bool test(C)(ref C[] str) const
5134         if (isSomeChar!C)
5135         {
5136             return fwdStr!"test"(str);
5137         }
5138 
5139         mixin ForwardStrings; //dispatch strings to range versions
5140     }
5141 
5142     struct Impl(Sizes...)
5143     if (Sizes.length >= 1 && Sizes.length <= 2)
5144     {
5145     private:
5146         import std.meta : allSatisfy;
5147         static assert(allSatisfy!(validSize, Sizes),
5148             "Only lengths of 1 and 2 code units are possible in UTF-16");
             //sizeFlags is the bitwise OR of the requested sizes:
             //bit 0 (value 1) <=> 1-unit sequences, bit 1 (value 2) <=> 2-unit ones
5149         static if (Sizes.length > 1)
5150             enum sizeFlags = Sizes[0] | Sizes[1];
5151         else
5152             enum sizeFlags = Sizes[0];
5153 
             //lookup tables are only stored for the sizes that were requested
5154         static if (sizeFlags & 1)
5155         {
5156             Ascii ascii;
5157             Bmp bmp;
5158         }
5159         static if (sizeFlags & 2)
5160         {
5161             Uni uni;
5162         }
5163         mixin DefMatcher;
5164 
             //Returns a CherryPick view that holds a pointer to this matcher.
5165         package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
5166         {
5167             return CherryPick!(Impl, SizesToPick)(&this);
5168         }
5169 
             //Classifies the code point at the front of `inp` once the ASCII fast
             //path of DefMatcher did not apply. `mode` selects whether the input
             //is advanced always, only on a match, or never. Calls badEncoding()
             //on a lone low surrogate, a truncated pair, or a high surrogate that
             //is not followed by a low one.
5170         bool lookupUni(Mode mode, Range)(ref Range inp) const
5171         {
                 //after subtracting 0xD800 the high surrogates map to [0, 0x3FF]
                 //and the low surrogates to [0x400, 0x7FF]
5172             wchar x = cast(wchar)(inp[0] - 0xD800);
5173             //not a high surrogate
5174             if (x > 0x3FF)
5175             {
5176                 //low surrogate
5177                 if (x <= 0x7FF) badEncoding();
5178                 static if (sizeFlags & 1)
5179                 {
                         //read the unit before a possible popFront invalidates inp[0]
5180                     auto ch = inp[0];
5181                     static if (mode == Mode.alwaysSkip)
5182                         inp.popFront();
5183                     static if (mode == Mode.skipOnMatch)
5184                     {
5185                         if (bmp[ch])
5186                         {
5187                             inp.popFront();
5188                             return true;
5189                         }
5190                         else
5191                             return false;
5192                     }
5193                     else
5194                         return bmp[ch];
5195                 }
5196                 else //skip is not available for sub-matchers, so just false
5197                     return false;
5198             }
5199             else
5200             {
5201                 import std.range : popFrontN;
5202                 static if (sizeFlags & 2)
5203                 {
5204                     if (inp.length < 2)
5205                         badEncoding();
5206                     wchar y = cast(wchar)(inp[1] - 0xDC00);
5207                     //not a low surrogate
5208                     if (y > 0x3FF)
5209                         badEncoding();
                         //the key into `uni` is the low 10 bits of each surrogate
5210                     wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
5211                     static if (mode == Mode.alwaysSkip)
5212                         inp.popFrontN(2);
5213                     static if (mode == Mode.skipOnMatch)
5214                     {
5215                         if (uni[needle])
5216                         {
5217                             inp.popFrontN(2);
5218                             return true;
5219                         }
5220                         else
5221                             return false;
5222                     }
5223                     else
5224                         return uni[needle];
5225                 }
5226                 else //ditto
5227                     return false;
5228             }
5229         }
5230     }
5231 
5232     struct CherryPick(I, Sizes...)
5233     if (Sizes.length >= 1 && Sizes.length <= 2)
5234     {
         //A view over a matcher of type `I`, held by pointer, that re-uses its
         //tables and exposes the DefMatcher interface.
5235     private:
5236         import std.meta : allSatisfy;
5237         I* m;
             //NOTE(review): sizeFlags is copied from `I` rather than computed from
             //`Sizes`; here `Sizes` only feeds the static assert below and the
             //`Sizes.length == 2` check in DefMatcher. Confirm this is intended.
5238         enum sizeFlags = I.sizeFlags;
5239 
5240         static if (sizeFlags & 1)
5241         {
5242             @property auto ascii()() const { return m.ascii; }
5243         }
5244 
             //forwards to the wrapped matcher
5245         bool lookupUni(Mode mode, Range)(ref Range inp) const
5246         {
5247             return m.lookupUni!mode(inp);
5248         }
5249         mixin DefMatcher;
5250         static assert(allSatisfy!(validSize, Sizes),
5251             "Only lengths of 1 and 2 code units are possible in UTF-16");
5252     }
5253 }
5254 
5255 private auto utf8Matcher(Set)(Set set)
5256 {
         //Thin wrapper: returns whatever Utf8Matcher!().build produces for `set`.
5257     return Utf8Matcher!().build(set);
5258 }
5259 
5260 private auto utf16Matcher(Set)(Set set)
5261 {
         //Thin wrapper: returns whatever Utf16Matcher!().build produces for `set`.
5262     return Utf16Matcher!().build(set);
5263 }
5264 
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    Only UTF-8 (`char`) and UTF-16 (`wchar`) code units are supported;
    any other `Char` is rejected at compile time.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // Keep the message on a single line so that the compiler diagnostic
        // carries no embedded newline or indentation.
        static assert(false,
            "UTF-32 needs no decoding, and thus not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5285 
5286 
//Wraps `s` into a random-access range of code units that tracks its position
//with an index (starting at `offset`) to speed up forward iteration.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    //the code units; only ever shortened from the back
        size_t idx; //position of the current front inside `str`

        @property bool empty()
        {
            return str.length == idx;
        }

        @property size_t length()
        {
            return str.length - idx;
        }

        alias opDollar = length;

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[$ - 1];
        }

        void popFront()
        {
            ++idx;
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        void popBack()
        {
            str = str[0 .. $ - 1];
        }

        @property auto save()
        {
            return this;
        }

        //indices are relative to the current front
        auto opIndex(size_t i)
        {
            return str[idx + i];
        }

        //the slice keeps the prefix of `str` and just moves the index
        auto opSlice(size_t a, size_t b)
        {
            return Decoder(str[0 .. idx + b], idx + a);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5312 
5313 pure @safe unittest
5314 {
         // Walks a mixed ASCII / non-ASCII string with the full UTF-8 matcher
         // and two of its sub-matchers, checking verdicts and how far the
         // decoder index moves after each call.
5315     string rs = "hi! ﾈемног砀 текста";
5316     auto codec = rs.decoder;
5317     auto utf8 =  utf8Matcher(unicode.Letter);
5318     auto asc = utf8.subMatcher!(1);
5319     auto uni = utf8.subMatcher!(2,3,4);
5320 
5321     // h
5322     assert(asc.test(codec));
5323     assert(!uni.match(codec));
5324     assert(utf8.skip(codec));
5325     assert(codec.idx == 1);
5326 
5327     // i
5328     assert(asc.test(codec));
5329     assert(!uni.match(codec));
5330     assert(utf8.skip(codec));
5331     assert(codec.idx == 2);
5332 
5333     // !
5334     assert(!asc.match(codec));
5335     assert(!utf8.test(codec));
5336     assert(!utf8.skip(codec));
5337     assert(codec.idx == 3);
5338 
5339     // space
5340     assert(!asc.test(codec));
5341     assert(!utf8.test(codec));
5342     assert(!utf8.skip(codec));
5343     assert(codec.idx == 4);
5344 
         // the seven non-ASCII letters of the first word
5345     assert(utf8.test(codec));
5346     foreach (i; 0 .. 7)
5347     {
5348         assert(!asc.test(codec));
5349         assert(uni.test(codec));
5350         assert(utf8.skip(codec));
5351     }
         // the space after the word: not a letter, but skip still consumes it
5352     assert(!utf8.test(codec));
5353     assert(!utf8.skip(codec));
5354 
5355     //the same with match where applicable
5356     codec = rs.decoder;
5357     assert(utf8.match(codec));
5358     assert(codec.idx == 1);
5359     assert(utf8.match(codec));
5360     assert(codec.idx == 2);
         // a failed match must leave the position untouched
5361     assert(!utf8.match(codec));
5362     assert(codec.idx == 2);
         // step over '!' and the space
5363     assert(!utf8.skip(codec));
5364     assert(!utf8.skip(codec));
5365 
5366     foreach (i; 0 .. 7)
5367     {
5368         assert(!asc.test(codec));
5369         assert(utf8.test(codec));
5370         assert(utf8.match(codec));
5371     }
5372     auto i = codec.idx;
5373     assert(!utf8.match(codec));
5374     assert(codec.idx == i);
5375 }
5376 
pure @system unittest
{
    import std.range : stride;
    // Checks that test and match (and skip, where the matcher defines it)
    // agree on the front of `r`. The position is restored after every call,
    // so `r` is left where it started. Returns the common verdict.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // two code units, i.e. surrogate pairs (was a copy of `bmp` before)
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    // every third letter is enough to cover all encoded lengths
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5425 
5426 // cover decode fail cases of Matcher
5427 pure @safe unittest
5428 {
5429     import std.algorithm.iteration : map;
5430     import std.exception : collectException;
5431     import std.format : format;
5432     auto utf16 = utfMatcher!wchar(unicode.L);
5433     auto utf8 = utfMatcher!char(unicode.L);
5434     //decode failure cases UTF-8
         // NOTE(review): `\0x00` / `\0x80` below lex as `\0` followed by the
         // literal characters "x00" / "x80"; `\x00` / `\x80` may have been
         // intended. Confirm before changing - it alters the bytes under test.
5435     alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
5436         "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
5437         "\xCF\x00\0x00\0x00\x00");
5438     foreach (msg; fails8)
5439     {
             // each input must make `test` throw; the formatted bytes are the
             // assertion message shown when it does not
5440         assert(collectException((){
5441             auto s = msg;
                 // NOTE(review): `idx` is never read in this lambda
5442             size_t idx = 0;
5443             utf8.test(s);
5444         }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
5445     }
5446     //decode failure cases UTF-16
         // a lone high surrogate and a lone low surrogate
5447     alias fails16 = AliasSeq!([0xD811], [0xDC02]);
5448     foreach (msg; fails16)
5449     {
5450         assert(collectException((){
5451             auto s = msg.map!(x => cast(wchar) x);
5452             utf16.test(s);
5453         }()));
5454     }
5455 }
5456 
/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise. Any other value is rejected at compile time.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.


+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // the arguments are the bit widths used at each level of the trie
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // also reached for level == 0, hence no "> 4" in the wording
        static assert(false,
            "Sorry, toTrie supports only levels 1, 2, 3 and 4, use codepointSetTrie directly");
}
5491 
/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // a 3-level trie is very small and is almost as fast as the 2-level one
    // (due to CPU caches?)
    auto trie = toTrie!3(set);
    // the literal captures `trie`, so it is a delegate
    return (dchar ch) { return trie[ch]; };
}
5509 
5510 /**
5511     $(P Opaque wrapper around unsigned built-in integers and
5512     code unit (char/wchar/dchar) types.
5513     Parameter `sz` indicates that the value is confined
5514     to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
5515     packed more tightly when stored in certain
5516     data-structures like trie. )
5517 
5518     Note:
5519     $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
5520     but not vice versa. Users have to ensure the value fits in
5521     the range required and use the `cast`
5522     operator to perform the conversion.)
5523 */
5524 struct BitPacked(T, size_t sz)
5525 if (isIntegral!T || is(T:dchar))
5526 {
         // number of bits the value is declared to fit in
5527     enum bitSize = sz;
5528     T _value;
         // provides the implicit conversion to `T`
5529     alias _value this;
5530 }
5531 
5532 /*
5533     Depending on the form of the passed argument `bitSizeOf` returns
5534     the amount of bits required to represent a given type
5535     or a return type of a given functor.

         Three cases are tried in order:
         1. the argument has a `bitSize` member usable as a `size_t`;
         2. the argument's return type is a `BitPacked` instance;
         3. otherwise the full width of the type, `T.sizeof*8`.
5536 */
5537 template bitSizeOf(Args...)
5538 if (Args.length == 1)
5539 {
5540     import std.traits : ReturnType;
5541     alias T = Args[0];
5542     static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
5543     {
5544         enum bitSizeOf = T.bitSize;
5545     }
5546     else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
5547     {
             // recurse on the return type, which hits the first case
5548         enum bitSizeOf = bitSizeOf!(ReturnType!T);
5549     }
5550     else
5551     {
5552         enum bitSizeOf = T.sizeof*8;
5553     }
5554 }
5555 
/**
    Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    // true exactly when `T` is `BitPacked!(U, bits)` for some `U` and `bits`
    enum isBitPacked = is(T == BitPacked!(U, bits), U, size_t bits);
}
5567 
/**
    Gives the type `U` from $(LREF BitPacked)!(U, x)
    or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
    // the is-expression deduces `U` (and `bits`) when `T` is a BitPacked
    static if (is(T == BitPacked!(U, bits), U, size_t bits))
    {
        alias TypeOfBitPacked = U;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}
5579 
5580 /*
5581     Wrapper, used in definition of custom data structures from `Trie` template.
5582     Applying it to a unary lambda function indicates that the returned value always
5583     fits within `bits` of bits.
5584 */
5585 struct assumeSize(alias Fn, size_t bits)
5586 {
         // exposed so that the declared width can be read from the type
5587     enum bitSize = bits;
         // static opCall: `assumeSize!(Fn, bits)(arg)` simply evaluates `Fn(arg)`
5588     static auto ref opCall(T)(auto ref T arg)
5589     {
5590         return Fn(arg);
5591     }
5592 }
5593 
/*
    Functor type that extracts the bits [from, to) of an integral value,
    i.e. shifts right by `from` and keeps the low `to-from` bits.
    It exposes `bitSize` like assumeSize does, so it can be used directly
    with the `Trie` template.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    static auto opCall(T)(T x)
    out(result)
    {
        assert(result < (1 << to-from));
    }
    do
    {
        static assert(from < to);
        // `to-from` low bits set
        enum mask = (1 << (to - from)) - 1;
        static if (from == 0)
        {
            // nothing to shift out
            return x & mask;
        }
        else
        {
            return (x >> from) & mask;
        }
    }
}
5618 
// Yields the lowest 8 bits of `x`.
@safe pure nothrow @nogc uint low_8(uint x)
{
    return x & 0xFF;
}
// Yields bits 8 .. 15 of `x`, moved down to the lowest byte.
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    return (x >> 8) & 0xFF;
}
5621 alias lo8 = assumeSize!(low_8, 8);
5622 alias mlo8 = assumeSize!(midlow_8, 8);
5623 
5624 @safe pure nothrow @nogc unittest
5625 {
         // bitSizeOf reads the declared width from each of the three wrappers
5626     static assert(bitSizeOf!lo8 == 8);
5627     static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
5628     static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
5629 }
5630 
// Compile-time sequence of the integers start, start+1, ..., end-1
// (empty when start >= end).
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
    {
        alias Sequence = AliasSeq!();
    }
    else
    {
        // the head followed by the sequence of the remaining values
        alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
    }
}
5638 
5639 //---- TRIE TESTS ----
5640 @system unittest
5641 {
5642     import std.algorithm.iteration : map;
5643     import std.algorithm.sorting : sort;
5644     import std.array : array;
5645     import std.conv : text, to;
5646     import std.range : iota;
         // Prints per-level footprint statistics of a trie; compiles to nothing
         // unless version std_uni_stats is set.
5647     static trieStats(TRIE)(TRIE t)
5648     {
5649         version (std_uni_stats)
5650         {
5651             import std.stdio : writefln, writeln;
5652             writeln("---TRIE FOOTPRINT STATS---");
5653             static foreach (i; 0 .. t.table.dim)
5654             {
5655                 writefln("lvl%s = %s bytes;  %s pages"
5656                          , i, t.bytes!i, t.pages!i);
5657             }
5658             writefln("TOTAL: %s bytes", t.bytes);
5659             version (none)
5660             {
5661                 writeln("INDEX (excluding value level):");
5662                 static foreach (i; 0 .. t.table.dim-1)
5663                     writeln(t.table.slice!(i)[0 .. t.table.length!i]);
5664             }
5665             writeln("---------------------------");
5666         }
5667     }
5668     //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
5669     // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
5670     // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
5671     alias Set = CodepointSet;
         // one-level trie over the low byte; the loops below never probe 'Z' or
         // 'z' as members, only 'A'..'Y' and 'a'..'y'
5672     auto set = Set('A','Z','a','z');
5673     auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
5674     for (int a='a'; a<'z';a++)
5675         assert(trie[a]);
5676     for (int a='A'; a<'Z';a++)
5677         assert(trie[a]);
5678     for (int a=0; a<'A'; a++)
5679         assert(!trie[a]);
5680     for (int a ='Z'; a<'a'; a++)
5681         assert(!trie[a]);
5682     trieStats(trie);
5683 
         // two-level trie; the same two interval shapes repeat at offsets of 256
5684     auto redundant2 = Set(
5685         1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
5686     auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
5687     trieStats(trie2);
5688     foreach (e; redundant2.byCodepoint)
5689         assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
5690     foreach (i; 0 .. 1024)
5691     {
5692         assert(trie2[i] == (i in redundant2));
5693     }
5694 
5695 
         // three-level trie built from sliceBits over an 8-bit key space
5696     auto redundant3 = Set(
5697           2,    4,    6,    8,    16,
5698        2+16, 4+16, 16+6, 16+8, 16+16,
5699        2+32, 4+32, 32+6, 32+8,
5700       );
5701 
5702     enum max3 = 256;
5703     // sliceBits
5704     auto trie3 = buildTrie!(bool, uint, max3,
5705             sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
5706         )(redundant3.byInterval);
5707     trieStats(trie3);
5708     foreach (i; 0 .. max3)
5709         assert(trie3[i] == (i in redundant3), text(cast(uint) i));
5710 
         // four-level trie over a 16-bit key space; only membership of the
         // set's elements is checked here
5711     auto redundant4 = Set(
5712             10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
5713             1000, 2000, 3000, 4000, 5000, 6000
5714         );
5715     enum max4 = 2^^16;
5716     auto trie4 = buildTrie!(bool, size_t, max4,
5717             sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
5718         )(redundant4.byInterval);
5719     foreach (i; 0 .. max4)
5720     {
5721         if (i in redundant4)
5722             assert(trie4[i], text(cast(uint) i));
5723     }
5724     trieStats(trie4);
5725 
             // trie keyed by strings, indexed by their first character only;
             // the keys are sorted by that same index before building
5726         alias mapToS = mapTrieIndex!(useItemAt!(0, char));
5727         string[] redundantS = ["tea", "start", "orange"];
5728         redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
5729         auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
5730         // using first char only
5731         assert(redundantS == ["orange", "start", "tea"]);
5732         assert(strie["test"], text(strie["test"]));
5733         assert(!strie["aea"]);
5734         assert(strie["s"]);
5735 
5736     // a bit size test
5737     auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
5738     auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
5739     trieStats(bt);
5740     foreach (i; 0 .. 256)
5741         assert(bt[cast(ubyte) i]);
5742 }
5743 
// Index functor for `Trie`: maps an array to its element number `idx`,
// declared to fit in the full bit width of `T`.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t impl(const scope T[] arr)
    {
        return arr[idx];
    }

    alias useItemAt = assumeSize!(impl, T.sizeof * 8);
}
5750 
// Index functor for `Trie`: maps an array to its last element,
// declared to fit in the full bit width of `T`.
template useLastItem(T)
{
    size_t impl(const scope T[] arr)
    {
        return arr[$ - 1];
    }

    alias useLastItem = assumeSize!(impl, T.sizeof * 8);
}
5756 
// Sum of `bitSizeOf` over every element of `Prefix` (0 for an empty list).
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
    {
        enum fullBitSize = 0;
    }
    else
    {
        // width of the head plus the total width of the tail
        enum fullBitSize = bitSizeOf!(Prefix[0]) + fullBitSize!(Prefix[1 .. $]);
    }
}
5764 
5765 template idxTypes(Key, size_t fullBits, Prefix...)
5766 {
         // Yields one BitPacked type per level of `Prefix` except the last one.
5767     static if (Prefix.length == 1)
5768     {// the last level is value level, so no index once reduced to 1-level
5769         alias idxTypes = AliasSeq!();
5770     }
5771     else
5772     {
5773         // Important note on bit packing
5774         // Each level has to hold enough of bits to address the next one
5775         // The bottom level is known to hold full bit width
5776         // thus its size in pages is full_bit_width - size_of_last_prefix
5777         // Recurse on this notion
5778         alias idxTypes =
5779             AliasSeq!(
5780                 idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
5781                 BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
5782             );
5783     }
5784 }
5785 
5786 //============================================================================
5787 
// Three-way comparison of two property names, done loosely: both names are
// lowercased with std.ascii.toLower and whitespace, '-' and '_' are ignored.
// Returns the result of `cmp` on the normalized sequences.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;
    // keeps every character that takes part in the comparison
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }
    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5799 
5800 @safe pure unittest
5801 {
         // a result of 0 means the names are equal: '-' and case are ignored
5802     assert(!comparePropertyName("foo-bar", "fooBar"));
5803 }
5804 
5805 bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
5806 if (is(Char1 : dchar) && is(Char2 : dchar))
5807 {
         // "less than" predicate built on comparePropertyName
5808     return comparePropertyName(a, b) < 0;
5809 }
5810 
5811 //============================================================================
5812 // Utilities for compression of Unicode code point sets
5813 //============================================================================
5814 
// Appends `val` to `arr` in a variable-length form:
//   val < 2^^7   -> 1 byte, top bit clear;
//   val < 2^^13  -> 2 bytes, lead byte 0b100x_xxxx;
//   val < 2^^21  -> 3 bytes, lead byte 0b101x_xxxx (asserted).
// The bytes after the lead byte hold the remaining bits, most significant first.
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    enum ubyte twoByteTag = 0b1_00 << 5;
    enum ubyte threeByteTag = 0b1_01 << 5;

    if (val < 128)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < (1 << 13))
    {
        arr ~= cast(ubyte)(twoByteTag | (val >> 8));
        arr ~= cast(ubyte)(val & 0xFF);
        return;
    }
    assert(val < (1 << 21));
    arr ~= cast(ubyte)(threeByteTag | (val >> 16));
    arr ~= cast(ubyte)((val >> 8) & 0xFF);
    arr ~= cast(ubyte)(val & 0xFF);
}
5833 
// Reads one variable-length value from `arr` starting at `idx` and moves
// `idx` past it. A lead byte without the top bit is the value itself;
// otherwise its low 5 bits are followed by one or two more bytes.
// Throws (via enforce) when the trailing bytes run past the end of `arr`.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if ((lead & 0x80) == 0) // no top bit -> [0 .. 127]
        return lead;
    // bit 5 of the lead byte selects one or two trailing bytes
    immutable size_t trailing = (lead & 0x20) ? 2 : 1;
    enforce(idx + trailing <= arr.length, "bad code point interval encoding");
    uint val = lead & 0x1F;
    foreach (_; 0 .. trailing)
        val = (val << 8) | arr[idx++];
    return val;
}
5848 
5849 
// Encodes a range of [a, b) pairs as a byte stream: every boundary is stored
// through compressTo as the difference from the previous boundary. An upper
// bound equal to lastDchar+1 is not stored at all.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] encoded;
    uint prev = 0;
    // RLE encode
    foreach (ival; intervals)
    {
        compressTo(ival[0] - prev, encoded);
        prev = ival[0];
        // till the end of the domain so don't store it
        if (ival[1] == lastDchar+1)
            continue;
        compressTo(ival[1] - prev, encoded);
        prev = ival[1];
    }
    return encoded;
}
5868 
5869 @safe pure unittest
5870 {
5871     import std.algorithm.comparison : equal;
5872     import std.typecons : tuple;
5873 
         // every boundary is stored as the difference from the previous one:
         // 80, 127-80 = 47, 128-127 = 1, then 1 << 10 in the two-byte form
5874     auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
5875     ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
5876     assert(compressIntervals(run) == enc);
         // the last interval ends at lastDchar+1, so its upper bound is omitted
5877     auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
5878     ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
5879     assert(compressIntervals(run2) == enc2);
5880     size_t  idx = 0;
5881     assert(decompressFrom(enc, idx) == 80);
5882     assert(decompressFrom(enc, idx) == 47);
5883     assert(decompressFrom(enc, idx) == 1);
5884     assert(decompressFrom(enc, idx) == (1 << 10));
5885     idx = 0;
5886     assert(decompressFrom(enc2, idx) == 0);
5887     assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
         // round trip
5888     assert(equal(decompressIntervals(compressIntervals(run)), run));
5889     assert(equal(decompressIntervals(compressIntervals(run2)), run2));
5890 }
5891 
5892 // Creates a range of `CodepointInterval` that lazily decodes compressed data.
5893 @safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
5894 {
         // the DecompressedIntervals constructor already decodes the first interval
5895     return DecompressedIntervals(data);
5896 }
5897 
// Forward range of `CodepointInterval` that decodes a stream of
// difference-encoded interval boundaries one interval at a time.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // the encoded bytes
    size_t _idx;              // read position; size_t.max once exhausted
    CodepointInterval _front; // the most recently decoded interval

    this(const(ubyte)[] stream)
    {
        _stream = stream;
        // decode the first interval (or become empty right away)
        popFront();
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        if (_idx == _stream.length)
        {
            // nothing left to decode: switch to the "empty" marker
            _idx = size_t.max;
            return;
        }
        // the lower bound is relative to the previous upper bound
        immutable uint start = _front[1] + decompressFrom(_stream, _idx);
        _front[0] = start;
        if (_idx == _stream.length)// odd length ---> till the end
        {
            _front[1] = lastDchar+1;
            return;
        }
        // the upper bound is relative to the lower bound
        _front[1] = start + decompressFrom(_stream, _idx);
    }

    @property DecompressedIntervals save() return scope { return this; }
}
5942 
5943 @safe pure nothrow @nogc unittest
5944 {
         // compile-time check of the range interface only
5945     static assert(isInputRange!DecompressedIntervals);
5946     static assert(isForwardRange!DecompressedIntervals);
5947 }
5948 
5949 //============================================================================
5950 
5951 version (std_uni_bootstrap){}
5952 else
5953 {
5954 
// Helper for looking up code point sets: binary-searches `table` (treated as
// sorted by name under propertyNameLess) for an entry whose name equals
// `name` under comparePropertyName. Returns its index, or -1 when absent.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of entries that sort strictly before `name`
    size_t pos = names.lowerBound(name).length;
    if (pos >= names.length || comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
5967 
// Looks `name` up in `table` and, when found, stores the corresponding set
// in `dest`. Returns whether the name was found; `dest` is untouched if not.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    auto pos = findUnicodeSet!table(name);
    if (pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
5979 
     // Resolves a property name to a set and stores it in `target`.
     // Names are compared with comparePropertyName. Cumulative categories and
     // the names "graphical", "any" and "ascii" are assembled by hand here;
     // any other name is looked up in uniProps.tab.
     // Returns false only when that final table lookup fails.
5980 bool loadProperty(Set=CodepointSet, C)
5981     (const scope C[] name, ref Set target) pure
5982 {
5983     import std.internal.unicode_tables : uniProps; // generated file
5984     alias ucmp = comparePropertyName;
5985     // conjure cumulative properties by hand
5986     if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
5987     {
5988         target = asSet(uniProps.Lu);
5989         target |= asSet(uniProps.Ll);
5990         target |= asSet(uniProps.Lt);
5991         target |= asSet(uniProps.Lo);
5992         target |= asSet(uniProps.Lm);
5993     }
5994     else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
5995     {
5996         target = asSet(uniProps.Ll);
5997         target |= asSet(uniProps.Lu);
5998         target |= asSet(uniProps.Lt);// Title case
5999     }
6000     else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
6001     {
6002         target = asSet(uniProps.Mn);
6003         target |= asSet(uniProps.Mc);
6004         target |= asSet(uniProps.Me);
6005     }
6006     else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
6007     {
6008         target = asSet(uniProps.Nd);
6009         target |= asSet(uniProps.Nl);
6010         target |= asSet(uniProps.No);
6011     }
6012     else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
6013     {
6014         target = asSet(uniProps.Pc);
6015         target |= asSet(uniProps.Pd);
6016         target |= asSet(uniProps.Ps);
6017         target |= asSet(uniProps.Pe);
6018         target |= asSet(uniProps.Pi);
6019         target |= asSet(uniProps.Pf);
6020         target |= asSet(uniProps.Po);
6021     }
6022     else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
6023     {
6024         target = asSet(uniProps.Sm);
6025         target |= asSet(uniProps.Sc);
6026         target |= asSet(uniProps.Sk);
6027         target |= asSet(uniProps.So);
6028     }
6029     else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
6030     {
6031         target = asSet(uniProps.Zs);
6032         target |= asSet(uniProps.Zl);
6033         target |= asSet(uniProps.Zp);
6034     }
6035     else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
6036     {
6037         target = asSet(uniProps.Cc);
6038         target |= asSet(uniProps.Cf);
6039         target |= asSet(uniProps.Cs);
6040         target |= asSet(uniProps.Co);
6041         target |= asSet(uniProps.Cn);
6042     }
6043     else if (ucmp(name, "graphical") == 0)
6044     {
             // Alphabetic plus all marks, numbers, punctuation and symbols;
             // of the separators only Zs is included
6045         target = asSet(uniProps.Alphabetic);
6046 
6047         target |= asSet(uniProps.Mn);
6048         target |= asSet(uniProps.Mc);
6049         target |= asSet(uniProps.Me);
6050 
6051         target |= asSet(uniProps.Nd);
6052         target |= asSet(uniProps.Nl);
6053         target |= asSet(uniProps.No);
6054 
6055         target |= asSet(uniProps.Pc);
6056         target |= asSet(uniProps.Pd);
6057         target |= asSet(uniProps.Ps);
6058         target |= asSet(uniProps.Pe);
6059         target |= asSet(uniProps.Pi);
6060         target |= asSet(uniProps.Pf);
6061         target |= asSet(uniProps.Po);
6062 
6063         target |= asSet(uniProps.Zs);
6064 
6065         target |= asSet(uniProps.Sm);
6066         target |= asSet(uniProps.Sc);
6067         target |= asSet(uniProps.Sk);
6068         target |= asSet(uniProps.So);
6069     }
6070     else if (ucmp(name, "any") == 0)
6071         target = Set.fromIntervals(0, 0x110000);
6072     else if (ucmp(name, "ascii") == 0)
6073         target = Set.fromIntervals(0, 0x80);
6074     else
             // not a hand-built name: defer to the generated table
6075         return loadUnicodeSet!(uniProps.tab)(name, target);
6076     return true;
6077 }
6078 
6079 // CTFE-only helper for checking property names at compile-time.
     // Returns true when `name` matches (per comparePropertyName) one of the
     // cumulative names that loadProperty assembles by hand. The list has to
     // mirror loadProperty: a name missing here is still accepted by the
     // run-time path through loadProperty but cannot be recognised by
     // compile-time checks that rely on this helper.
6080 @safe bool isPrettyPropertyName(C)(const scope C[] name)
6081 {
6082     import std.algorithm.searching : find;
6083     auto names = [
6084         "L", "Letter",
6085         "LC", "Cased Letter",
6086         "M", "Mark",
6087         "N", "Number",
6088         "P", "Punctuation",
6089         "S", "Symbol",
6090         "Z", "Separator",
             "C", "Other", // handled by loadProperty but previously absent here
6091         "Graphical",
6092         "any",
6093         "ascii"
6094     ];
6095     auto x = find!(x => comparePropertyName(x, name) == 0)(names);
6096     return !x.empty;
6097 }
6098 
6099 // CTFE-only, not optimized: true when findUnicodeSet reports a
     // non-negative result for `name` in `table`
6100 @safe private static bool findSetName(alias table, C)(const scope C[] name)
6101 {
6102     immutable position = findUnicodeSet!table(name);
         return position >= 0;
6103 }
6104 
6105 template SetSearcher(alias table, string kind)
6106 {
         // Mixin providing two lookups over `table`; `kind` only appears in the
         // error messages.
6107     /// Run-time checked search.
6108     static auto opCall(C)(const scope C[] name)
6109     if (is(C : dchar))
6110     {
6111         import std.conv : to;
6112         CodepointSet found;
6113         if (!loadUnicodeSet!table(name, found))
             {
6115             throw new Exception("No unicode set for "~kind~" by name "
6116                 ~name.to!string()~" was found.");
             }
6114         return found;
6117     }
6118     /// Compile-time checked search.
6119     static @property auto opDispatch(string name)()
6120     {
6121         static if (!findSetName!table(name))
6122         {
                 // unknown name: stop compilation with a descriptive message
6128             static assert(false, "No unicode set for "~kind~" by name "
6129                 ~name~" was found.");
6126         }
6127         else
             {
6123             CodepointSet found;
6124             loadUnicodeSet!table(name, found);
6125             return found;
             }
6130     }
6131 }
6132 
6133 // Characters that need escaping when a string is posed as a regular expression.
     // UnicodeSetParser.parseCharTerm accepts a backslash followed by any of
     // these as that literal character.
6134 package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
6135     ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
6136 
6137 package(std) CodepointSet memoizeExpr(string expr)()
6138 {
         // Evaluates the D expression `expr` and remembers the result in a
         // function-local static (one per distinct `expr`, since `expr` is a
         // template argument). During CTFE nothing is cached.
6139     if (__ctfe)
6140         return mixin(expr);
6142     static typeof(mixin(expr)) cached;
6143     static bool ready;
6144     if (ready)
             return cached;
         // first run-time call: compute once and keep the value
6146     cached = mixin(expr);
6147     ready = true;
6149     return cached;
6150 }
6151 
6152 //property for \w character class: the union of Alphabetic, Mn, Mc, Me, Nd
     //and Pc, computed through memoizeExpr so run-time calls reuse one value
6153 package(std) @property CodepointSet wordCharacter() @safe
6154 {
6155     return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
6156         | unicode.Me | unicode.Nd | unicode.Pc")();
6157 }
6158 
6159 //minimal array-backed stack; generic in case something other than the parser needs it
6160 package(std) struct Stack(T)
6161 {
6162 @safe:
6163     T[] data;
6164     @property bool empty(){ return data.length == 0; }
6165 
6166     @property size_t length(){ return data.length; }
6167 
6168     void push(T val){ data ~= val;  }
6169 
         // removes and returns the most recently pushed value; must not be empty
6170     @trusted T pop()
6171     {
6172         assert(!empty);
             immutable lastIndex = data.length - 1;
6173         auto val = data[lastIndex];
6174         data = data[0 .. lastIndex];
             // let a later push reuse the freed slot (not available during CTFE)
6175         if (!__ctfe)
6176             cast(void) data.assumeSafeAppend();
6177         return val;
6178     }
6179 
         // reference to the most recently pushed value; must not be empty
6180     @property ref T top()
6181     {
6182         assert(!empty);
6183         return data[data.length - 1];
6184     }
6185 }
6186 
6187 //reads exactly maxDigit hex digits from the front of str and returns their value;
6188 //the digits are consumed on success, an Exception is thrown on failure
     //(input too short, a non-hex character, or a value above 0x10FFFF)
6189 package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
6190 {
6191     import std.exception : enforce;
6192     //std.conv.parse is both @system and bogus
6193     uint val;
6194     foreach (_; 0 .. maxDigit)
6195     {
6196         enforce(!str.empty, "incomplete escape sequence");
6197         //accepts ascii only, so it's OK to index directly
6198         immutable current = str.front;
             uint digit;
6199         if (current >= '0' && current <= '9')
6200             digit = current - '0';
6201         else if (current >= 'a' && current <= 'f')
6202             digit = current - 'a' + 10;
6203         else if (current >= 'A' && current <= 'F')
6204             digit = current - 'A' + 10;
6205         else
6206             throw new Exception("invalid escape sequence");
             val = val * 16 + digit;
6207         str.popFront();
6208     }
6209     enforce(val <= 0x10FFFF, "invalid codepoint");
6210     return val;
6211 }
6212 
6213 @safe unittest
6214 {
         // parseUniHex: malformed digits, well-formed values, out-of-range value
6215     import std.algorithm.searching : canFind;
6216     import std.exception : collectException;
6217     string[] malformed = [ "000j", "000z", "FffG", "0Z"];
6218     string[] wellFormed = [ "01", "ff", "00af", "10FFFF" ];
6219     int[] expected = [ 1, 0xFF, 0xAF, 0x10FFFF ];
6220     foreach (text; malformed)
         {
6221         auto failure = collectException(parseUniHex(text, text.length));
6222         assert(failure.msg.canFind("invalid escape sequence"));
         }
6223     foreach (idx, text; wellFormed)
6224         assert(parseUniHex(text, text.length) == expected[idx]);
         // eight valid hex digits whose value exceeds 0x10FFFF
6225     string tooLarge = "0011FFFF";
6226     auto rangeFailure = collectException(parseUniHex(tooLarge, tooLarge.length));
6227     assert(rangeFailure.msg.canFind("invalid codepoint"));
6228 }
6229 
6230 auto caseEnclose(CodepointSet set)
6231 {
         // Returns `set` extended with every simple case folding of each of its
         // members that is also in unicode.LC.
         // The cased members are captured first, so growing `set` below does not
         // affect what is iterated.
6232     auto casedMembers = set & unicode.LC;
6233     foreach (dchar letter; casedMembers.byCodepoint)
6235         foreach (variant; simpleCaseFoldings(letter))
6236             set |= variant;
6238     return set;
6239 }
6240 
6241 /+
6242     Fetch the codepoint set that `unicode(name)` yields, optionally widened by
         caseEnclose (when `casefold` is set) and then inverted (when `negated`
         is set) - in that order.
6243 +/
6244 CodepointSet getUnicodeSet(const scope char[] name, bool negated,  bool casefold) @safe
6245 {
6246     CodepointSet result = unicode(name);
6247     //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
6248     if (casefold)
6249         result = caseEnclose(result);
6250     return negated ? result.inverted : result;
6253 }
6254 
6255 struct UnicodeSetParser(Range)
6256 {
         // Parser for bracketed code point set expressions ('[...]') read from
         // `range`. Inside the brackets it understands literal characters,
         // ranges (a-z), backslash escapes, nested sets and the twin-symbol
         // operators || (union), -- (difference), ~~ (symmetric difference)
         // and && (intersection). When `casefold_` is set, every literal
         // character and every range contributes all of its simple case
         // foldings.
6257     import std.exception : enforce;
6258     import std.typecons : tuple, Tuple;
6259     Range range;
6260     bool casefold_;
6261 
         // input range primitives forwarded to the underlying range
6262     @property bool empty(){ return range.empty; }
6263     @property dchar front(){ return range.front; }
6264     void popFront(){ range.popFront(); }
6265 
6266     //CodepointSet operations relatively in order of priority
6267     enum Operator:uint {
6268         Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
6269     }
6270 
6271     //parse unit of CodepointSet spec, most notably escape sequences and char ranges
6272     //also fetches next set operation
         //returns the set collected so far together with the operator that ended
         //the term (Operator.None when the term ended at ']')
6273     Tuple!(CodepointSet,Operator) parseCharTerm()
6274     {
6275         import std.range : drop;
6276         enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
6277         enum State{ Start, Char, Escape, CharDash, CharDashEscape,
6278             PotentialTwinSymbolOperator }
6279         Operator op = Operator.None;
6280         dchar last;
6281         CodepointSet set;
6282         State state = State.Start;
6283 
             // add a single code point, expanded to its simple case foldings
             // when case folding is requested
6284         void addWithFlags(ref CodepointSet set, uint ch)
6285         {
6286             if (casefold_)
6287             {
6288                 auto foldings = simpleCaseFoldings(ch);
6289                 foreach (v; foldings)
6290                     set |= v;
6291             }
6292             else
6293                 set |= ch;
6294         }
6295 
             // add the inclusive range [first, lastIncl]; with case folding each
             // member is added through addWithFlags, otherwise as one interval
             void addRangeWithFlags(ref CodepointSet set, uint first, uint lastIncl)
             {
                 if (casefold_)
                 {
                     for (uint ch = first; ch <= lastIncl; ch++)
                         addWithFlags(set, ch);
                 }
                 else
                     set.add(first, lastIncl + 1);
             }

             // map a doubled operator symbol to the operation it denotes
6296         static Operator twinSymbolOperator(dchar symbol)
6297         {
6298             switch (symbol)
6299             {
6300             case '|':
6301                 return Operator.Union;
6302             case '-':
6303                 return Operator.Difference;
6304             case '~':
6305                 return Operator.SymDifference;
6306             case '&':
6307                 return Operator.Intersection;
6308             default:
6309                 assert(false);
6310             }
6311         }
6312 
             // state machine: each iteration inspects `front`; `last` holds a
             // character that has been read but not yet added to `set`
6313         L_CharTermLoop:
6314         for (;;)
6315         {
6316             final switch (state)
6317             {
6318             case State.Start:
6319                 switch (front)
6320                 {
6321                 case '|':
6322                 case '-':
6323                 case '~':
6324                 case '&':
6325                     state = State.PotentialTwinSymbolOperator;
6326                     last = front;
6327                     break;
6328                 case '[':
                         // a nested set is combined by implicit union
6329                     op = Operator.Union;
6330                     goto case;
6331                 case ']':
6332                     break L_CharTermLoop;
6333                 case '\\':
6334                     state = State.Escape;
6335                     break;
6336                 default:
6337                     state = State.Char;
6338                     last = front;
6339                 }
6340                 break;
6341             case State.Char:
6342                 // xxx last front xxx
6343                 switch (front)
6344                 {
6345                 case '|':
6346                 case '~':
6347                 case '&':
6348                     // then last is treated as normal char and added as implicit union
6349                     state = State.PotentialTwinSymbolOperator;
6350                     addWithFlags(set, last);
6351                     last = front;
6352                     break;
6353                 case '-': // still need more info
6354                     state = State.CharDash;
6355                     break;
6356                 case '\\':
                         // flush the pending literal like every other branch
                         // does, so it is case folded when casefold_ is set
6357                     addWithFlags(set, last);
6358                     state = State.Escape;
6359                     break;
6360                 case '[':
6361                     op = Operator.Union;
6362                     goto case;
6363                 case ']':
6364                     addWithFlags(set, last);
6365                     break L_CharTermLoop;
6366                 default:
6367                     state = State.Char;
6368                     addWithFlags(set, last);
6369                     last = front;
6370                 }
6371                 break;
6372             case State.PotentialTwinSymbolOperator:
6373                 // xxx last front xxxx
6374                 // where last = [|-&~]
6375                 if (front == last)
6376                 {
6377                     op = twinSymbolOperator(last);
6378                     popFront();//skip second twin char
6379                     break L_CharTermLoop;
6380                 }
                     // not doubled: `last` is an ordinary character
6381                 goto case State.Char;
6382             case State.Escape:
6383                 // xxx \ front xxx
6384                 switch (front)
6385                 {
6386                 case 'f':
6387                     last = '\f';
6388                     state = State.Char;
6389                     break;
6390                 case 'n':
6391                     last = '\n';
6392                     state = State.Char;
6393                     break;
6394                 case 'r':
6395                     last = '\r';
6396                     state = State.Char;
6397                     break;
6398                 case 't':
6399                     last = '\t';
6400                     state = State.Char;
6401                     break;
6402                 case 'v':
6403                     last = '\v';
6404                     state = State.Char;
6405                     break;
6406                 case 'c':
6407                     last = unicode.parseControlCode(this);
6408                     state = State.Char;
6409                     break;
                     // one case label per escapable character, all sharing the body below
6410                 foreach (val; Escapables)
6411                 {
6412                 case val:
6413                 }
6414                     last = front;
6415                     state = State.Char;
6416                     break;
6417                 case 'p':
6418                     set.add(unicode.parsePropertySpec(this, false, casefold_));
6419                     state = State.Start;
6420                     continue L_CharTermLoop; //next char already fetched
6421                 case 'P':
6422                     set.add(unicode.parsePropertySpec(this, true, casefold_));
6423                     state = State.Start;
6424                     continue L_CharTermLoop; //next char already fetched
6425                 case 'x':
6426                     popFront();
6427                     last = parseUniHex(this, 2);
6428                     state = State.Char;
6429                     continue L_CharTermLoop;
6430                 case 'u':
6431                     popFront();
6432                     last = parseUniHex(this, 4);
6433                     state = State.Char;
6434                     continue L_CharTermLoop;
6435                 case 'U':
6436                     popFront();
6437                     last = parseUniHex(this, 8);
6438                     state = State.Char;
6439                     continue L_CharTermLoop;
6440                 case 'd':
6441                     set.add(unicode.Nd);
6442                     state = State.Start;
6443                     break;
6444                 case 'D':
6445                     set.add(unicode.Nd.inverted);
6446                     state = State.Start;
6447                     break;
6448                 case 's':
6449                     set.add(unicode.White_Space);
6450                     state = State.Start;
6451                     break;
6452                 case 'S':
6453                     set.add(unicode.White_Space.inverted);
6454                     state = State.Start;
6455                     break;
6456                 case 'w':
6457                     set.add(wordCharacter);
6458                     state = State.Start;
6459                     break;
6460                 case 'W':
6461                     set.add(wordCharacter.inverted);
6462                     state = State.Start;
6463                     break;
6464                 default:
6465                     if (front >= privateUseStart && front <= privateUseEnd)
6466                         enforce(false, "no matching ']' found while parsing character class");
6467                     enforce(false, "invalid escape sequence");
6468                 }
6469                 break;
6470             case State.CharDash:
6471                 // xxx last - front xxx
6472                 switch (front)
6473                 {
6474                 case '[':
6475                     op = Operator.Union;
6476                     goto case;
6477                 case ']':
6478                     //means dash is a single char not an interval specifier
6479                     addWithFlags(set, last);
6480                     addWithFlags(set, '-');
6481                     break L_CharTermLoop;
6482                  case '-'://set Difference again
6483                     addWithFlags(set, last);
6484                     op = Operator.Difference;
6485                     popFront();//skip '-'
6486                     break L_CharTermLoop;
6487                 case '\\':
6488                     state = State.CharDashEscape;
6489                     break;
6490                 default:
6491                     enforce(last <= front, "inverted range");
6492                     addRangeWithFlags(set, last, front);
6499                     state = State.Start;
6500                 }
6501                 break;
6502             case State.CharDashEscape:
6503             //xxx last - \ front xxx
                 // ranges ending in an escape go through addRangeWithFlags so
                 // that they are case folded exactly like plain a-z ranges
6504                 uint end;
6505                 switch (front)
6506                 {
6507                 case 'f':
6508                     end = '\f';
6509                     break;
6510                 case 'n':
6511                     end = '\n';
6512                     break;
6513                 case 'r':
6514                     end = '\r';
6515                     break;
6516                 case 't':
6517                     end = '\t';
6518                     break;
6519                 case 'v':
6520                     end = '\v';
6521                     break;
6522                 foreach (val; Escapables)
6523                 {
6524                 case val:
6525                 }
6526                     end = front;
6527                     break;
6528                 case 'c':
6529                     end = unicode.parseControlCode(this);
6530                     break;
6531                 case 'x':
6532                     popFront();
6533                     end = parseUniHex(this, 2);
6534                     enforce(last <= end,"inverted range");
6535                     addRangeWithFlags(set, last, end);
6536                     state = State.Start;
6537                     continue L_CharTermLoop;
6538                 case 'u':
6539                     popFront();
6540                     end = parseUniHex(this, 4);
6541                     enforce(last <= end,"inverted range");
6542                     addRangeWithFlags(set, last, end);
6543                     state = State.Start;
6544                     continue L_CharTermLoop;
6545                 case 'U':
6546                     popFront();
6547                     end = parseUniHex(this, 8);
6548                     enforce(last <= end,"inverted range");
6549                     addRangeWithFlags(set, last, end);
6550                     state = State.Start;
6551                     continue L_CharTermLoop;
6552                 default:
6553                     if (front >= privateUseStart && front <= privateUseEnd)
6554                         enforce(false, "no matching ']' found while parsing character class");
6555                     enforce(false, "invalid escape sequence");
6556                 }
6557                 // Lookahead to check if it's a \T
6558                 // where T is sub-pattern terminator in multi-pattern scheme
6559                 auto lookahead = range.save.drop(1);
6560                 if (end == '\\' && !lookahead.empty)
6561                 {
6562                     if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
6563                         enforce(false, "no matching ']' found while parsing character class");
6564                 }
6565                 enforce(last <= end,"inverted range");
6566                 addRangeWithFlags(set, last, end);
6567                 state = State.Start;
6568                 break;
6569             }
6570             popFront();
6571             enforce(!empty, "unexpected end of CodepointSet");
6572         }
6573         return tuple(set, op);
6574     }
6575 
6576     alias ValStack = Stack!(CodepointSet);
6577     alias OpStack = Stack!(Operator);
6578 
         // Parses one complete '[...]' expression (including nested sets) and
         // returns the resulting set. Operands are kept on a value stack and
         // pending operators on an operator stack. Syntax errors are reported
         // through enforce.
6579     CodepointSet parseSet()
6580     {
6581         ValStack vstack;
6582         OpStack opstack;
6583         import std.functional : unaryFun;
6584         enforce(!empty, "unexpected end of input");
6585         enforce(front == '[', "expected '[' at the start of unicode set");
6586         //
             // apply `op` to the top of `stack`; false for Open/None
6587         static bool apply(Operator op, ref ValStack stack)
6588         {
6589             switch (op)
6590             {
6591             case Operator.Negate:
6592                 enforce(!stack.empty, "no operand for '^'");
6593                 stack.top = stack.top.inverted;
6594                 break;
6595             case Operator.Union:
6596                 auto s = stack.pop();//2nd operand
6597                 enforce(!stack.empty, "no operand for '||'");
6598                 stack.top.add(s);
6599                 break;
6600             case Operator.Difference:
6601                 auto s = stack.pop();//2nd operand
6602                 enforce(!stack.empty, "no operand for '--'");
6603                 stack.top.sub(s);
6604                 break;
6605             case Operator.SymDifference:
6606                 auto s = stack.pop();//2nd operand
6607                 enforce(!stack.empty, "no operand for '~~'");
6608                 stack.top ~= s;
6609                 break;
6610             case Operator.Intersection:
6611                 auto s = stack.pop();//2nd operand
6612                 enforce(!stack.empty, "no operand for '&&'");
6613                 stack.top.intersect(s);
6614                 break;
6615             default:
6616                 return false;
6617             }
6618             return true;
6619         }
             // pop and apply operators while `cond` holds for the top one;
             // false when an operator cannot be applied or the operator stack
             // runs empty
6620         static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
6621         {
6622             while (cond(opstack.top))
6623             {
6624                 if (!apply(opstack.pop(),vstack))
6625                     return false;//syntax error
6626                 if (opstack.empty)
6627                     return false;
6628             }
6629             return true;
6630         }
6631 
6632         L_CharsetLoop:
6633         do
6634         {
6635             switch (front)
6636             {
6637             case '[':
6638                 opstack.push(Operator.Open);
6639                 popFront();
6640                 enforce(!empty, "unexpected end of character class");
6641                 if (front == '^')
6642                 {
6643                     opstack.push(Operator.Negate);
6644                     popFront();
6645                     enforce(!empty, "unexpected end of character class");
6646                 }
6647                 else if (front == ']') // []...] is special cased
6648                 {
6649                     popFront();
6650                     enforce(!empty, "wrong character set");
6651                     auto pair = parseCharTerm();
6652                     pair[0].add(']', ']'+1);
6653                     if (pair[1] != Operator.None)
6654                     {
6655                         if (opstack.top == Operator.Union)
6656                             unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
6657                         opstack.push(pair[1]);
6658                     }
6659                     vstack.push(pair[0]);
6660                 }
6661                 break;
6662             case ']':
                     // close the innermost set: apply everything down to its Open marker
6663                 enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
6664                     "character class syntax error");
6665                 enforce(!opstack.empty, "unmatched ']'");
6666                 opstack.pop();
6667                 popFront();
6668                 if (opstack.empty)
6669                     break L_CharsetLoop;
6670                 auto pair  = parseCharTerm();
6671                 if (!pair[0].empty)//not only operator e.g. -- or ~~
6672                 {
6673                     vstack.top.add(pair[0]);//apply union
6674                 }
6675                 if (pair[1] != Operator.None)
6676                 {
6677                     if (opstack.top == Operator.Union)
6678                         unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
6679                     opstack.push(pair[1]);
6680                 }
6681                 break;
6682             //
6683             default://yet another pair of term(op)?
6684                 auto pair = parseCharTerm();
6685                 if (pair[1] != Operator.None)
6686                 {
6687                     if (opstack.top == Operator.Union)
6688                         unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
6689                     opstack.push(pair[1]);
6690                 }
6691                 vstack.push(pair[0]);
6692             }
6693 
6694         }while (!empty || !opstack.empty);
6695         while (!opstack.empty)
6696             apply(opstack.pop(),vstack);
6697         assert(vstack.length == 1);
6698         return vstack.top;
6699     }
6700 }
6701 
6702 /**
6703     A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
6704     a block, script or general category.
6705 
6706     It uses well defined standard rules of property name lookup.
6707     This includes fuzzy matching of names, so that
6708     'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
6709     and yield the same set of white space $(CHARACTERS).
6710 */
6711 @safe public struct unicode
6712 {
6713     import std.exception : enforce;
6714     /**
6715         Performs the lookup of set of $(CODEPOINTS)
6716         with compile-time correctness checking.
6717         This short-cut version combines 3 searches:
6718         across blocks, scripts, and common binary properties.
6719 
6720         Note that since scripts and blocks overlap the
6721         usual trick to disambiguate is used - to get a block use
6722         `unicode.InBlockName`, to search a script
6723         use `unicode.ScriptName`.
6724 
             A `name` for which no set is found is rejected at compile time.
             $(BR)
6725         See_Also: $(LREF block), $(LREF script)
6726         and (not included in this search) $(LREF hangulSyllableType).
6727     */
6728 
6729     static @property auto opDispatch(string name)() pure
6730     {
             // the name check runs at compile time; only known names reach loadAny
6731         static if (findAny(name))
6732             return loadAny(name);
6733         else
6734             static assert(false, "No unicode set by name "~name~" was found.");
6735     }
6736 
6737     ///
6738     @safe unittest
6739     {
6740         import std.exception : collectException;
6741         auto ascii = unicode.ASCII;
6742         assert(ascii['A']);
6743         assert(ascii['~']);
             // U+00E0 is not part of the ASCII set
6744         assert(!ascii['\u00e0']);
6745         // matching is case-insensitive
6746         assert(ascii == unicode.ascII);
6747         assert(!ascii['à']);
6748         // underscores, '-' and whitespace in names are ignored too
6749         auto latin = unicode.in_latin1_Supplement;
6750         assert(latin['à']);
6751         assert(!latin['$']);
6752         // BTW Latin 1 Supplement is a block, hence "In" prefix
6753         assert(latin == unicode("In Latin 1 Supplement"));
6754         // run-time look up throws if no such set is found
             // ("InCyrilliac" is deliberately misspelled)
6755         assert(collectException(unicode("InCyrilliac")));
6756     }
6757 
6758     /**
6759         The same lookup across blocks, scripts, or binary properties,
6760         but performed at run-time.
6761         This version is provided for cases where `name`
6762         is not known beforehand; otherwise compile-time
6763         checked $(LREF opDispatch) is typically a better choice.
6764 
6765         See the $(S_LINK Unicode properties, table of properties) for available
6766         sets.
6767 
             Params:
                 name = name of the set to look up; its element type must
                        convert implicitly to `dchar`
6767     */
6768     static auto opCall(C)(const scope C[] name)
6769     if (is(C : dchar))
6770     {
6771         return loadAny(name);
6772     }
6773 
6774     /**
6775         Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.
6776 
6777         Note:
6778         Here block names are unambiguous as no scripts are searched
6779         and thus to search use simply `unicode.block.BlockName` notation.
6780 
6781         See $(S_LINK Unicode properties, table of properties) for available sets.
6782         See_Also: $(S_LINK Unicode properties, table of properties).
6783     */
6784     struct block
6785     {
6786         import std.internal.unicode_tables : blocks; // generated file
             // mixes in opCall (run-time lookup, throws when no block matches)
             // and opDispatch (compile-time checked lookup) over blocks.tab
6787         mixin SetSearcher!(blocks.tab, "block");
6788     }
6789 
6790     ///
6791     @safe unittest
6792     {
6793         // use .block for explicitness
             // (the short-cut lookup needs the "In" prefix to name the same block)
6794         assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
6795     }
6796 
6797     /**
6798         Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.
6799 
6800         See the $(S_LINK Unicode properties, table of properties) for available
6801         sets.
6802     */
6803     struct script
6804     {
6805         import std.internal.unicode_tables : scripts; // generated file
             // mixes in opCall (run-time lookup, throws when no script matches)
             // and opDispatch (compile-time checked lookup) over scripts.tab
6806         mixin SetSearcher!(scripts.tab, "script");
6807     }
6808 
6809     ///
6810     @safe unittest
6811     {
6812         auto arabicScript = unicode.script.arabic;
6813         auto arabicBlock = unicode.block.arabic;
6814         // there is an intersection between script and block
6815         assert(arabicBlock['؁']);
6816         assert(arabicScript['؁']);
6817         // but they are different
6818         assert(arabicBlock != arabicScript);
             // the short-cut lookup picks the block with the "In" prefix
             // and the script without it
6819         assert(arabicBlock == unicode.inArabic);
6820         assert(arabicScript == unicode.arabic);
6821     }
6822 
6823     /**
6824         Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.
6825 
6826         Other non-binary properties (once supported) follow the same
6827         notation - `unicode.propertyName.propertyValue` for compile-time
6828         checked access and `unicode.propertyName(propertyValue)`
6829         for run-time checked one.
6830 
6831         See the $(S_LINK Unicode properties, table of properties) for available
6832         sets.
6833     */
6834     struct hangulSyllableType
6835     {
6836         import std.internal.unicode_tables : hangul; // generated file
             // mixes in opCall (run-time lookup, throws when no type matches)
             // and opDispatch (compile-time checked lookup) over hangul.tab
6837         mixin SetSearcher!(hangul.tab, "hangul syllable type");
6838     }
6839 
6840     ///
6841     @safe unittest
6842     {
6843         // L here is the hangul syllable type (leading consonant jamo),
             // not Letter as in the unicode.L short-cut
6844         auto leadingJamo = unicode.hangulSyllableType("L");
6845         // check that some leading consonants are present
6846         foreach (jamo; '\u1110'..'\u115F')
6847             assert(leadingJamo[jamo]);
             // run-time and compile-time lookups agree
6848         assert(leadingJamo == unicode.hangulSyllableType.L);
6849     }
6850 
6851     //parse a control code of the form \cX where X is an ASCII letter;
         //c is assumed to be the current symbol. The letter is left as the
         //current symbol and its low five bits are returned.
6852     static package(std) dchar parseControlCode(Parser)(ref Parser p)
6853     {
6856         p.popFront();
6857         enforce(!p.empty, "Unfinished escape sequence");
             immutable letter = p.front;
             immutable isLower = 'a' <= letter && letter <= 'z';
             immutable isUpper = 'A' <= letter && letter <= 'Z';
6858         enforce(isLower || isUpper,
6860         "Only letters are allowed after \\c");
6861         return letter & 0x1f;
6863     }
6864 
6865     //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
6866     //\ - assumed to be processed, p - is current
         //Inside braces '-', ' ' and '_' are skipped and letters are lower-cased
         //before the lookup through getUnicodeSet. Throws (via enforce) on end of
         //input, a missing '}', a name that fills the MAX_PROPERTY buffer, a
         //non-ASCII character in the name, or when the resulting set is empty.
6867     static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
6868         bool negated, bool casefold)
6869     {
6870         static import std.ascii;
6871         with(p)
6872         {
6873             enum MAX_PROPERTY = 128;
6874             char[MAX_PROPERTY] result;
6875             uint k = 0;
6876             popFront();
6877             enforce(!empty, "eof parsing unicode property spec");
6878             if (front == '{')
6879             {
6880                 popFront();
6881                 while (k < MAX_PROPERTY && !empty && front !='}'
6882                     && front !=':')
6883                 {
6884                     if (front != '-' && front != ' ' && front != '_')
                         {
                             // the cast below keeps only the low 8 bits, so a
                             // non-ASCII character could turn into an unrelated
                             // ASCII one; reject it like the single-char form does
                             enforce(front < 0x80, "invalid property name");
6885                         result[k++] = cast(char) std.ascii.toLower(front);
                         }
6886                     popFront();
6887                 }
6888                 enforce(k != MAX_PROPERTY, "invalid property name");
                     // the loop also ends when the input is exhausted; check that
                     // before looking at front
6889                 enforce(!empty && front == '}', "} expected ");
6890             }
6891             else
6892             {//single char properties e.g.: \pL, \pN ...
6893                 enforce(front < 0x80, "invalid property name");
6894                 result[k++] = cast(char) front;
6895             }
6896             auto s = getUnicodeSet(result[0 .. k], negated, casefold);
6897             enforce(!s.empty, "unrecognized unicode property spec");
                 // step past the closing '}' (or the single property character)
6898             popFront();
6899             return s;
6900         }
6901     }
6902 
6903     /**
6904         Parse unicode codepoint set from given `range` using standard regex
6905         syntax '[...]'. The range is advanced skiping over regex set definition.
6906         `casefold` parameter determines if the set should be casefolded - that is
6907         include both lower and upper case versions for any letters in the set.
6908     */
6909     static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
6910     if (isInputRange!Range && is(ElementType!Range : dchar))
6911     {
6912         auto usParser = UnicodeSetParser!Range(range, casefold);
6913         auto set = usParser.parseSet();
6914         range = usParser.range;
6915         return set;
6916     }
6917 
6918     ///
6919     @safe unittest
6920     {
6921         import std.uni : unicode;
6922         string pat = "[a-zA-Z0-9]hello";
6923         auto set = unicode.parseSet(pat);
6924         // check some of the codepoints
6925         assert(set['a'] && set['A'] && set['9']);
6926         assert(pat == "hello");
6927     }
6928 
6929 private:
6930     alias ucmp = comparePropertyName;
6931 
6932     static bool findAny(string name)
6933     {
6934         import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
6935         return isPrettyPropertyName(name)
6936             || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
6937             || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$]));
6938     }
6939 
6940     static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
6941     {
6942         import std.conv : to;
6943         import std.internal.unicode_tables : blocks, scripts; // generated file
6944         Set set;
6945         immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
6946             || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
6947                 && loadUnicodeSet!(blocks.tab)(name[2..$], set));
6948         if (loaded)
6949             return set;
6950         throw new Exception("No unicode set by name "~name.to!string()~" was found.");
6951     }
6952 
6953     // FIXME: re-disable once the compiler is fixed
6954     // Disabled to prevent the mistake of creating instances of this pseudo-struct.
6955     //@disable ~this();
6956 }
6957 
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file

    // a block is addressed through the "In" prefix
    auto hebrew = unicode("InHebrew");
    assert(hebrew == asSet(blocks.Hebrew));

    // "separator" is the union of the Zs, Zl and Zp sets
    auto separators = asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp);
    assert(unicode("separator") == separators);

    // a dash right after the prefix is accepted as well
    auto kharoshthi = unicode("In-Kharoshthi");
    assert(kharoshthi == asSet(blocks.Kharoshthi));
}
6965 
6966 enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
6967 
// TODO: redo most of the Hangul stuff algorithmically in the case of Graphemes too.
// Use a combined trie instead of checking for '\r' | '\n' | ccTrie,
//   or extend | '\u200D' separately
6971 
// Tells whether `ch` lies in the range U+1F1E6 .. U+1F1FF (inclusive),
// i.e. whether it is a regional indicator (RI) symbol.
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar firstIndicator = '\U0001F1E6';
    enum dchar lastIndicator = '\U0001F1FF';
    return !(ch < firstIndicator || lastIndicator < ch);
}
6976 
// The grapheme decoder is a state machine; this is the list of all states it
// can be in right before a code point is examined. The order matters: the
// members are used as indices into `graphemeTransforms`.
private enum GraphemeState
{
    // Nothing decided yet for the code point at hand (the initial state,
    // re-entered after a Prepend character)
    Start,
    // A '\r' has just been seen
    CR,
    // Exactly one regional indicator has been seen
    RI,
    // Hangul syllable sequence, last code point was an L
    L,
    // Hangul syllable sequence, last code point was a V or an LV
    V,
    // Hangul syllable sequence, last code point was a T or an LVT
    LVT,
    // Emoji sequence, last code point was not a ZWJ
    Emoji,
    // Emoji sequence, last code point was a ZWJ
    EmojiZWJ,
    // A Prepend character has just been seen
    Prepend,
    // Only extending characters, spacing marks or a ZWJ can still be added
    End
}
6992 
// What a state transition reports about the end of the grapheme
private enum TransformRes
{
    // Not reached: the code point belongs to the grapheme and decoding goes
    // on, unless the source range ends here
    // (GB2 - break at end of text, unless text is empty)
    goOn,
    // Undecided: run the same code point again with the new state
    redo,
    // Reached, right after the code point just examined
    retInclude,
    // Reached, right before the code point just examined
    retExclude
}
7003 
// The logic of the grapheme decoding is all here: one transition function
// per GraphemeState, indexed by the state that is current before the code
// point `ch` is examined. A transition may change `state` and reports through
// its TransformRes result what has to happen with `ch`.
// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29
// Note, getting GB1 (break at start of text, unless text is empty) right
// relies on the user starting grapheme walking from beginning of the text, and
// not attempting to walk an empty text.
private immutable TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // GB4. Break after controls.
        if (ch == '\n' || graphemeControlTrie[ch])
            return TransformRes.retInclude;

        // The first matching class decides the follow-up state.
        if (ch == '\r')
            state = GraphemeState.CR;
        else if (isRegionalIndicator(ch))
            state = GraphemeState.RI;
        else if (isHangL(ch))
            state = GraphemeState.L;
        else if (hangLV[ch] || isHangV(ch))
            state = GraphemeState.V;
        else if (hangLVT[ch] || isHangT(ch))
            state = GraphemeState.LVT;
        else if (prependTrie[ch])
            state = GraphemeState.Prepend;
        else if (xpictoTrie[ch])
            state = GraphemeState.Emoji;
        else
            state = GraphemeState.End;

        // Whatever was encountered, the first code point is always
        // included in the grapheme.
        return TransformRes.goOn;
    },

    // GB3, GB4. Do not break between a CR and LF.
    // Otherwise, break after controls.
    GraphemeState.CR: (ref state, ch)
    {
        if (ch == '\n')
            return TransformRes.retInclude;
        return TransformRes.retExclude;
    },

    // GB12 - GB13. Do not break within emoji flag sequences.
    // That is, do not break between regional indicator (RI) symbols if
    // there is an odd number of RI characters before the break point.
    // This state applies if one and only one RI code point has been
    // encountered.
    GraphemeState.RI: (ref state, ch)
    {
        state = GraphemeState.End;

        if (isRegionalIndicator(ch))
            return TransformRes.goOn;
        return TransformRes.redo;
    },

    // GB6. Do not break Hangul syllable sequences.
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;

        if (isHangV(ch) || hangLV[ch])
            state = GraphemeState.V;
        else if (hangLVT[ch])
            state = GraphemeState.LVT;
        else
        {
            state = GraphemeState.End;
            return TransformRes.redo;
        }
        return TransformRes.goOn;
    },

    // GB7. Do not break Hangul syllable sequences.
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;

        if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8. Do not break Hangul syllable sequences.
    GraphemeState.LVT: (ref state, ch)
    {
        if (!isHangT(ch))
        {
            state = GraphemeState.End;
            return TransformRes.redo;
        }
        return TransformRes.goOn;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // NOT a ZWJ.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        // ZWJ needs its own check below because the extend trie
        // does not cover it
        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        // There might still be spacing marks at the end,
        // which are not allowed in the middle of
        // emoji sequences
        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // a ZWJ.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        state = GraphemeState.Emoji;
        return xpictoTrie[ch] ? TransformRes.goOn : TransformRes.redo;
    },

    // GB9b. Do not break after Prepend characters.
    GraphemeState.Prepend: (ref state, ch)
    {
        // GB5. Break before controls.
        if (ch == '\r' || ch == '\n' || graphemeControlTrie[ch])
            return TransformRes.retExclude;

        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9, GB9a. Do not break before extending characters, ZWJ
    // or SpacingMarks.
    // GB999. Otherwise, break everywhere.
    GraphemeState.End: (ref state, ch)
    {
        immutable joins = graphemeExtendTrie[ch] || spacingMarkTrie[ch]
            || ch == '\u200D';
        return joins ? TransformRes.goOn : TransformRes.retExclude;
    }
];
7151 
// Selects what genericDecodeGrapheme returns to its caller
enum GraphemeRet
{
    none,  // nothing (void), the range is only advanced
    step,  // the number of code points taken from the range
    value  // the decoded Grapheme
}
7153 
// Common implementation behind the grapheme decoding primitives.
// Advances `range` past one grapheme cluster by feeding its code points to
// the `graphemeTransforms` state machine. Depending on `retType` it returns
// the cluster itself, the number of code points consumed, or nothing.
// `range` must not be empty.
template genericDecodeGrapheme(GraphemeRet retType)
{
    alias Ret = GraphemeRet;

    static if (retType == Ret.value)
        alias Value = Grapheme;
    else static if (retType == Ret.step)
        alias Value = size_t;
    else static if (retType == Ret.none)
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        static if (retType == Ret.value)
            Grapheme result;
        else static if (retType == Ret.step)
            size_t result = 0;

        auto state = GraphemeState.Start;

        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);

        while (!range.empty)
        {
            dchar ch = range.front;

            // A transition answering `redo` has switched the state and wants
            // the very same code point examined again.
            TransformRes verdict;
            do
                verdict = graphemeTransforms[state](state, ch);
            while (verdict == TransformRes.redo);

            // The cluster ended before `ch`: leave it in the range.
            if (verdict == TransformRes.retExclude)
                break;

            // goOn and retInclude both take `ch` into the cluster.
            static if (retType == Ret.value)
                result ~= ch;
            else static if (retType == Ret.step)
                result++;
            range.popFront();

            if (verdict == TransformRes.retInclude)
                break;
        }

        static if (retType != Ret.none)
            return result;
    }
}
7212 
7213 public: // Public API continues
7214 
/++
    Computes the length of the grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = type that is implicitly convertible to `dchar`
        input = array of code units to look into
        index = starting index into `input[]`

    Returns:
        length of the grapheme cluster, in code units
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto rest = input[index..$];
    immutable before = rest.length;
    // decoding shrinks `rest` by exactly one cluster
    genericDecodeGrapheme!(GraphemeRet.none)(rest);
    return before - rest.length;
}
7236 
///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); // \u030A takes 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}
7248 
@safe unittest
{
    // graphemeStride has to stay usable from CTFE.
    enum plain = graphemeStride("A", 0);
    static assert(plain == 1);

    enum accented = graphemeStride("A\u0301", 0);
    static assert(accented == 3); // \u0301 takes 2 UTF-8 code units
}
7258 
@safe pure nothrow @nogc unittest
{
    dstring text;

    // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
    text = "\U0001F600\U0001f3FE\U0001F600"d;
    assert(graphemeStride(text, 0) == 2);

    // skier ~ female sign ~ '€'
    text = "\u26F7\u2640€"d;
    assert(graphemeStride(text, 0) == 1);

    // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€'
    text = "\u26F7\U0001f3FE\u2640€"d;
    assert(graphemeStride(text, 0) == 2);

    // skier ~ zero-width joiner ~ female sign ~ '€'
    text = "\u26F7\u200D\u2640€"d;
    assert(graphemeStride(text, 0) == 3);

    // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner
    // ~ female sign ~ '€'
    text = "\u26F7\U0001f3FE\u200D\u2640€"d;
    assert(graphemeStride(text, 0) == 4);

    // skier ~ zero-width joiner ~ '€'
    text = "\u26F7\u200D€"d;
    assert(graphemeStride(text, 0) == 2);

    // '€' ~ zero-width joiner ~ skier
    text = "€\u200D\u26F7"d;
    assert(graphemeStride(text, 0) == 2);

    // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two
    text = "\U000110BD\u096A\u0968"d;
    assert(graphemeStride(text, 0) == 2);

    // Kaithi number sign ~ null
    text = "\U000110BD\0"d;
    assert(graphemeStride(text, 0) == 1);
}
7281 
/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`
    and returns it.

    For examples see the $(LREF Grapheme) below.

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    alias decode = genericDecodeGrapheme!(GraphemeRet.value);
    return decode(inp);
}
7297 
@safe unittest
{
    import std.algorithm.comparison : equal;

    Grapheme gr;
    string s = " \u0020\u0308 ";
    // a lone space is a cluster of its own
    gr = decodeGrapheme(s);
    assert(gr.length == 1);
    assert(gr[0] == ' ');
    // a space followed by a combining diaeresis stays together
    gr = decodeGrapheme(s);
    assert(gr.length == 2);
    assert(equal(gr[0 .. 2], " \u0308"));

    s = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(s)[], "\u1100"));

    s = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(s)[], "\uAC01"));

    // Two Union Jacks: four regional indicators, two per flag
    s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    assert(equal(decodeGrapheme(s)[], "\U0001F1EC\U0001F1E7"));
}
7319 
/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`,
    but doesn't return it. Instead returns the number of code units read.
    This differs from the number of code points read only if `inp` is an
    autodecodable string.

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
+/
size_t popGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    static if (isAutodecodableString!Input || hasLength!Input)
    {
        // No need to count every step of the decoder: the difference in
        // length measures the whole grapheme in one go.
        immutable before = inp.length;
        genericDecodeGrapheme!(GraphemeRet.none)(inp);
        return before - inp.length;
    }
    else
    {
        return genericDecodeGrapheme!(GraphemeRet.step)(inp);
    }
}
7344 
///
@safe pure unittest
{
    // Two Union Jacks in each of the three encodings
    string utf8 = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    wstring utf16 = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    dstring utf32 = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";

    // The popped length is given in code units, not code points.
    assert(utf8.popGrapheme() == 8);
    assert(utf16.popGrapheme() == 4);
    assert(utf32.popGrapheme() == 2);

    // Exactly one flag is left in every string.
    assert(utf8 == "\U0001F1EC\U0001F1E7");
    assert(utf16 == "\U0001F1EC\U0001F1E7");
    assert(utf32 == "\U0001F1EC\U0001F1E7");

    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;

    // Also works for non-random access ranges as long as the
    // character type is 32-bit.
    auto nonAlpha = "\r\nhello!"d.filter!(x => !x.isAlpha);
    // Windows-style line ending is two code points in a single grapheme.
    assert(nonAlpha.popGrapheme() == 2);
    assert(nonAlpha.equal("!"d));
}
7372 
// Attribute compliance test: popGrapheme should be nothrow `@nogc` when
// no autodecoding is needed.
@safe pure nothrow @nogc unittest
{
    import std.algorithm.iteration : filter;

    auto letters = "abcdef"d;
    assert(letters.popGrapheme() == 1);

    // the same check on a non-random access range
    auto oddOnes = "abcdef"d.filter!(x => x%2);
    assert(oddOnes.popGrapheme() == 1);
}
7386 
/++
    $(P Iterate a string by $(LREF Grapheme).)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _source;         // code points not decoded yet
        private Grapheme _current; // the cluster handed out by `front`

        @property bool empty()
        {
            // popFront stores an empty cluster once the source has run out
            return !_current.length;
        }

        @property Grapheme front()
        {
            return _current;
        }

        void popFront()
        {
            if (_source.empty)
                _current = Grapheme.init;
            else
                _current = _source.decodeGrapheme();
        }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                return Result(_source.save, _current);
            }
        }
    }

    auto graphemes = Result!(Range)(range);
    // decode the first cluster up front so that `front` is ready for use
    graphemes.popFront();
    return graphemes;
}
7433 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto clusters = text.byGrapheme;
    assert(clusters.walkLength == 4); // 4 graphemes

    assert(clusters.take(3).equal("noe\u0308".byGrapheme));
    assert(clusters.drop(3).equal("l".byGrapheme));
}
7449 
// Test helper: a plain input range of dchar over a string, deliberately
// without `save`, for exercising non-forward-range code paths
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    @property bool empty()
    {
        return s.empty;
    }

    @property dchar front()
    {
        return s.front;
    }

    void popFront()
    {
        s.popFront();
    }
}
7460 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;

    // no text, no graphemes
    assert("".byGrapheme.walkLength == 0);

    auto reverse = "le\u0308on";
    assert(reverse.walkLength == 5);

    auto gReverse = reverse.byGrapheme;
    assert(gReverse.walkLength == 4);

    // the same text in all three encodings
    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        static assert(isForwardRange!(typeof(text)));
        assert(text.walkLength == 5);

        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        assert(gText.array.retro.equal(gReverse));
    }}

    // a pure input range yields a pure input range
    auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(nonForwardRange)));
    assert(nonForwardRange.walkLength == 4);
}
7490 
// Issue 23474: a combining mark after a lone CR starts a new grapheme
@safe pure unittest
{
    import std.range.primitives : walkLength;
    auto clusters = byGrapheme("\r\u0308");
    assert(clusters.walkLength == 2);
}
7497 
/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        private size_t i = 0; // position inside the grapheme at the front

        @property bool empty()
        {
            return _range.empty;
        }

        @property dchar front()
        {
            return _range.front[i];
        }

        void popFront()
        {
            // stay inside the current grapheme while code points remain
            if (++i < _range.front.length)
                return;

            _range.popFront();
            i = 0;
        }

        static if (isForwardRange!Range)
        {
            @property Result save()
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}
7547 
/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (!isNarrowString!Range)
    {
        // not a narrow string: the range is passed through untouched
        return range;
    }
    else
    {
        // Thin wrapper that forwards every primitive to the narrow string.
        static struct Result
        {
            private Range _range;

            @property bool empty()
            {
                return _range.empty;
            }

            @property dchar front()
            {
                return _range.front;
            }

            @property dchar back()
            {
                return _range.back;
            }

            void popFront()
            {
                _range.popFront;
            }

            void popBack()
            {
                _range.popBack;
            }

            @property auto save()
            {
                return Result(_range.save);
            }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
}
7572 
///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it grapheme by grapheme and convert the result to a string
    auto reversedClusters = s.byGrapheme.array.retro;
    string reverse = reversedClusters.byCodePoint.text;

    assert(reverse == "le\u0308on"); // lëon
}
7591 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;

    // no text, no code points
    assert("".byGrapheme.byCodePoint.equal(""));

    string text = "noe\u0308l";
    // a narrow string must not expose a length
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    // a pure input range stays a pure input range all the way through
    auto gText = InputRangeString(text).byGrapheme;
    static assert(!isForwardRange!(typeof(gText)));

    auto cpText = gText.byCodePoint;
    static assert(!isForwardRange!(typeof(cpText)));

    assert(cpText.walkLength == text.walkLength);

    // a narrow string yields a forward range that can also be walked backwards
    auto plainCp = text.byCodePoint;
    static assert(isForwardRange!(typeof(plainCp)));
    assert(equal(plainCp, text));
    assert(equal(retro(plainCp.save), retro(text.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}
7617 
7618 /++
7619     $(P A structure designed to effectively pack $(CHARACTERS)
7620     of a $(CLUSTER).
7621     )
7622 
7623     $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
7624     always refer to distinct objects. In most actual scenarios a `Grapheme`
7625     fits on the stack and avoids memory allocation overhead for all but quite
7626     long clusters.
7627     )
7628 
7629     See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
7630 +/
7631 @safe struct Grapheme
7632 {
7633     import std.exception : enforce;
7634     import std.traits : isDynamicArray;
7635 
7636 public:
7637     /// Ctor
7638     this(C)(const scope C[] chars...)
7639     if (is(C : dchar))
7640     {
7641         this ~= chars;
7642     }
7643 
7644     ///ditto
7645     this(Input)(Input seq)
7646     if (!isDynamicArray!Input
7647         && isInputRange!Input && is(ElementType!Input : dchar))
7648     {
7649         this ~= seq;
7650     }
7651 
7652     /// Gets a $(CODEPOINT) at the given index in this cluster.
7653     dchar opIndex(size_t index) const @nogc nothrow pure @trusted
7654     {
7655         assert(index < length);
7656         return read24(isBig ? ptr_ : small_.ptr, index);
7657     }
7658 
7659     /++
7660         Writes a $(CODEPOINT) `ch` at given index in this cluster.
7661 
7662         Warning:
7663         Use of this facility may invalidate grapheme cluster,
7664         see also $(LREF Grapheme.valid).
7665     +/
7666     void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
7667     {
7668         assert(index < length);
7669         write24(isBig ? ptr_ : small_.ptr, ch, index);
7670     }
7671 
7672     ///
7673     @safe unittest
7674     {
7675         auto g = Grapheme("A\u0302");
7676         assert(g[0] == 'A');
7677         assert(g.valid);
7678         g[1] = '~'; // ASCII tilda is not a combining mark
7679         assert(g[1] == '~');
7680         assert(!g.valid);
7681     }
7682 
7683     /++
7684         Random-access range over Grapheme's $(CHARACTERS).
7685 
7686         Warning: Invalidates when this Grapheme leaves the scope,
7687         attempts to use it then would lead to memory corruption.
7688     +/
7689     SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
7690     {
7691         return sliceOverIndexed(a, b, &this);
7692     }
7693 
7694     /// ditto
7695     SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
7696     {
7697         return sliceOverIndexed(0, length, &this);
7698     }
7699 
7700     /// Grapheme cluster length in $(CODEPOINTS).
7701     @property size_t length() const @nogc nothrow pure
7702     {
7703         return isBig ? len_ : slen_ & 0x7F;
7704     }
7705 
7706     /++
7707         Append $(CHARACTER) `ch` to this grapheme.
7708         Warning:
7709         Use of this facility may invalidate grapheme cluster,
7710         see also `valid`.
7711 
7712         See_Also: $(LREF Grapheme.valid)
7713     +/
7714     ref opOpAssign(string op)(dchar ch) @trusted
7715     {
7716         static if (op == "~")
7717         {
7718             import std.internal.memory : enforceRealloc;
7719             if (!isBig)
7720             {
7721                 if (slen_ == small_cap)
7722                     convertToBig();// & fallthrough to "big" branch
7723                 else
7724                 {
7725                     write24(small_.ptr, ch, smallLength);
7726                     slen_++;
7727                     return this;
7728                 }
7729             }
7730 
7731             assert(isBig);
7732             if (len_ == cap_)
7733             {
7734                 import core.checkedint : addu, mulu;
7735                 bool overflow;
7736                 cap_ = addu(cap_, grow, overflow);
7737                 auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
7738                 if (overflow) assert(0);
7739                 ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
7740             }
7741             write24(ptr_, ch, len_++);
7742             return this;
7743         }
7744         else
7745             static assert(false, "No operation "~op~" defined for Grapheme");
7746     }
7747 
7748     ///
7749     @safe unittest
7750     {
7751         import std.algorithm.comparison : equal;
7752         auto g = Grapheme("A");
7753         assert(g.valid);
7754         g ~= '\u0301';
7755         assert(g[].equal("A\u0301"));
7756         assert(g.valid);
7757         g ~= "B";
7758         // not a valid grapheme cluster anymore
7759         assert(!g.valid);
7760         // still could be useful though
7761         assert(g[].equal("A\u0301B"));
7762     }
7763 
7764     /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
7765     ref opOpAssign(string op, Input)(scope Input inp)
7766     if (isInputRange!Input && is(ElementType!Input : dchar))
7767     {
7768         static if (op == "~")
7769         {
7770             foreach (dchar ch; inp)
7771                 this ~= ch;
7772             return this;
7773         }
7774         else
7775             static assert(false, "No operation "~op~" defined for Grapheme");
7776     }
7777 
7778     // This is not a good `opEquals`, but formerly the automatically generated
7779     // opEquals was used, which was inferred `@safe` because of bugzilla 20655:
7780     // https://issues.dlang.org/show_bug.cgi?id=20655
7781     // This `@trusted opEquals` is only here to prevent breakage.
7782     bool opEquals(R)(const auto ref R other) const @trusted
7783     {
7784         return this.tupleof == other.tupleof;
7785     }
7786 
7787     // Define a default toHash to allow AA usage
7788     size_t toHash() const @trusted
7789     {
7790         return hashOf(slen_, hashOf(small_));
7791     }
7792 
7793     /++
7794         True if this object contains valid extended grapheme cluster.
7795         Decoding primitives of this module always return a valid `Grapheme`.
7796 
7797         Appending to and direct manipulation of grapheme's $(CHARACTERS) may
7798         render it no longer valid. Certain applications may chose to use
7799         Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
7800         entirely.
7801     +/
7802     @property bool valid()() /*const*/
7803     {
7804         auto r = this[];
7805         genericDecodeGrapheme!(GraphemeRet.none)(r);
7806         return r.length == 0;
7807     }
7808 
7809     this(this) @nogc nothrow pure @trusted
7810     {
7811         import std.internal.memory : enforceMalloc;
7812         if (isBig)
7813         {// dup it
7814             import core.checkedint : addu, mulu;
7815             bool overflow;
7816             auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
7817             if (overflow) assert(0);
7818 
7819             auto p = cast(ubyte*) enforceMalloc(raw_cap);
7820             p[0 .. raw_cap] = ptr_[0 .. raw_cap];
7821             ptr_ = p;
7822         }
7823     }
7824 
7825     ~this() @nogc nothrow pure @trusted
7826     {
7827         import core.memory : pureFree;
7828         if (isBig)
7829         {
7830             pureFree(ptr_);
7831         }
7832     }
7833 
7834 
private:
    // size of the inline buffer: the whole struct minus the final length/flag byte
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // number of 3-byte code point slots that fit into the inline buffer
    enum small_cap = small_bytes/3;
    // `slen_` layout: top bit flags heap ("big") storage, low 7 bits hold the inline length
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // heap-backed ("big") representation
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // inline ("small") representation
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }
7858 
7859     void convertToBig() @nogc nothrow pure @trusted
7860     {
7861         import std.internal.memory : enforceMalloc;
7862         static assert(grow.max / 3 - 1 >= grow);
7863         enum nbytes = 3 * (grow + 1);
7864         size_t k = smallLength;
7865         ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
7866         for (int i=0; i<k; i++)
7867             write24(p, read24(small_.ptr, i), i);
7868         // now we can overwrite small array data
7869         ptr_ = p;
7870         len_ = slen_;
7871         assert(grow > len_);
7872         cap_ = grow;
7873         setBig();
7874     }
7875 
    // Marks this Grapheme as heap-backed by setting the flag bit in `slen_`.
    void setBig() @nogc nothrow pure { slen_ |= small_flag; }
7877 
    // Length stored in the low 7 bits of `slen_` (the flag bit is masked off).
    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    // Non-zero (the flag bit itself) when heap storage is in use, zero otherwise.
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
7886 }
7887 
// The union layout must keep Grapheme at exactly four machine words.
static assert(Grapheme.sizeof == size_t.sizeof*4);
7889 
7890 
// byGrapheme over three single code point clusters must compare equal to
// the individually constructed Graphemes.
@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;
    Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    assert(byGrapheme("ЮУЗ").equal(data[]));
}
7897 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
    import std.range : isRandomAccessRange;

    string bold = "ku\u0308hn";

    // note that decodeGrapheme takes parameter by ref
    auto first = decodeGrapheme(bold);

    assert(first.length == 1);
    assert(first[0] == 'k');

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(bold);
    // slicing a grapheme yields a random-access range of dchar
    assert(wideOne[].equal("u\u0308"));
    assert(wideOne.length == 2);
    static assert(isRandomAccessRange!(typeof(wideOne[])));

    // all of the usual range manipulation is possible
    assert(wideOne[].filter!isMark().equal("\u0308"));

    auto g = Grapheme("A");
    assert(g.valid);
    // a combining mark extends the cluster
    g ~= '\u0301';
    assert(g[].equal("A\u0301"));
    assert(g.valid);
    g ~= "B";
    // not a valid grapheme cluster anymore
    assert(!g.valid);
    // still could be useful though
    assert(g[].equal("A\u0301B"));
}
7934 
// Overwriting a code point in place can invalidate the cluster.
@safe unittest
{
    auto g = Grapheme("A\u0302");
    assert(g[0] == 'A');
    assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
    assert(g[1] == '~');
    assert(!g.valid);
}
7944 
// Exercises indexing, appending, copying and growth of Grapheme storage.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : text;
    import std.range : iota;

    // not valid clusters (but it's just a test)
    auto g  = Grapheme('a', 'b', 'c', 'd', 'e');
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'd');
    assert(g[4] == 'e');
    // overwriting one slot must leave its neighbours untouched
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    g ~= 'ц';
    g ~= '~';
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'Й');
    assert(g[4] == 'e');
    assert(g[5] == 'ц');
    assert(g[6] == '~');
    assert(!g.valid);

    // a copy must not share storage with the original
    Grapheme copy = g;
    copy[0] = 'X';
    copy[1] = '-';
    assert(g[0] == 'a' && copy[0] == 'X');
    assert(g[1] == 'b' && copy[1] == '-');
    assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
    copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
    copy ~= "xyz";
    assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
    assert(!copy.valid);

    // 26 appends: more than the initial `grow` capacity of 20
    Grapheme h;
    foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
        h ~= v;
    assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
}
7993 
// ensure Grapheme can be used as an AA key.
// (compile-only check: declaring the AA type is enough)
@safe unittest
{
    int[Grapheme] aa;
}
7999 
/++
    $(P Does basic case-insensitive comparison of `r1` and `r2`.
    This function uses simpler comparison rule thus achieving better performance
    than $(LREF icmp). However keep in mind the warning below.)

    Params:
        r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
        r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters

    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`

    Warning:
    This function only handles 1:1 $(CODEPOINT) mapping
    and thus is not sufficient for certain alphabets
    like German, Greek and few others.

    See_Also:
        $(LREF icmp)
        $(REF cmp, std,algorithm,comparison)
+/
int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
    import std.range.primitives : isInfinite;
    import std.utf : decodeFront;
    import std.traits : isDynamicArray;
    import std.typecons : Yes;
    static import std.ascii;

    // The fast path needs indexable, sliceable operands. The lambda is only
    // a compile-time probe (never executed); it is kept identical to the
    // one used by `icmp`.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // any non-ASCII code unit ends the fast path
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // common prefix exhausted: the longer operand is the greater one
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // drop the part already known to be equal
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    while (!r1.empty)
    {
        immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
        if (r2.empty)
            return 1;
        immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
        int diff = lhs - rhs;
        if (!diff)
            continue;
        if ((lhs | rhs) < 0x80)
        {
            immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (!d) continue;
            return d;
        }
        size_t idx = simpleCaseTrie[lhs];
        size_t idx2 = simpleCaseTrie[rhs];
        // simpleCaseTrie is packed index table
        if (idx != EMPTY_CASE_TRIE)
        {
            if (idx2 != EMPTY_CASE_TRIE)
            {// both cased chars
                // adjust idx --> start of bucket
                idx = idx - sTable(idx).n;
                idx2 = idx2 - sTable(idx2).n;
                if (idx == idx2)// one bucket, equivalent chars
                    continue;
                else//  not the same bucket
                    diff = sTable(idx).ch - sTable(idx2).ch;
            }
            else
                diff = sTable(idx - sTable(idx).n).ch - rhs;
        }
        else if (idx2 != EMPTY_CASE_TRIE)
        {
            diff = lhs - sTable(idx2 - sTable(idx2).n).ch;
        }
        // one of chars is not cased at all
        return diff;
    }
    // r1 is exhausted: equal if r2 is exhausted too, otherwise r1 is "less"
    return int(r2.empty) - 1;
}
8116 
///
@safe @nogc pure nothrow unittest
{
    // 1:1 case mappings compare equal
    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);
    // things like the following won't get matched as equal
    // Greek small letter iota with dialytika and tonos
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);

    // while icmp has no problem with that
    assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}
8131 
// overloads for the most common cases to reduce compile time
// (each simply forwards to the explicitly instantiated template)
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    { return sicmp!(const(char)[], const(char)[])(str1, str2); }

    int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
8144 
// Tries to match `lhs` against `rhs` (optionally followed by code points
// taken from `rtail`) using the full case folding table.
//
// Returns 0 on a match; in the multi-code-point case the matched
// continuation has then been consumed from `rtail`. Otherwise returns the
// code point to use for ordering: `lhs` itself when it has no case folding
// entry, else the first entry of its folding bucket.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    immutable start = hit - fTable(hit).n;
    immutable end = fTable(hit).size + start;
    assert(fTable(start).entry_len == 1);

    foreach (size_t pos; start .. end)
    {
        const entryLen = fTable(pos).entry_len;
        if (entryLen == 1)
        {
            // plain 1:1 folding
            if (fTable(pos).seq[0] == rhs)
                return 0;
            continue;
        }
        // OK it's a long chunk, like 'ss' for German
        dchar[3] arr = fTable(pos).seq;
        const dchar[] seq = arr[0 .. entryLen];
        // note that a successful skipOver modifies rtail,
        // and it is only attempted when the first code point matches
        if (rhs == seq[0] && rtail.skipOver(seq[1..$]))
            return 0;
    }
    return fTable(start).seq[0]; // new remapped character for accurate diffs
}
8182 
/++
    Does case insensitive comparison of `r1` and `r2`.
    Follows the rules of full case-folding mapping.
    This includes matching as equal german ß with "ss" and
    other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
    The cost of `icmp` being pedantically correct is
    slightly worse performance.

    Params:
        r1 = a forward range of characters
        r2 = a forward range of characters

    Returns:
        An `int` that is 0 if the strings match,
        &lt;0 if `r1` is lexicographically "less" than `r2`,
        &gt;0 if `r1` is lexicographically "greater" than `r2`

    See_Also:
        $(LREF sicmp)
        $(REF cmp, std,algorithm,comparison)
+/
int icmp(S1, S2)(S1 r1, S2 r2)
if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
    && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
{
    import std.range.primitives : isInfinite;
    import std.traits : isDynamicArray;
    import std.utf : byDchar;
    static import std.ascii;

    // The fast path needs indexable, sliceable operands; the lambda below is
    // only a compile-time probe and is never executed.
    static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
        && (isDynamicArray!S2 || isRandomAccessRange!S2)
        && !(isInfinite!S1 && isInfinite!S2)
        && __traits(compiles,
            {
                size_t s = size_t.max / 2;
                r1 = r1[s .. $];
                r2 = r2[s .. $];
            }))
    {{
        // ASCII optimization for dynamic arrays & similar.
        size_t i = 0;
        static if (isInfinite!S1)
            immutable end = r2.length;
        else static if (isInfinite!S2)
            immutable end = r1.length;
        else
            immutable end = r1.length > r2.length ? r2.length : r1.length;
        for (; i < end; ++i)
        {
            auto lhs = r1[i];
            auto rhs = r2[i];
            // any non-ASCII code unit ends the fast path
            if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
            if (lhs == rhs) continue;
            auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
            if (lowDiff) return lowDiff;
        }
        // common prefix exhausted: the longer operand is the greater one
        static if (isInfinite!S1)
            return 1;
        else static if (isInfinite!S2)
            return -1;
        else
            return (r1.length > r2.length) - (r2.length > r1.length);

    NonAsciiPath:
        // drop the part already known to be equal
        r1 = r1[i .. $];
        r2 = r2[i .. $];
        // Fall through to standard case.
    }}

    auto str1 = r1.byDchar;
    auto str2 = r2.byDchar;

    for (;;)
    {
        if (str1.empty)
            return str2.empty ? 0 : -1;
        immutable lhs = str1.front;
        if (str2.empty)
            return 1;
        immutable rhs = str2.front;
        str1.popFront();
        str2.popFront();
        if (!(lhs - rhs))
            continue;
        // first try to match lhs to <rhs,right-tail> sequence
        immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
        if (!cmpLR)
            continue;
        // then rhs to <lhs,left-tail> sequence
        immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
        if (!cmpRL)
            continue;
        // cmpXX contain remapped codepoints
        // to obtain stable ordering of icmp
        return cmpLR - cmpRL;
    }
}
8281 
///
@safe @nogc pure nothrow unittest
{
    // 1:M mapping: ß matches "ss"
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
}
8288 
/**
 * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
 * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
 */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    // same comparisons as the previous example, on byDchar ranges
    assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
}
8300 
// test different character types
// (mixed char/wchar/dchar operands must give the same result)
@safe unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}
8311 
// overloads for the most common cases to reduce compile time
// (each simply forwards to the explicitly instantiated template)
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    { return icmp!(const(char)[], const(char)[])(str1, str2); }
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
8322 
// Runs icmp and sicmp over all string type combinations, at run time and
// (through assertCTFEable) at compile time.
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    static foreach (cfunc; AliasSeq!(icmp, sicmp))
    {{
        static foreach (S1; AliasSeq!(string, wstring, dstring))
        static foreach (S2; AliasSeq!(string, wstring, dstring))
        {
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto strs = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(strs);
        assert(strs == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
    }}
    // ordering after a 1:M match: "ß" matches "ss", then 'b' > 'a'
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    // https://issues.dlang.org/show_bug.cgi?id=11057
    assert( icmp("K", "L") < 0 );
    });
}
8362 
// https://issues.dlang.org/show_bug.cgi?id=17372
// icmp must accept joiner results (forward ranges that are not arrays);
// this only needs to compile and run.
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
}
8371 
// This is package(std) for the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Return a range of all $(CODEPOINTS) that casefold to
    and from this `ch`.

    When `ch` has no entry in the simple case folding table, the range
    contains just `ch` itself.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;

    // Input range with length. It has two modes sharing the same storage:
    // "small" (idx == uint.max) holds a single code point in `c`;
    // otherwise it walks `len` consecutive table entries starting at `idx`.
    static struct Range
    {
    @safe pure nothrow:
        uint idx; //if == uint.max, then read c.
        union
        {
            dchar c; // == 0 - empty range
            uint len;
        }
        @property bool isSmall() const { return idx == uint.max; }

        // single code point mode
        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        // table mode: `size` entries beginning at `start`
        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (!isSmall)
            {
                auto ch = sTable(idx).ch;
                return ch;
            }
            return c;
        }

        @property bool empty() const
        {
            return isSmall ? c == 0 : len == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        void popFront()
        {
            if (!isSmall)
            {
                idx++;
                len--;
            }
            else
                c = 0;
        }
    }

    immutable idx = simpleCaseTrie[ch];
    if (idx == EMPTY_CASE_TRIE)
        return Range(ch);
    // step back to the first entry of the bucket containing `ch`
    auto entry = sTable(idx);
    immutable start = idx - entry.n;
    return Range(start, entry.size);
}
8452 
// simpleCaseFoldings: a cased letter, an uncased character and a 3-member
// bucket, checked at run time and at compile time.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        // an uncased character folds only to itself
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}
8471 
/++
    $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)

    The value is looked up in a precomputed trie.
+/
ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
{
    return combiningClassTrie[ch];
}
8479 
///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that "tilde" should be
    // placed after a "ring below" in a sequence
}
8493 
// All ASCII code points have combining class 0; spot-check a few marks.
@safe pure nothrow @nogc unittest
{
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}
8503 
/// Unicode character decomposition type.
enum UnicodeDecomposition {
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,
    /**
         Compatibility decomposition. The result is a compatibility equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
8515 
/**
    Shorthand aliases for the character decomposition type, passed as a
    template parameter to $(LREF decompose).
*/
enum {
    Canonical = UnicodeDecomposition.Canonical,
    Compatibility = UnicodeDecomposition.Compatibility
}
8524 
/++
    Try to canonically compose 2 $(CHARACTERS).
    Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.

    The assumption is that `first` comes before `second` in the original text,
    usually meaning that the first is a starter.

    Note: Hangul syllables are not covered by this function.
    See `composeJamo` below.
+/
public dchar compose(dchar first, dchar second) pure nothrow @safe
{
    import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
    import std.range : assumeSorted, stride;
    immutable packed = compositionJumpTrie[first];
    if (packed == ushort.max)
        return dchar.init;
    // unpack offset and length
    immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift;
    // compositionTable is a flat array of pairs: even slots hold the second
    // character, odd slots the composed result; search the even slots only
    // TODO: optimize this micro binary search (no more than 4-5 steps)
    auto r = compositionTable.stride(2)[idx .. idx+cnt].assumeSorted();
    immutable target = r.lowerBound(second).length;
    if (target == cnt)
        return dchar.init;
    // lowerBound gives the insertion point; verify it is an exact hit
    immutable pairStart = (idx+target)*2;
    if (compositionTable[pairStart] != second)
        return dchar.init;
    return compositionTable[pairStart + 1];
}
8555 
///
@safe unittest
{
    assert(compose('A','\u0308') == '\u00C4');
    // no composition exists: dchar.init is returned
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}
8566 
/++
    Returns a full $(S_LINK Canonical decomposition, Canonical)
    (by default) or $(S_LINK Compatibility decomposition, Compatibility)
    decomposition of $(CHARACTER) `ch`.
    If no decomposition is available returns a $(LREF Grapheme)
    with the `ch` itself.

    Note:
    This function also decomposes hangul syllables
    as prescribed by the standard.

    See_Also: $(LREF decomposeHangul) for a restricted version
    that takes into account only hangul syllables but
    no other decompositions.
+/
public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
{
    import std.algorithm.searching : until;
    import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;
    // select the trie/table pair for the requested decomposition type
    static if (decompType == Canonical)
    {
        alias mapping = canonMappingTrie;
        alias table = decompCanonTable;
    }
    else static if (decompType == Compatibility)
    {
        alias mapping = compatMappingTrie;
        alias table = decompCompatTable;
    }
    immutable offset = mapping[ch];
    // not found, check hangul arithmetic decomposition
    if (offset == 0)
        return decomposeHangul(ch);
    // the decomposition is stored as a 0-terminated run starting at `offset`
    return Grapheme(table[offset .. $].until(0));
}
8602 
///
@safe unittest
{
    import std.algorithm.comparison : equal;

    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    assert(decompose('Ĉ')[].equal("C\u0302"));
    // no decomposition available: the character itself is returned
    assert(decompose('D')[].equal("D"));
    // a Hangul syllable is decomposed into its jamos
    assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
    assert(decompose!Compatibility('¹')[].equal("1"));
}
8620 
//----------------------------------------------------------------------------
// Hangul specific composition/decomposition
// Base code points and counts used by the arithmetic Hangul
// (de)composition routines below.
enum jamoSBase = 0xAC00; // base of the composed syllable range
enum jamoLBase = 0x1100; // first leading consonant (L) jamo
enum jamoVBase = 0x1161; // first vowel (V) jamo
enum jamoTBase = 0x11A7; // base for trailing consonants (T); index 0 means "no T"
enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
enum jamoNCount = jamoVCount * jamoTCount; // syllables per leading consonant
enum jamoSCount = jamoLCount * jamoNCount; // size of the composed syllable range
8630 
// Tests if `ch` is a Hangul leading consonant jamo,
// i.e. lies in [jamoLBase, jamoLBase + jamoLCount).
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above leading jamo range
    return ch < jamoLBase+jamoLCount && ch >= jamoLBase;
}
8637 
// Tests if `ch` is a Hangul trailing consonant jamo,
// i.e. lies in (jamoTBase, jamoTBase + jamoTCount).
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above trailing jamo range
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch < jamoTBase+jamoTCount && ch > jamoTBase;
}
8645 
// Tests if `ch` is a Hangul vowel jamo,
// i.e. lies in [jamoVBase, jamoVBase + jamoVCount).
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above vowel range
    return  ch < jamoVBase+jamoVCount && ch >= jamoVBase;
}
8652 
// Returns the zero-based index of `ch` within the composed Hangul syllable
// range, or -1 if `ch` lies outside of it.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    int offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
8658 
// internal helper: compose hangul syllables leaving dchar.init in holes
//
// Scans `seq` left to right. Every <L, V> or <L, V, T> run is replaced in
// place by the composed syllable in the first slot; the remaining slots of
// the run are set to dchar.init.
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        // a syllable can only start with a leading consonant followed by a vowel
        if (!(isJamoL(seq[pos]) && isJamoV(seq[pos+1])))
        {
            pos++;
            continue;
        }

        immutable int indexL = seq[pos] - jamoLBase;
        immutable int indexV = seq[pos+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;

        if (pos + 2 < seq.length && isJamoT(seq[pos+2]))
        {
            // <L, V, T>
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // <L, V>
            seq[pos] = jamoSBase + indexLV;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
8687 
8688 //----------------------------------------------------------------------------
8689 public:
8690 
/**
    Decomposes a Hangul syllable. If `ch` is not a composed syllable
    then this function returns a $(LREF Grapheme) containing only `ch` as is.

    A composed syllable yields either an $(D <L, V>) or an $(D <L, V, T>)
    sequence of jamos, depending on whether it has a trailing consonant.
*/
Grapheme decomposeHangul(dchar ch) nothrow pure @safe
{
    immutable idxS = cast(int) ch - jamoSBase;
    // outside of the composed syllable range: nothing to decompose
    if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch);

    immutable idxL = idxS / jamoNCount;
    immutable idxV = (idxS % jamoNCount) / jamoTCount;
    immutable idxT = idxS % jamoTCount;

    immutable partL = jamoLBase + idxL;
    immutable partV = jamoVBase + idxV;

    if (idxT <= 0) // no trailing consonant; <L, V> decomposition
        return Grapheme(partL, partV);
    // there is a trailing consonant (T); <L,V,T> decomposition
    return Grapheme(partL, partV, jamoTBase + idxT);
}
8710 
///
@safe unittest
{
    import std.algorithm.comparison : equal;

    // an LVT syllable splits into three jamo
    auto jamo = decomposeHangul('\uD4DB');
    assert(equal(jamo[], "\u1111\u1171\u11B6"));
}
8717 
/++
    Try to compose hangul syllable out of a leading consonant (`lead`),
    a `vowel` and optional `trailing` consonant jamos.

    On success returns the composed LV or LVT hangul syllable. A `trailing`
    argument that is not a trailing consonant jamo is ignored and the
    LV syllable is returned.

    If any of `lead` and `vowel` are not a valid hangul jamo
    of the respective $(CHARACTER) class returns dchar.init.
+/
dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
{
    if (!isJamoL(lead) || !isJamoV(vowel))
        return dchar.init;

    immutable indexL = lead - jamoLBase;
    immutable indexV = vowel - jamoVBase;
    immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
    immutable dchar syllable = jamoSBase + indexLV;

    if (!isJamoT(trailing))
        return syllable;
    return syllable + (trailing - jamoTBase);
}
8739 
///
@safe unittest
{
    // full <L, V, T> composition
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');

    // leaving out the trailing consonant, or passing any codepoint
    // that is not a trailing consonant, composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');

    // an invalid lead or vowel yields dchar.init
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8751 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // decomposes `ch` with the given decomposition kind and compares to `expected`
    static void checkDecomp(UnicodeDecomposition T)(dchar ch, string expected)
    {
        Grapheme got = decompose!T(ch);
        assert(equal(got[], expected), text(got[], " vs ", expected));
    }

    // canonical decompositions
    checkDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    checkDecomp!Canonical('\uF907', "\u9F9C");
    // compatibility decompositions
    checkDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    checkDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    auto jamo = decomposeHangul('\uD4DB');
    assert(equal(jamo[], "\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out trailing consonant
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8775 
/**
    Enumeration type for normalization forms,
    passed as template parameter for functions like $(LREF normalize).
*/
enum NormalizationForm
{
    NFC,
    NFD,
    NFKC,
    NFKD
}
8786 
8787 
enum
{
    /**
        Shorthand aliases for the values of $(LREF NormalizationForm).
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
8800 
/++
    Returns `input` string normalized to the chosen form.
    Form C is used by default.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Note:
    In cases where the string in question is already normalized,
    it is returned unmodified and no memory allocation happens.
+/
/*
    WARNING: @trusted lambda inside - handle with same care as @trusted
        functions

    Despite being a template, the attributes do no harm since this doesn't work
    with user-defined range or character types anyway.
*/
pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C)
    (return scope inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // NFC/NFD decompose canonically, NFKC/NFKD use compatibility decomposition
    enum bool useCanonical = norm == NFD || norm == NFC;
    enum decompKind = useCanonical ? Canonical : Compatibility;
    enum bool composing = norm == NFC || norm == NFKC;
    enum bool decomposingOnly = norm == NFD || norm == NFKD;

    auto bounds = splitNormalized!norm(input);
    // nothing to fix: hand back the very same slice
    if (bounds[0] == input.length && bounds[1] == input.length)
        return input;

    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto output = appender!(C[])();
    do
    {
        // the prefix up to bounds[0] is copied through untouched
        output.put(input[0 .. bounds[0]]);

        // fully decompose the piece that needs normalization
        foreach (dchar ch; input[bounds[0] .. bounds[1]])
        {
            foreach (dchar part; decompose!decompKind(ch)[])
                decomposed ~= part;
        }

        // canonical ordering: stable-sort every run of non-starters by combining class
        ccc.length = decomposed.length;
        size_t runStart = 0;
        ubyte prevClass = 0;
        foreach (idx, dchar ch; decomposed)
        {
            immutable curClass = combiningClass(ch);
            ccc[idx] = curClass;
            if (curClass == 0 && prevClass != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[runStart .. idx], decomposed[runStart .. idx]));
                runStart = decomposed.length;
            }
            else if (curClass != 0 && prevClass == 0)
            {
                // found first unstable code point after stable ones
                runStart = idx;
            }
            prevClass = curClass;
        }
        // sort the run (if any) that extends to the end of the buffer
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[runStart .. $], decomposed[runStart .. $]));

        static if (composing)
        {
            import std.algorithm.searching : countUntil;
            auto starter = countUntil(ccc, 0);
            if (starter >= 0) // no starters?? no recomposition
            {
                // recompose starter by starter until the end of the buffer is reached
                auto next = recompose(starter, decomposed, ccc);
                while (next != decomposed.length)
                {
                    starter = next;
                    next = recompose(starter, decomposed, ccc);
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }

        static if (decomposingOnly)
        {
            output.put(decomposed);
        }
        else
        {
            // recomposition leaves dchar.init in the consumed slots; drop them
            import std.algorithm.mutation : remove;
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            output.put(decomposed[0 .. clean.length]);
        }

        // reset variables
        decomposed.length = 0;
        () @trusted {
            // assumeSafeAppend isn't considered pure as of writing, hence the
            // cast. It isn't pure in the sense that the elements after
            // the array in question are affected, but we don't use those
            // making the call pure for our purposes.
            (cast(void delegate() pure nothrow) {decomposed.assumeSafeAppend();})();
            ccc.length = 0;
            (cast(void delegate() pure nothrow) {ccc.assumeSafeAppend();})();
        } ();

        // and move on to the rest of the input
        input = input[bounds[1] .. $];
        bounds = splitNormalized!norm(input);
    } while (bounds[0] != input.length);

    // whatever is left is already normalized
    output.put(input[0 .. bounds[0]]);
    return () @trusted inout { return cast(inout(C)[]) output.data; } ();
}
8916 
///
@safe pure unittest
{
    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    string upsilon = "ϓ";
    assert(normalize!NFC(upsilon) == "\u03D3");
    assert(normalize!NFD(upsilon) == "\u03D2\u0301");
    assert(normalize!NFKC(upsilon) == "\u038E");
    assert(normalize!NFKD(upsilon) == "\u03A5\u0301");
}
8931 
@safe pure unittest
{
    import std.conv : text;

    // CJK compatibility ideograph has a canonical singleton decomposition
    auto ideograph = normalize!NFD("abc\uF904def");
    assert(ideograph == "abc\u6ED1def", text(ideograph));

    // superscript digits decompose to plain digits under compatibility forms
    auto digits = normalize!NFKD("2¹⁰");
    assert(digits == "210", digits);

    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // test with dstring
    dstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice
}
8944 
// Canonically recompose given slice of code points, works in-place and mutates data.
// `start` is the index of a starter; code points merged into it are replaced by the
// dchar.init sentinel. Returns the index at which scanning stopped: either the
// next starter that could not be merged, or input.length.
private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    int accumCC = -1;// so that it's out of 0 .. 255 range
    // input[start] is always a starter thus we begin right after it
    size_t i = start + 1;
    for (; i != input.length; i++)
    {
        immutable curCC = ccc[i];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        //------------------------
        // Applying to our case:
        // S is input[start]
        // accumCC is the maximum CCC of characters between C and S,
        //     as ccc are sorted
        // C is input[i]
        if (curCC > accumCC)
        {
            immutable comp = compose(input[start], input[i]);
            if (comp != dchar.init)
            {
                input[start] = comp;
                input[i] = dchar.init;// put a sentinel
                // current was merged so its CCC shouldn't affect
                // composing with the next one
                continue;
            }
        }
        // not merged (blocked or no composition exists):
        // if it was a starter then accumCC is now 0, end of loop
        accumCC = curCC;
        if (accumCC == 0)
            break;
    }
    return i;
}
8998 
// Returns tuple of 2 indexes that delimit:
// normalized text, piece that needs normalization and
// the rest of input starting with stable code point.
// Both indexes equal input.length when no offending code point was found.
private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input)
{
    import std.typecons : tuple;
    ubyte prevCC = 0;

    foreach (idx, dchar ch; input)
    {
        // NFC fast path: code points below U+0300 are skipped without a lookup
        static if (norm == NFC)
            if (ch < 0x0300)
            {
                prevCC = 0;
                continue;
            }

        immutable ubyte curCC = combiningClass(ch);
        // combining classes out of order, or a code point not allowed in this form
        immutable bool misordered = prevCC > curCC && curCC != 0;
        if (misordered || notAllowedIn!norm(ch))
            return seekStable!norm(idx, input);
        prevCC = curCC;
    }
    return tuple(input.length, input.length);
}
9029 
// Given the index `idx` of a code point that breaks normalization, finds the
// enclosing region that has to be re-normalized.
// Returns tuple(region_start, region_end) where:
//   region_start - index of the closest stable code point (combining class 0
//                  and allowed in `norm`) before `idx`, or 0 if there is none;
//   region_end   - index of the first stable code point after `idx`,
//                  or input.length if there is none.
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    for (;;)
    {
        if (br.empty)// start is 0
            break;
        dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // Walk backwards: we inspect `back`, so `back` is what must be dropped.
        // (Dropping the front re-tested the same code point until the slice
        // was empty, which always degraded region_start to 0.)
        br.popBack();
    }
    // @@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
9062 
/**
    Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization
    form `norm`.

    Returns: `true` unless the quick-check table for `norm` flags `ch`.
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    immutable bool flagged = notAllowedIn!norm(ch);
    return !flagged;
}
9071 
///
@safe unittest
{
    // e.g. Cyrillic is always allowed in every form
    dchar ya = 'я';
    assert(allowedIn!NFC(ya));
    assert(allowedIn!NFD(ya));
    assert(allowedIn!NFKC(ya));
    assert(allowedIn!NFKD(ya));
    // ... and so is ASCII
    assert(allowedIn!NFC('Z'));
}
9082 
// Not user friendly name but more direct: looks `ch` up in the quick-check
// trie of the normalization form `norm` and returns the stored flag
// (true means `ch` is NOT always allowed in that form).
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // The condition must be `false`: a bare string literal is truthy,
        // so `static assert("...")` could never fire.
        static assert(false, "Unknown normalization form " ~ norm.stringof);
    return qcTrie[ch];
}
9098 
@safe unittest
{
    // Cyrillic is allowed in all four forms
    foreach (form; AliasSeq!(NFC, NFD, NFKC, NFKD))
        assert(allowedIn!form('я'));
    assert(allowedIn!NFC('Z'));
}
9107 
9108 }
9109 
9110 version (std_uni_bootstrap)
9111 {
9112     // old version used for bootstrapping of gen_uni.d that generates
9113     // up to date optimal versions of all of isXXX functions
9114     @safe pure nothrow @nogc public bool isWhite(dchar c)
9115     {
9116         import std.ascii : isWhite;
9117         return isWhite(c) ||
9118                c == lineSep || c == paraSep ||
9119                c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
9120                (c >= '\u2000' && c <= '\u200A') ||
9121                c == '\u202F' || c == '\u205F' || c == '\u3000';
9122     }
9123 }
9124 else
9125 {
9126 
// trusted -> avoid bounds check
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions
    // https://issues.dlang.org/show_bug.cgi?id=13232

    // --- lower case: trie lookups and table access ---
    ushort toLowerIndex(dchar c)
    {
        return toLowerIndexTrie[c];
    }

    ushort toLowerSimpleIndex(dchar c)
    {
        return toLowerSimpleIndexTrie[c];
    }

    dchar toLowerTab(size_t idx)
    {
        return toLowerTable[idx];
    }

    // --- title case: trie lookups and table access ---
    ushort toTitleIndex(dchar c)
    {
        return toTitleIndexTrie[c];
    }

    ushort toTitleSimpleIndex(dchar c)
    {
        return toTitleSimpleIndexTrie[c];
    }

    dchar toTitleTab(size_t idx)
    {
        return toTitleTable[idx];
    }

    // --- upper case: trie lookups and table access ---
    ushort toUpperIndex(dchar c)
    {
        return toUpperIndexTrie[c];
    }

    ushort toUpperSimpleIndex(dchar c)
    {
        return toUpperSimpleIndexTrie[c];
    }

    dchar toUpperTab(size_t idx)
    {
        return toUpperTable[idx];
    }
}
9146 
9147 public:
9148 
/++
    Whether or not `c` is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    import std.internal.unicode_tables : isWhiteGen; // generated file

    // the answer comes straight from the pregenerated binary search
    immutable bool white = isWhiteGen(c);
    return white;
}
9160 
/++
    Return whether `c` is a Unicode lowercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isLower, isASCII;

    // ASCII is answered by std.ascii, everything else by the lowercase trie
    return isASCII(c) ? isLower(c) : lowerCaseTrie[c];
}
9172 
@safe unittest
{
    static import std.ascii;

    // must agree with std.ascii on the ASCII range
    foreach (v; 0 .. 0x80)
        assert(std.ascii.isLower(v) == isLower(v));

    // Cyrillic
    assert(isLower('я'));
    assert(isLower('й'));
    assert(!isLower('Ж'));
    // Greek HETA
    assert(!isLower('\u0370'));
    assert(isLower('\u0371'));
    assert(!isLower('\u039C')); // capital MU
    assert(isLower('\u03B2')); // beta
    // from extended Greek
    assert(!isLower('\u1F18'));
    assert(isLower('\u1F00'));

    // every member of the lowerCase set is lower and not upper
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(isLower(v) && !isUpper(v));
}
9192 
9193 
/++
    Return whether `c` is a Unicode uppercase $(CHARACTER).
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;

    // ASCII is answered by std.ascii, everything else by the uppercase trie
    return isASCII(c) ? isUpper(c) : upperCaseTrie[c];
}
9205 
@safe unittest
{
    static import std.ascii;

    // Must agree with std.ascii.isUpper on the ASCII range.
    // (This loop used to compare the two isLower implementations,
    // leaving the ASCII path of isUpper untested.)
    foreach (v; 0 .. 0x80)
        assert(std.ascii.isUpper(v) == isUpper(v));

    // Cyrillic
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));

    // every member of the upperCase set is upper and not lower
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !isLower(v));
}
9224 
9225 
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Maps `c` to its simple titlecase counterpart, or returns `c` unchanged
// when the lookup reports no mapping.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // fast path: below U+00AA only 'a' .. 'z' are changed
    if (c < 0xAA)
    {
        immutable bool asciiLower = c >= 'a' && c <= 'z';
        if (asciiLower)
            return c - 32;
        return c;
    }

    // ushort.max marks "no mapping" in the index trie
    immutable size_t slot = toTitleSimpleIndex(c);
    return slot == ushort.max ? c : toTitleTab(slot);
}
9247 
// (index function, simple-mapping limit, table accessor) bundles that are
// passed as leading template arguments to toCase / toCaser / toCapitalizer
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
9250 
// Generic toUpper/toLower on whole string, creates new or returns as is.
// The input is scanned until the first code point that has a case mapping;
// if there is none, `s` itself (or an array copy for non-string ranges) is
// returned. Otherwise the untouched prefix is copied and the rest converted.
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    // phase 1: measure the prefix (in code units) that needs no conversion
    size_t prefixLen = 0;
    auto scan = s.byDchar;
    while (!scan.empty && indexFn(scan.front) == ushort.max)
    {
        prefixLen += codeLength!C(scan.front);
        scan.popFront();
    }

    if (scan.empty)
    {
        // nothing to convert
        static if (isSomeString!S)
            return s;
        else
            return s.array;
    }

    // phase 2: copy the prefix and convert everything after it
    auto result = appender!(C[])();
    result.reserve(s.length);
    result.put(s[0 .. prefixLen]);
    foreach (dchar c; s[prefixLen .. $].byDchar)
    {
        if (c.isASCII)
        {
            result.put(asciiConvert(c));
            continue;
        }

        const idx = indexFn(c);
        if (idx == ushort.max)
        {
            // no mapping: keep as is
            result.put(c);
        }
        else if (idx < maxIdx)
        {
            // simple one-to-one mapping
            result.put(tableFn(idx));
        }
        else
        {
            // one-to-many mapping: first entry packs length + codepoint
            auto val = tableFn(idx);
            immutable uint len = val >> 24;
            result.put(cast(dchar)(val & 0xFF_FFFF));
            foreach (j; idx+1 .. idx+len)
                result.put(tableFn(j));
        }
    }
    return result.data;
}
9306 
// https://issues.dlang.org/show_bug.cgi?id=12428
@safe unittest
{
    import std.array : replicate;

    // a short slice at the start of a much larger allocation
    auto s = "abcdefghij".replicate(300)[0 .. 10];

    toUpper(s);

    // the source slice must be left untouched
    assert(s == "abcdefghij");
}
9318 
// https://issues.dlang.org/show_bug.cgi?id=18993
@safe unittest
{
    // evaluated at compile time: both inputs must lower-case to the same length
    enum fromUpper = `몬스터/A`.toLower;
    enum fromLower = `몬스터/a`.toLower;
    static assert(fromUpper.length == fromLower.length);
}
9324 
9325 
// Generic toUpper/toLower on whole range, returns range.
// Each source code point is converted on demand into a small buffer that is
// stored in reverse, so that buf[nLeft - 1] is always the current front.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        @property bool empty()
        {
            // buffered code points still count as content
            if (nLeft)
                return false;
            return r.empty;
        }

        @property auto front()
        {
            if (!nLeft)
                fillBuffer();
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (!nLeft)
                fillBuffer();
            assert(nLeft);
            --nLeft;
            // the source advances only once its whole mapping was consumed
            if (!nLeft)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                return ret;
            }
        }

      private:
        // converts r.front and stores the result (reversed) in buf, setting nLeft
        void fillBuffer()
        {
            import std.ascii : isASCII;

            dchar c = r.front;
            if (c.isASCII)
            {
                buf[0] = asciiConvert(c);
                nLeft = 1;
                return;
            }

            const idx = indexFn(c);
            if (idx == ushort.max)
            {
                // no mapping
                buf[0] = c;
                nLeft = 1;
            }
            else if (idx < maxIdx)
            {
                // simple one-to-one mapping
                buf[0] = tableFn(idx);
                nLeft = 1;
            }
            else
            {
                immutable val = tableFn(idx);
                // unpack length + codepoint
                nLeft = val >> 24;
                if (nLeft == 0)
                    nLeft = 1;
                assert(nLeft <= buf.length);
                buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                foreach (j; 1 .. nLeft)
                    buf[nLeft - j - 1] = tableFn(idx + j);
            }
        }

        Range r;
        uint nLeft;
        dchar[3] buf = void;
    }

    return ToCaserImpl(str);
}
9410 
/*********************
 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or a string to upper or lower case.
 *
 * The conversion is lazy: the result is a range wrapping `str`.
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an input range of `dchar`s
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */

auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: convert directly
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        import std.utf : byDchar;

        // Decode first
        return asLowerCase(str.byDchar);
    }
}
9446 
/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: convert directly
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        import std.utf : byDchar;

        // Decode first
        return asUpperCase(str.byDchar);
    }
}
9465 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    auto shouted = "hEllo".asUpperCase;
    assert(equal(shouted, "HELLO"));
}
9473 
// explicitly undocumented
// Overload for types that implicitly convert to a string: forwards to the
// range overload instantiated with the underlying string type.
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
9481 
// explicitly undocumented
// Overload for types that implicitly convert to a string: forwards to the
// range overload instantiated with the underlying string type.
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
9489 
@safe unittest
{
    // non-copyable wrapper that converts to string through alias this
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    // calls `func` with the wrapper and with the plain string; results must agree
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;

        auto viaAlias = func(TestAliasedString(s), args);
        auto direct = func(s, args);
        static if (is(typeof(equal(viaAlias, direct))))
        {
            // For ranges, compare contents instead of object identity.
            return equal(viaAlias, direct);
        }
        else
        {
            return viaAlias == direct;
        }
    }

    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}
9519 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.utf : byChar;

    // a saved copy must iterate independently of the original
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    // wstring and dstring inputs; these comparisons used to be evaluated
    // without an assert, so a wrong result went unnoticed
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
9559 
// Generic capitalizer on whole range, returns range.
// The first source code point is upper-cased through a small reversed buffer
// (buf[nLeft - 1] is the current front); once it is consumed the rest of the
// source is served through asLowerCase.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            if (lower)
                return lwr.empty;
            return !nLeft && r.empty;
        }

        @property auto front()
        {
            if (lower)
                return lwr.front;

            if (!nLeft)
                fillBuffer();
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (lower)
            {
                lwr.popFront();
                return;
            }

            if (!nLeft)
                fillBuffer();
            assert(nLeft);
            --nLeft;
            if (!nLeft)
            {
                // first character fully consumed: switch to lower-casing the rest
                r.popFront();
                lwr = r.asLowerCase();
                lower = true;
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                ret.lwr = lwr.save;
                return ret;
            }
        }

      private:
        // upper-cases r.front and stores the result (reversed) in buf, setting nLeft
        void fillBuffer()
        {
            immutable dchar c = r.front;
            const idx = indexFnUpper(c);
            if (idx == ushort.max)
            {
                // no mapping
                buf[0] = c;
                nLeft = 1;
            }
            else if (idx < maxIdxUpper)
            {
                // simple one-to-one mapping
                buf[0] = tableFnUpper(idx);
                nLeft = 1;
            }
            else
            {
                immutable val = tableFnUpper(idx);
                // unpack length + codepoint
                nLeft = val >> 24;
                if (nLeft == 0)
                    nLeft = 1;
                assert(nLeft <= buf.length);
                buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                foreach (j; 1 .. nLeft)
                    buf[nLeft - j - 1] = tableFnUpper(idx + j);
            }
        }

        Range r;
        typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
        bool lower = false;     // false for first character, true for rest of string
        dchar[3] buf = void;
        uint nLeft = 0;
    }

    return ToCapitalizerImpl(str);
}
9650 
/*********************
 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or string: the first character is converted to upper case and all
 * subsequent characters are converted to lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower),
 *      $(LREF asUpperCase), $(LREF asLowerCase)
 */

auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    // Code units narrower than dchar have to be decoded into code points
    // before the capitalizer can look at them.
    enum bool needsDecoding = ElementEncodingType!Range.sizeof < dchar.sizeof;

    static if (needsDecoding)
    {
        import std.utf : byDchar;

        return toCapitalizer!UpperTriple(str.byDchar);
    }
    else
        return toCapitalizer!UpperTriple(str);
}
9687 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // The first character is upper-cased, the remaining ones are lower-cased.
    auto capitalized = "hEllo".asCapitalized;
    assert(equal(capitalized, "Hello"));
}
9695 
// Overload for types accepted by `isConvertibleToString`: forwards to the
// overload above, instantiated with the string type of `Range`.
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
9702 
@safe pure nothrow @nogc unittest
{
    // Creating the range and reading its first element must compile under
    // nothrow and @nogc.
    auto capitalized = asCapitalized("hEllo");
    immutable first = capitalized.front;
    assert(first == 'H');
}
9708 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.utf : byChar;

    // A saved copy must produce the same sequence as the original range.
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // [input, expected capitalized form]
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front: popFront alone has to be able to advance the range
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    // wchar and dchar inputs; the comparison results must actually be checked
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    // For a single character, capitalizing must match upper-casing
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
9756 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-8 encoding of `c` into `buf` starting at `idx` and returns
// the index just past the last code unit written.
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    // one code unit: ASCII
    if (c <= 0x7F)
    {
        buf[idx] = cast(char) c;
        return idx + 1;
    }
    // two code units
    if (c <= 0x7FF)
    {
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx+1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    // three code units
    if (c <= 0xFFFF)
    {
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    // anything above U+10FFFF cannot be encoded
    if (c > 0x10FFFF)
        assert(0);
    // four code units
    buf[idx] = cast(char)(0xF0 | (c >> 18));
    buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
    buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    buf[idx+3] = cast(char)(0x80 | (c & 0x3F));
    return idx + 4;
}
9790 
@safe unittest
{
    char[] buf = "abcd".dup;

    // a one-unit character overwrites exactly one code unit
    size_t pos = encodeTo(buf, 0, 'X');
    assert(buf == "Xbcd");

    // a two-unit character continues at the returned position
    pos = encodeTo(buf, pos, cast(dchar)'\u00A9');
    assert(buf == "X\xC2\xA9d");
}
9801 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-16 encoding of `c` into `buf` starting at `idx` and returns
// the index just past the last code unit written.
// Throws UTFException when `c` is a surrogate code point.
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    // anything above U+10FFFF cannot be encoded
    if (c > 0x10FFFF)
        assert(0);

    if (c > 0xFFFF)
    {
        // supplementary plane: high surrogate followed by low surrogate
        buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return idx + 2;
    }

    // basic plane: a single code unit, but surrogates are rejected
    immutable bool loneSurrogate = 0xD800 <= c && c <= 0xDFFF;
    if (loneSurrogate)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
    buf[idx] = cast(wchar) c;
    return idx + 1;
}
9823 
// UTF-32 flavour: stores `c` at `buf[idx]` and returns the next index.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
9830 
// Case-converts `s` inside its own buffer for as long as the converted text
// fits. As soon as a converted code point needs more room than is available,
// or a code point maps to several code points, the remaining work is handed
// to the allocating `toCaseInPlaceAlloc`. On return `s` refers to the result.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar)  || is(C == dchar))
{
    import std.utf : decode, codeLength;

    alias allocatingPath = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);

    // In-buffer move of str[from .. to] so that it starts at `dest`;
    // returns the index just past the moved data.
    // The output position may keep pace with the input for a while (nothing
    // to copy), fall behind once a re-cased char is smaller than the
    // original, and catch up again after a char that got bigger.
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        if (dest == from)
            return to; // data is already in place
        foreach (C unit; str[from .. to])
            str[dest++] = unit;
        return dest;
    }

    size_t readPos = 0;     // decoding position in the input
    size_t writePos = 0;    // end of the output produced so far
    size_t pendingFrom = 0; // start of unchanged input not yet flushed to the output

    while (readPos != s.length)
    {
        immutable cpStart = readPos;
        immutable ch = decode(s, readPos);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max)
            continue; // unchanged, skip over

        // flush the unchanged run that precedes this code point
        writePos = moveTo(s, writePos, pendingFrom, cpStart);
        pendingFrom = readPos;

        // 1:m codepoint mapping, slow codepath
        if (caseIndex >= maxIdx)
            return allocatingPath(s, cpStart, writePos);

        // 1:1 codepoint mapping
        immutable cased = tableFn(caseIndex);
        immutable casedLen = codeLength!C(cased);
        // no place to fit cased char: switch to slow codepath, where we allocate
        if (casedLen + writePos > readPos)
            return allocatingPath(s, cpStart, writePos);

        writePos = encodeTo(s, writePos, cased);
        assert(writePos <= readPos);
    }
    if (pendingFrom != s.length)
        writePos = moveTo(s, writePos, pendingFrom, s.length);
    s = s[0 .. writePos];
}
9896 
// helper to precalculate size (in code units of C) of case-converted string
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;

        size_t total = 0;    // code units accounted for so far
        size_t runStart = 0; // start of the current run of unaffected input
        size_t pos = 0;
        while (pos != str.length)
        {
            immutable cpStart = pos;
            immutable ch = decode(str, pos);
            immutable ushort caseIndex = indexFn(ch);
            if (caseIndex == ushort.max)
                continue; // unaffected, stays part of the current run

            // the unaffected run before this code point keeps its length
            total += cpStart - runStart;
            runStart = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping
                total += codeLength!C(tableFn(caseIndex));
                continue;
            }

            // 1:m mapping: the first entry packs the length into its top
            // byte and the first code point into the low 24 bits
            immutable val = tableFn(caseIndex);
            immutable len = val >> 24;
            immutable dchar first = val & 0xFF_FFFF;
            total += codeLength!C(first);
            foreach (j; caseIndex+1 .. caseIndex+len)
                total += codeLength!C(tableFn(j));
        }
        if (runStart != str.length)
            total += str.length - runStart;
        return total;
    }
}
9937 
@safe unittest
{
    alias lowerLength = toCaseLength!(LowerTriple);

    // already lowercase ASCII keeps its length
    assert(lowerLength("abcd") == 4);

    // five two-byte Cyrillic letters plus three ASCII digits
    immutable size_t expected = 10 + 3;
    assert(lowerLength("аБВгд456") == expected);
}
9944 
// slower code path that preallocates and then copies
// case-converted stuff to the new string
// `s[0 .. destIdx]` is taken over as already converted output; conversion
// resumes with the input at `curIdx`. On return `s` refers to the new array.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
    if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);

        // allocate the exact final size up front
        immutable totalLen = destIdx + caseLength(s[curIdx..$]);
        C[] result = new C[totalLen];
        result[0 .. destIdx] = s[0 .. destIdx];

        size_t runStart = curIdx; // start of unchanged input not yet copied
        while (curIdx != s.length)
        {
            immutable cpStart = curIdx; // start of current codepoint
            immutable ch = decode(s, curIdx);
            immutable caseIndex = indexFn(ch);
            if (caseIndex == ushort.max)
                continue; // skip over

            // copy the unchanged run that precedes this code point
            immutable runLen = cpStart - runStart;
            result[destIdx .. destIdx+runLen] = s[runStart .. cpStart];
            runStart = curIdx;
            destIdx += runLen;

            if (caseIndex < maxIdx)
            {
                // 1:1 codepoint mapping
                destIdx = encodeTo(result, destIdx, tableFn(caseIndex));
                continue;
            }

            // 1:m codepoint mapping
            auto val = tableFn(caseIndex);
            // unpack length + codepoint
            immutable uint len = val >> 24;
            destIdx = encodeTo(result, destIdx, cast(dchar)(val & 0xFF_FFFF));
            foreach (j; caseIndex+1 .. caseIndex+len)
                destIdx = encodeTo(result, destIdx, tableFn(j));
        }
        if (runStart != s.length)
        {
            // trailing unchanged run
            immutable tailLen = s.length - runStart;
            result[destIdx .. destIdx+tailLen] = s[runStart..$];
            destIdx += tailLen;
        }
        assert(result.length == destIdx);
        s = result;
    }
}
10001 
/++
    Converts `s` to lowercase (by performing Unicode lowercase mapping) in place.
    For a few characters the string length may increase after the transformation;
    in such a case the function reallocates exactly once.
    If `s` does not have any uppercase characters, then `s` is unaltered.
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!(LowerTriple, C)(s);
}
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    {
        toLowerInPlace!(char)(s);
    }

    void toLowerInPlace(ref wchar[] s)
    {
        toLowerInPlace!(wchar)(s);
    }

    void toLowerInPlace(ref dchar[] s)
    {
        toLowerInPlace!(dchar)(s);
    }
}
10023 
/++
    Converts `s` to uppercase (by performing Unicode uppercase mapping) in place.
    For a few characters the string length may increase after the transformation;
    in such a case the function reallocates exactly once.
    If `s` does not have any lowercase characters, then `s` is unaltered.
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    toCaseInPlace!(UpperTriple, C)(s);
}
// overloads for the most common cases to reduce compile time/code size
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    {
        toUpperInPlace!(char)(s);
    }

    void toUpperInPlace(ref wchar[] s)
    {
        toUpperInPlace!(wchar)(s);
    }

    void toUpperInPlace(ref dchar[] s)
    {
        toUpperInPlace!(dchar)(s);
    }
}
10045 
/++
    If `c` is a Unicode uppercase $(CHARACTER), then its lowercase equivalent
    is returned. Otherwise `c` is returned.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toLower which takes a full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    if (c >= 0xAA)
    {
        // general case: look the code point up in the simple case mapping
        immutable size_t tableIdx = toLowerSimpleIndex(c);
        if (tableIdx == ushort.max)
            return c; // no lowercase mapping
        return toLowerTab(tableIdx);
    }

    // optimize ASCII case: below 0xAA only 'A' .. 'Z' are changed
    if ('A' <= c && c <= 'Z')
        return c + 32;
    return c;
}
10072 
/++
    Creates a new array which is identical to `s` except that all of its
    characters are converted to lowercase (by performing Unicode lowercase mapping).
    If none of the characters of `s` were affected, then `s` itself is returned
    if `s` is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    alias asciiToLower = std.ascii.toLower;
    return toCase!(LowerTriple, asciiToLower)(s);
}
10091 
/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    alias asciiToLower = std.ascii.toLower;
    return toCase!(LowerTriple, asciiToLower)(s);
}
10099 
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    {
        return toLower!string(s);
    }

    wstring toLower(return scope wstring s)
    {
        return toLower!wstring(s);
    }

    dstring toLower(return scope dstring s)
    {
        return toLower!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // Compile-only check (the nested function is never called): a struct
        // with `alias this` to a string must be accepted by toLower.

        static struct StringWrapper
        {
            string data;
            alias data this;
        }

        void compileCheck()
        {
            auto lowered = toLower(StringWrapper(""));
        }
    }
}
10126 
10127 
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (code; 0 .. 0x80)
        assert(std.ascii.toLower(code) == toLower(code));

    // single code point mappings
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');

    // every uppercase code point either stays as is or maps to a lowercase one
    foreach (upperCp; unicode.upperCase.byCodepoint)
    {
        immutable dchar lowered = toLower(upperCp);
        assert(lowered == upperCp || isLower(lowered), format("%s -> %s", upperCp, lowered));
    }

    // whole-string mappings
    assert(toLower("АЯ") == "ая");
    assert(toLower("\u1E9E") == "\u00df");
    assert(toUpper("\u00df") == "SS");
}
10146 
// https://issues.dlang.org/show_bug.cgi?id=9629
@safe unittest
{
    // Converting a slice in place must be visible through the parent array.
    wchar[] text = "hello þ world"w.dup;
    auto slice = text[6 .. 7];
    slice.toUpperInPlace();
    assert(text == "hello Þ world");
}
10155 
10156 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    // plain ASCII
    string src = "FoL";
    string lowered = toLower(src);
    assert(cmp(lowered, "fol") == 0, lowered);
    assert(lowered != src);

    char[] inPlace = src.dup;
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Latin Extended-A
    src = "A\u0100B\u0101d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0101b\u0101d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Cyrillic
    src = "A\u0460B\u0461d";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(cmp(lowered, "a\u0461b\u0461d") == 0);
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // one code point that lowercases to two
    src = "\u0130";
    lowered = toLower(src);
    inPlace = src.dup;
    assert(lowered == "i\u0307");
    assert(lowered !is src);
    toLowerInPlace(inPlace);
    assert(inPlace == lowered);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar probe = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(probe));
    assert(toLower(probe) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    probe = '\u1f87';
    assert(isLower(probe));
    assert(toUpper(probe) == '\u1F8F');
}
10207 
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;

    // toLower on random access ranges of code units
    auto asciiRange = "FoL".byCodeUnit;
    assert(cmp(asciiRange.toLower, "fol") == 0);

    auto mixedRange = "A\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixedRange.toLower, "a\u0461b\u0461d") == 0);
}
10217 
/++
    If `c` is a Unicode lowercase $(CHARACTER), then its uppercase equivalent
    is returned. Otherwise `c` is returned.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use the overload of toUpper which takes a full string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    if (c >= 0xAA)
    {
        // general case: look the code point up in the simple case mapping
        immutable size_t tableIdx = toUpperSimpleIndex(c);
        if (tableIdx == ushort.max)
            return c; // no uppercase mapping
        return toUpperTab(tableIdx);
    }

    // optimize ASCII case: below 0xAA only 'a' .. 'z' are changed
    if ('a' <= c && c <= 'z')
        return c - 32;
    return c;
}
10251 
///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // Upper-case each character and collect the result in an appender.
    auto sink = appender!(char[])();
    copy("hello".map!toUpper, sink);
    assert(sink.data == "HELLO");
}
10263 
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // the ASCII range must agree with std.ascii
    foreach (code; 0 .. 0x80)
        assert(std.ascii.toUpper(code) == toUpper(code));

    // single code point mappings
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');

    // every lowercase code point stays as is or maps to an uppercase or
    // titlecase letter
    auto titlecase = unicode.Titlecase_Letter;
    foreach (lowerCp; unicode.lowerCase.byCodepoint)
    {
        immutable dchar raised = toUpper(lowerCp);
        assert(raised == lowerCp || isUpper(raised) || titlecase[raised],
            format("%x -> %x", lowerCp, raised));
    }
}
10280 
/++
    Allocates a new array which is identical to `s` except that all of its
    characters are converted to uppercase (by performing Unicode uppercase mapping).
    If none of the characters of `s` were affected, then `s` itself is returned
    if `s` is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;

    alias asciiToUpper = std.ascii.toUpper;
    return toCase!(UpperTriple, asciiToUpper)(s);
}
10299 
/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    static import std.ascii;

    alias asciiToUpper = std.ascii.toUpper;
    return toCase!(UpperTriple, asciiToUpper)(s);
}
10307 
// overloads for the most common cases to reduce compile time
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    {
        return toUpper!string(s);
    }

    wstring toUpper(return scope wstring s)
    {
        return toUpper!wstring(s);
    }

    dstring toUpper(return scope dstring s)
    {
        return toUpper!dstring(s);
    }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // Compile-only check (the nested function is never called): a struct
        // with `alias this` to a string must be accepted by toUpper.

        static struct StringWrapper
        {
            string data;
            alias data this;
        }

        void compileCheck()
        {
            auto raised = toUpper(StringWrapper(""));
        }
    }
}
10334 
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string src = "FoL";
    string raised;
    char[] inPlace;

    // plain ASCII
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised, inPlace);
    assert(cmp(raised, "FOL") == 0);
    assert(raised !is src);

    // Latin Extended-A
    src = "a\u0100B\u0101d";
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0100B\u0100D") == 0);
    assert(raised !is src);

    // Cyrillic
    src = "a\u0460B\u0461d";
    raised = toUpper(src);
    inPlace = src.dup;
    toUpperInPlace(inPlace);
    assert(inPlace == raised);
    assert(cmp(raised, "A\u0460B\u0460D") == 0);
    assert(raised !is src);
}
10363 
@safe unittest
{
    // Checks that toLower/toUpper and their in-place counterparts turn `s`
    // into `trueLow` / `trueUp`.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // failure message: hex dumps of the source, the result and the expected value
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // `diff` has three format specifiers, so the source must be passed
        // along with the result and the expectation
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        // expected results, index-aligned with `options`
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these inputs the in-place conversion must not reallocate
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}
10420 
// test random access ranges
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;

    auto asciiRange = "FoL".byCodeUnit;
    assert(cmp(asciiRange.toUpper, "FOL") == 0);

    auto mixedRange = "a\u0460B\u0461d".byCodeUnit;
    assert(cmp(mixedRange.toUpper, "A\u0460B\u0460D") == 0);
}
10431 
/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    if (c >= 0xAA)
        return alphaTrie[c];

    // optimization: below 0xAA only the ASCII letters are reported
    immutable bool asciiUpper = 'A' <= c && c <= 'Z';
    immutable bool asciiLower = 'a' <= c && c <= 'z';
    return asciiUpper || asciiLower;
}

@safe unittest
{
    auto alphabetic = unicode("Alphabetic");

    // every member of the set must be recognized
    foreach (cp; alphabetic.byCodepoint)
        assert(isAlpha(cp));

    // the low range must agree with the set exactly
    foreach (cp; 0 .. 0x4000)
        assert(isAlpha(cp) == (cp in alphabetic));
}
10456 
10457 
/++
    Returns whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    immutable bool inMarkSet = markTrie[c];
    return inMarkSet;
}

@safe unittest
{
    auto marks = unicode("Mark");

    // every member of the set must be recognized
    foreach (cp; marks.byCodepoint)
        assert(isMark(cp));

    // the low range must agree with the set exactly
    foreach (cp; 0 .. 0x4000)
        assert(isMark(cp) == (cp in marks));
}
10476 
/++
    Returns whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    if (c > 0x7F)
        return numberTrie[c];

    // optimization for ascii case: only the decimal digits qualify
    return '0' <= c && c <= '9';
}

@safe unittest
{
    auto numbers = unicode("N");

    // every member of the set must be recognized
    foreach (cp; numbers.byCodepoint)
        assert(isNumber(cp));

    // the low range must agree with the set exactly
    foreach (cp; 0 .. 0x4000)
        assert(isNumber(cp) == (cp in numbers));
}
10503 
/++
    Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    if (!std.ascii.isASCII(c))
        return isAlpha(c) || isNumber(c);

    // optimization for ascii case
    return std.ascii.isAlphaNum(c);
}

@safe unittest
{
    auto numbers = unicode("N");
    auto alphabetic = unicode("Alphabetic");

    // members of either set must be recognized
    foreach (cp; numbers.byCodepoint)
        assert(isAlphaNum(cp));

    foreach (cp; alphabetic.byCodepoint)
        assert(isAlphaNum(cp));

    // the low range must agree with the union of both sets exactly
    foreach (cp; 0 .. 0x4000)
    {
        immutable bool expected = (cp in numbers) || (cp in alphabetic);
        assert(expected == isAlphaNum(cp));
    }
}
10546 
/++
    Returns whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    if (c > 0x7F)
        return punctuationTrie[c];

    // optimization for ascii case
    return std.ascii.isPunctuation(c);
}

@safe unittest
{
    // a few hand-picked punctuation characters
    static immutable dchar[] samples =
        ['\u0021', '\u0028', '\u0029', '\u002D', '\u005F', '\u00AB', '\u00BB'];
    foreach (sample; samples)
        assert(isPunctuation(sample));

    // every member of the P category must be recognized
    foreach (cp; unicode("P").byCodepoint)
        assert(isPunctuation(cp));
}
10579 
/++
    Returns whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
    immutable bool inSymbolSet = symbolTrie[c];
    return inSymbolSet;
}

@safe unittest
{
    import std.format : format;

    // a few hand-picked symbol characters
    static immutable dchar[] samples = ['\u0024', '\u002B', '\u005E', '\u00A6'];
    foreach (sample; samples)
        assert(isSymbol(sample));

    // every member of the S category must be recognized
    foreach (cp; unicode("S").byCodepoint)
        assert(isSymbol(cp), format("%04x", cp));
}
10600 
/++
    Tests whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs).

    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    // the predicate itself comes from a generated file
    import std.internal.unicode_tables : inZs = isSpaceGen;
    return inZs(c);
}
10613 
@safe unittest
{
    auto zs = unicode.Zs;

    assert(isSpace('\u0020'));

    // every member of Zs is reported as a space ...
    foreach (cp; zs.byCodepoint)
        assert(isSpace(cp));

    // ... and below 0x1000 the function agrees with the set exactly
    foreach (cp; 0 .. 0x1000)
        assert(zs[cp] == isSpace(cp));
}
10623 
10624 
/++
    Tests whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).
+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    // membership is answered by the pre-built graphical trie
    if (graphicalTrie[c])
        return true;
    return false;
}
10635 
10636 
@safe unittest
{
    import std.format : format;

    auto graphical = unicode("Graphical");

    // every member of the set is accepted ...
    foreach (cp; graphical.byCodepoint)
        assert(isGraphical(cp), format("%4x", cp));

    // ... and below 0x4000 the function agrees with the set exactly
    foreach (cp; 0 .. 0x4000)
        assert(isGraphical(cp) == (cp in graphical));
}
10646 
10647 
/++
    Tests whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    // the predicate itself comes from a generated file
    import std.internal.unicode_tables : inCc = isControlGen;
    return inCc(c);
}
10658 
@safe unittest
{
    auto controls = unicode.Cc;

    // spot checks on both sides of the boundary
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));

    // every member of Cc is accepted ...
    foreach (cp; controls.byCodepoint)
        assert(isControl(cp));

    // ... and below 0x1000 the function agrees with the set exactly
    foreach (cp; 0 .. 0x1000)
        assert(controls[cp] == isControl(cp));
}
10670 
10671 
/++
    Tests whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    // the predicate itself comes from a generated file
    import std.internal.unicode_tables : inCf = isFormatGen;
    return inCf(c);
}
10682 
10683 
@safe unittest
{
    // U+00AD must be reported as a format character
    assert(isFormat('\u00AD'));

    // every member of the "Format" set is accepted
    auto formats = unicode("Format");
    foreach (cp; formats.byCodepoint)
        assert(isFormat(cp));
}
10690 
// Code points for private use and surrogates are not likely to change in the near future;
// if need be, they can be generated from Unicode data as well.
10693 
/++
    Tests whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // U+E000 .. U+F8FF
    if (c >= 0x00_E000 && c <= 0x00_F8FF)
        return true;

    // U+F0000 .. U+FFFFD
    if (c >= 0x0F_0000 && c <= 0x0F_FFFD)
        return true;

    // U+100000 .. U+10FFFD
    return c >= 0x10_0000 && c <= 0x10_FFFD;
}
10705 
/++
    Tests whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    // anything below U+D800 is out of range; otherwise check the upper bound
    if (c < 0xD800)
        return false;
    return c <= 0xDFFF;
}
10715 
/++
    Tests whether `c` is a Unicode high surrogate (lead surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    // inclusive bounds of the accepted range
    enum dchar first = 0xD800;
    enum dchar last = 0xDBFF;
    return first <= c && c <= last;
}
10724 
/++
    Tests whether `c` is a Unicode low surrogate (trail surrogate).
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    // accepted unless `c` falls outside U+DC00 .. U+DFFF
    return !(c < 0xDC00 || c > 0xDFFF);
}
10733 
/++
    Tests whether `c` is a Unicode non-character, i.e.
    a $(CODEPOINT) with no assigned abstract character
    (general Unicode category: Cn).
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    // membership is answered by the pre-built non-character trie
    const bool listed = nonCharacterTrie[c];
    return listed;
}
10744 
@safe unittest
{
    // every member of the "Cn" set is accepted
    auto unassigned = unicode("Cn");
    foreach (cp; unassigned.byCodepoint)
        assert(isNonCharacter(cp));
}
10751 
10752 private:
10753 // load static data from pre-generated tables into usable datastructures
10754 
10755 
// Builds a CodepointSet from `compressed` by first running it through
// decompressIntervals and then handing the result to CodepointSet.fromIntervals.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
10760 
// Wraps the offsets, sizes and data of the trie entry `e`
// in a const CodepointTrie with the same template arguments.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    alias Packed = const(CodepointTrie!T);
    return Packed(e.offsets, e.sizes, e.data);
}
10765 
@safe pure nothrow @nogc @property
{
    // The accessors below deliberately use `auto` return types, so that the
    // compiler only runs semantic on the return type if the function gets
    // used. They are plain functions rather than templates in order not to
    // increase the object size of the caller.

    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    // normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : entries = nfcQCTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : entries = nfdQCTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : entries = nfkcQCTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : entries = nfkdQCTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    // grapheme breaking algorithm tables
    auto spacingMarkTrie()
    {
        import std.internal.unicode_grapheme : entries = spacingMarkTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : entries = graphemeExtendTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : entries = hangulLVTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : entries = hangulLVTTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto prependTrie()
    {
        import std.internal.unicode_grapheme : entries = prependTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto graphemeControlTrie()
    {
        import std.internal.unicode_grapheme : entries = controlTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto xpictoTrie()
    {
        import std.internal.unicode_grapheme : entries = Extended_PictographicTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : entries = combiningClassTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : entries = compatMappingTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : entries = canonMappingTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : entries = compositionJumpTrieEntries;
        static immutable trie = asTrie(entries);
        return trie;
    }

    // case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    // simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}
10902 
10903 }// version (!std_uni_bootstrap)